12 years ago · 349979d55c
--- a/ChangeLog
+++ b/ChangeLog
@@ -94,11 +94,16 @@ New features:
 
				     STARPU_LIMIT_OPENCL_devid_MEM to limit memory per specific device
			
 
				   * Introduce new variable STARPU_LIMIT_CPU_MEM to limit memory for
			
 
				     the CPU devices
			
 
				-  * Define new functions starpu_malloc_count and starpu_free_count to
			
 
				-    be used for allocating memory up to the limits defined by the
			
 
				-    environment variables STARPU_LIMIT_xxx (see above). When no memory
			
 
				-    is left, starpu_malloc_count tries to reclaim memory from StarPU
			
 
				-    and returns -ENOMEM on failure.
			
 
				+  * New function starpu_malloc_flags to define a memory allocation with
			
 
				+    constraints based on the following values:
			
 
				+    - STARPU_MALLOC_PINNED specifies memory should be pinned
			
 
				+    - STARPU_MALLOC_COUNT specifies the memory allocation should be in
			
 
				+      the limits defined by the environment variables STARPU_LIMIT_xxx
			
 
				+      (see above). When no memory is left, starpu_malloc_flags tries
			
 
				+      to reclaim memory from StarPU and returns -ENOMEM on failure.
			
 
				+  * starpu_malloc calls starpu_malloc_flags with a value of flag set
			
 
				+    to STARPU_MALLOC_PINNED
			
 
				+  * Define new function starpu_free_flags similarly to starpu_malloc_flags
			
 
				 
			
 
				 Small features:
			
 
				   * Add starpu_worker_get_by_type and starpu_worker_get_by_devid
			
--- a/doc/chapters/basic-api.texi
+++ b/doc/chapters/basic-api.texi
@@ -239,11 +239,25 @@ are disabled.
 
				 @node Standard memory library
			
 
				 @section Standard memory library
			
 
				 
			
 
				-@deftypefun int starpu_malloc (void **@var{A}, size_t @var{dim})
			
 
				-This function allocates data of the given size in main memory. It will also try to pin it in
			
 
				-CUDA or OpenCL, so that data transfers from this buffer can be asynchronous, and
			
 
				-thus permit data transfer and computation overlapping. The allocated buffer must
			
 
				-be freed thanks to the @code{starpu_free} function.
			
 
				+@defmac STARPU_MALLOC_PINNED
			
 
				+Value passed to the function @code{starpu_malloc_flags} to
			
 
				+indicate the memory allocation should be pinned.
			
 
				+@end defmac
			
 
				+
			
 
				+@defmac STARPU_MALLOC_COUNT
			
 
				+Value passed to the function @code{starpu_malloc_flags} to
			
 
				+indicate the memory allocation should stay within the limits defined by
			
 
				+the environment variables @code{STARPU_LIMIT_CUDA_devid_MEM},
			
 
				+@code{STARPU_LIMIT_CUDA_MEM}, @code{STARPU_LIMIT_OPENCL_devid_MEM},
			
 
				+@code{STARPU_LIMIT_OPENCL_MEM} and @code{STARPU_LIMIT_CPU_MEM}
			
 
				+(@pxref{Limit memory}). If no memory is available, it tries to reclaim
			
 
				+memory from StarPU. Memory allocated this way needs to be freed by
			
 
				+calling the @code{starpu_free_flags} function with the same flag.
			
 
				+@end defmac
			
 
				+
			
 
				+@deftypefun int starpu_malloc_flags (void **@var{A}, size_t @var{dim}, int @var{flags})
			
 
				+Performs a memory allocation based on the constraints defined by the
			
 
				+given @var{flags}.
			
 
				 @end deftypefun
			
 
				 
			
 
				 @deftypefun void starpu_malloc_set_align (size_t @var{align})
			
@@ -252,25 +266,22 @@ allocations. @var{align} must be a power of two. This is for instance called
 
				 automatically by the OpenCL driver to specify its own alignment constraints.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun int starpu_free (void *@var{A})
			
 
				-This function frees memory which has previously allocated with
			
 
				-@code{starpu_malloc}.
			
 
				+@deftypefun int starpu_malloc (void **@var{A}, size_t @var{dim})
			
 
				+This function allocates data of the given size in main memory. It will also try to pin it in
			
 
				+CUDA or OpenCL, so that data transfers from this buffer can be asynchronous, and
			
 
				+thus permit data transfer and computation overlapping. The allocated buffer must
			
 
				+be freed thanks to the @code{starpu_free} function.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun int starpu_malloc_count (void **@var{A}, size_t @var{dim})
			
 
				-This function is similar to @code{starpu_malloc}. It only allocates
			
 
				-memory up to the limit defined by the environment variables
			
 
				-@code{STARPU_LIMIT_CUDA_devid_MEM}, @code{STARPU_LIMIT_CUDA_MEM},
			
 
				-@code{STARPU_LIMIT_OPENCL_devid_MEM}, @code{STARPU_LIMIT_OPENCL_MEM}
			
 
				-and @code{STARPU_LIMIT_CPU_MEM} (@pxref{Limit memory}). If no memory
			
 
				-is available, it tries to reclaim memory from StarPU.
			
 
				-Memory allocated through this function needs to be freed thanks to the
			
 
				-@code{starpu_free_count} function.
			
 
				+@deftypefun int starpu_free (void *@var{A})
			
 
				+This function frees memory which has previously been allocated with
			
 
				+@code{starpu_malloc}.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun int starpu_free_count (void *@var{A}, size_t @var{dim})
			
 
				-This function frees memory which has previously allocated with
			
 
				-@code{starpu_malloc_count}.
			
 
				+@deftypefun int starpu_free_flags (void *@var{A}, size_t @var{dim}, int @var{flags})
			
 
				+This function frees memory by specifying its size. The given
			
 
				+@var{flags} should be consistent with the ones given to
			
 
				+@code{starpu_malloc_flags} when allocating the memory.
			
 
				 @end deftypefun
			
 
				 
			
 
				 @node Workers' Properties
			
--- a/examples/axpy/axpy.c
+++ b/examples/axpy/axpy.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -42,11 +42,11 @@
 
				 
			
 
				 #define EPSILON 1e-6
			
 
				 
			
 
				-TYPE *vec_x, *vec_y;
			
 
				-TYPE alpha = 3.41;
			
 
				+TYPE *_vec_x, *_vec_y;
			
 
				+TYPE _alpha = 3.41;
			
 
				 
			
 
				 /* descriptors for StarPU */
			
 
				-starpu_data_handle_t handle_y, handle_x;
			
 
				+starpu_data_handle_t _handle_y, _handle_x;
			
 
				 
			
 
				 void axpy_cpu(void *descr[], __attribute__((unused)) void *arg)
			
 
				 {
			
@@ -98,9 +98,9 @@ check(void)
 
				 	int i;
			
 
				 	for (i = 0; i < N; i++)
			
 
				 	{
			
 
				-		TYPE expected_value = alpha * vec_x[i] + 4.0;
			
 
				-		if (fabs(vec_y[i] - expected_value) > expected_value * EPSILON) {
			
 
				-			FPRINTF(stderr,"at %d, %f*%f+%f=%f, expected %f\n", i, alpha, vec_x[i], 4.0, vec_y[i], expected_value);
			
 
				+		TYPE expected_value = _alpha * _vec_x[i] + 4.0;
			
 
				+		if (fabs(_vec_y[i] - expected_value) > expected_value * EPSILON) {
			
 
				+			FPRINTF(stderr,"at %d, %f*%f+%f=%f, expected %f\n", i, _alpha, _vec_x[i], 4.0, _vec_y[i], expected_value);
			
 
				 			return EXIT_FAILURE;
			
 
				 		}
			
 
				 	}
			
@@ -134,25 +134,25 @@ int main(int argc, char **argv)
 
				 		vec_a = malloc(N*sizeof(TYPE));
			
 
				 		vec_b = malloc(N*sizeof(TYPE));
			
 
				 	*/
			
 
				-	starpu_malloc((void **)&vec_x, N*sizeof(TYPE));
			
 
				-	assert(vec_x);
			
 
				+	starpu_malloc((void **)&_vec_x, N*sizeof(TYPE));
			
 
				+	assert(_vec_x);
			
 
				 
			
 
				-	starpu_malloc((void **)&vec_y, N*sizeof(TYPE));
			
 
				-	assert(vec_y);
			
 
				+	starpu_malloc((void **)&_vec_y, N*sizeof(TYPE));
			
 
				+	assert(_vec_y);
			
 
				 
			
 
				 	unsigned i;
			
 
				 	for (i = 0; i < N; i++)
			
 
				 	{
			
 
				-		vec_x[i] = 1.0f; /*(TYPE)starpu_drand48(); */
			
 
				-		vec_y[i] = 4.0f; /*(TYPE)starpu_drand48(); */
			
 
				+		_vec_x[i] = 1.0f; /*(TYPE)starpu_drand48(); */
			
 
				+		_vec_y[i] = 4.0f; /*(TYPE)starpu_drand48(); */
			
 
				 	}
			
 
				 
			
 
				-	FPRINTF(stderr, "BEFORE x[0] = %2.2f\n", vec_x[0]);
			
 
				-	FPRINTF(stderr, "BEFORE y[0] = %2.2f\n", vec_y[0]);
			
 
				+	FPRINTF(stderr, "BEFORE x[0] = %2.2f\n", _vec_x[0]);
			
 
				+	FPRINTF(stderr, "BEFORE y[0] = %2.2f\n", _vec_y[0]);
			
 
				 
			
 
				 	/* Declare the data to StarPU */
			
 
				-	starpu_vector_data_register(&handle_x, 0, (uintptr_t)vec_x, N, sizeof(TYPE));
			
 
				-	starpu_vector_data_register(&handle_y, 0, (uintptr_t)vec_y, N, sizeof(TYPE));
			
 
				+	starpu_vector_data_register(&_handle_x, 0, (uintptr_t)_vec_x, N, sizeof(TYPE));
			
 
				+	starpu_vector_data_register(&_handle_y, 0, (uintptr_t)_vec_y, N, sizeof(TYPE));
			
 
				 
			
 
				 	/* Divide the vector into blocks */
			
 
				 	struct starpu_data_filter block_filter =
			
@@ -161,8 +161,8 @@ int main(int argc, char **argv)
 
				 		.nchildren = NBLOCKS
			
 
				 	};
			
 
				 
			
 
				-	starpu_data_partition(handle_x, &block_filter);
			
 
				-	starpu_data_partition(handle_y, &block_filter);
			
 
				+	starpu_data_partition(_handle_x, &block_filter);
			
 
				+	starpu_data_partition(_handle_y, &block_filter);
			
 
				 
			
 
				 	struct timeval start;
			
 
				 	struct timeval end;
			
@@ -176,10 +176,10 @@ int main(int argc, char **argv)
 
				 
			
 
				 		task->cl = &axpy_cl;
			
 
				 
			
 
				-		task->cl_arg = &alpha;
			
 
				+		task->cl_arg = &_alpha;
			
 
				 
			
 
				-		task->handles[0] = starpu_data_get_sub_data(handle_x, 1, b);
			
 
				-		task->handles[1] = starpu_data_get_sub_data(handle_y, 1, b);
			
 
				+		task->handles[0] = starpu_data_get_sub_data(_handle_x, 1, b);
			
 
				+		task->handles[1] = starpu_data_get_sub_data(_handle_y, 1, b);
			
 
				 
			
 
				 		ret = starpu_task_submit(task);
			
 
				 		if (ret == -ENODEV)
			
@@ -193,10 +193,10 @@ int main(int argc, char **argv)
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				 enodev:
			
 
				-	starpu_data_unpartition(handle_x, 0);
			
 
				-	starpu_data_unpartition(handle_y, 0);
			
 
				-	starpu_data_unregister(handle_x);
			
 
				-	starpu_data_unregister(handle_y);
			
 
				+	starpu_data_unpartition(_handle_x, 0);
			
 
				+	starpu_data_unpartition(_handle_y, 0);
			
 
				+	starpu_data_unregister(_handle_x);
			
 
				+	starpu_data_unregister(_handle_y);
			
 
				 
			
 
				 	gettimeofday(&end, NULL);
			
 
				         double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
			
@@ -204,13 +204,13 @@ enodev:
 
				 
			
 
				 	FPRINTF(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(TYPE)/timing);
			
 
				 
			
 
				-	FPRINTF(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", vec_y[0], alpha);
			
 
				+	FPRINTF(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", _vec_y[0], _alpha);
			
 
				 
			
 
				 	if (exit_value != 77)
			
 
				 		exit_value = check();
			
 
				 
			
 
				-	starpu_free((void *)vec_x);
			
 
				-	starpu_free((void *)vec_y);
			
 
				+	starpu_free((void *)_vec_x);
			
 
				+	starpu_free((void *)_vec_y);
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				         ret = starpu_opencl_unload_opencl(&opencl_program);
			
--- a/examples/basic_examples/block_opencl.c
+++ b/examples/basic_examples/block_opencl.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -17,13 +17,13 @@
 
				 
			
 
				 #include <starpu.h>
			
 
				 
			
 
				-#define CHECK_CL_SET_KERNEL_ARG(kernel, n, size, ptr)       \
			
 
				-do						    	    \
			
 
				-{							    \
			
 
				-	int err;                                            \
			
 
				-	err = clSetKernelArg(kernel, n, size, ptr);         \
			
 
				-	if (err != CL_SUCCESS)                              \
			
 
				-       		STARPU_OPENCL_REPORT_ERROR(err);            \
			
 
				+#define CHECK_CL_SET_KERNEL_ARG(kernel, n, size, ptr)       	\
			
 
				+do						    		\
			
 
				+{								\
			
 
				+	int check_err;                                          \
			
 
				+	check_err = clSetKernelArg(kernel, n, size, ptr);       \
			
 
				+	if (check_err != CL_SUCCESS)                            \
			
 
				+       		STARPU_OPENCL_REPORT_ERROR(check_err);          \
			
 
				 } while (0)
			
 
				 
			
 
				 extern struct starpu_opencl_program opencl_code;
			
--- a/examples/filters/custom_mf/conversion_opencl.c
+++ b/examples/filters/custom_mf/conversion_opencl.c
@@ -18,7 +18,7 @@
 
				 #include "custom_types.h"
			
 
				 #include "custom_interface.h"
			
 
				 
			
 
				-extern struct starpu_opencl_program opencl_conversion_program;
			
 
				+extern struct starpu_opencl_program _opencl_conversion_program;
			
 
				 
			
 
				 void cpu_to_opencl_opencl_func(void *buffers[], void *args)
			
 
				 {
			
@@ -39,7 +39,7 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
				 
			
 
				 	err = starpu_opencl_load_kernel(&kernel,
			
 
				 					&queue,
			
 
				-					&opencl_conversion_program,
			
 
				+					&_opencl_conversion_program,
			
 
				 					"custom_opencl_conversion",
			
 
				 					devid);
			
 
				 	if (err != CL_SUCCESS)
			
--- a/examples/filters/custom_mf/custom_mf_filter.c
+++ b/examples/filters/custom_mf/custom_mf_filter.c
@@ -22,16 +22,16 @@
 
				 #define DEBUG 1
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static unsigned int ncuda;
			
 
				+static unsigned int _ncuda;
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static unsigned int nopencl;
			
 
				+static unsigned int _nopencl;
			
 
				 #endif
			
 
				 
			
 
				 
			
 
				-static struct point array_of_structs[N];
			
 
				-static starpu_data_handle_t handle;
			
 
				-static unsigned int nchunks = 6;
			
 
				+static struct point _array_of_structs[N];
			
 
				+static starpu_data_handle_t _handle;
			
 
				+static unsigned int _nchunks = 6;
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 extern struct starpu_codelet cpu_to_cuda_cl;
			
@@ -107,26 +107,26 @@ register_and_partition_data(void)
 
				 	int i;
			
 
				 	for (i = 0; i < N; i++)
			
 
				 	{
			
 
				-		array_of_structs[i].x = i+1.0;
			
 
				-		array_of_structs[i].y = 42.0;
			
 
				+		_array_of_structs[i].x = i+1.0;
			
 
				+		_array_of_structs[i].y = 42.0;
			
 
				 	}
			
 
				-	custom_data_register(&handle, 0, &array_of_structs, N, &format_ops);
			
 
				+	custom_data_register(&_handle, 0, &_array_of_structs, N, &format_ops);
			
 
				 
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				 		.filter_func   = custom_filter,
			
 
				-		.nchildren     = nchunks,
			
 
				+		.nchildren     = _nchunks,
			
 
				 		.get_nchildren = NULL,
			
 
				 		.get_child_ops = NULL
			
 
				 	};
			
 
				-	starpu_data_partition(handle, &f);
			
 
				+	starpu_data_partition(_handle, &f);
			
 
				 }
			
 
				 
			
 
				 static void
			
 
				 unpartition_and_unregister_data(void)
			
 
				 {
			
 
				-	starpu_data_unpartition(handle, 0);
			
 
				-	starpu_data_unregister(handle);
			
 
				+	starpu_data_unpartition(_handle, 0);
			
 
				+	starpu_data_unregister(_handle);
			
 
				 }
			
 
				 
			
 
				 static void
			
@@ -181,7 +181,7 @@ create_and_submit_tasks(void)
 
				 {
			
 
				 	int err;
			
 
				 	unsigned int i;
			
 
				-	for (i = 0; i < nchunks; i++)
			
 
				+	for (i = 0; i < _nchunks; i++)
			
 
				 	{
			
 
				 		struct starpu_task *task = starpu_task_create();
			
 
				 		switch (i%3)
			
@@ -191,7 +191,7 @@ create_and_submit_tasks(void)
 
				 			break;
			
 
				 		case 1:
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-			if (ncuda > 0)
			
 
				+			if (_ncuda > 0)
			
 
				 				task->cl = &cuda_cl;
			
 
				 			else
			
 
				 #endif
			
@@ -199,7 +199,7 @@ create_and_submit_tasks(void)
 
				 			break;
			
 
				 		case 2:
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-			if (nopencl > 0)
			
 
				+			if (_nopencl > 0)
			
 
				 				task->cl = &opencl_cl;
			
 
				 			else
			
 
				 #endif
			
@@ -210,7 +210,7 @@ create_and_submit_tasks(void)
 
				 			assert(0);
			
 
				 		}
			
 
				 
			
 
				-		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
			
 
				+		task->handles[0] = starpu_data_get_sub_data(_handle, 1, i);
			
 
				 		err = starpu_task_submit(task);
			
 
				 		if (err != 0)
			
 
				 			return err;
			
@@ -232,8 +232,8 @@ print_it(void)
 
				 	for (i = 0; i < N; i++)
			
 
				 	{
			
 
				 		FPRINTF(stderr, "(%.2f, %.2f) ",
			
 
				-			array_of_structs[i].x,
			
 
				-			array_of_structs[i].y);
			
 
				+			_array_of_structs[i].x,
			
 
				+			_array_of_structs[i].y);
			
 
				 	}
			
 
				 	FPRINTF(stderr, "\n");
			
 
				 }
			
@@ -246,7 +246,7 @@ check_it(void)
 
				 	for (i = 0; i < N; i++)
			
 
				 	{
			
 
				 		float expected_value = (i + 1.0)*42.0;
			
 
				-		if (array_of_structs[i].x != expected_value)
			
 
				+		if (_array_of_structs[i].x != expected_value)
			
 
				 			return EXIT_FAILURE;
			
 
				 	}
			
 
				 
			
@@ -254,8 +254,8 @@ check_it(void)
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-struct starpu_opencl_program opencl_program;
			
 
				-struct starpu_opencl_program opencl_conversion_program;
			
 
				+struct starpu_opencl_program _opencl_program;
			
 
				+struct starpu_opencl_program _opencl_conversion_program;
			
 
				 #endif /* !STARPU_USE_OPENCL */
			
 
				 
			
 
				 int
			
@@ -271,20 +271,20 @@ main(void)
 
				 		goto enodev;
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	ncuda = starpu_cuda_worker_get_count();
			
 
				+	_ncuda = starpu_cuda_worker_get_count();
			
 
				 #endif /* !STARPU_USE_CUDA */
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-	nopencl = starpu_opencl_worker_get_count();
			
 
				-	if (nopencl > 0)
			
 
				+	_nopencl = starpu_opencl_worker_get_count();
			
 
				+	if (_nopencl > 0)
			
 
				 	{
			
 
				 		char *f1 = "examples/filters/custom_mf/custom_opencl.cl";
			
 
				 		char *f2 = "examples/filters/custom_mf/conversion_opencl.cl";
			
 
				-		err = starpu_opencl_load_opencl_from_file(f1, &opencl_program,
			
 
				+		err = starpu_opencl_load_opencl_from_file(f1, &_opencl_program,
			
 
				 							  NULL);
			
 
				 		assert(err == 0);
			
 
				 		err = starpu_opencl_load_opencl_from_file(f2,
			
 
				-						&opencl_conversion_program,
			
 
				-						NULL);
			
 
				+							  &_opencl_conversion_program,
			
 
				+							  NULL);
			
 
				 		assert(err == 0);
			
 
				 	}
			
 
				 #endif /* !STARPU_USE_OPENCL */
			
@@ -306,11 +306,11 @@ main(void)
 
				 #endif
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-	if (nopencl > 0)
			
 
				+	if (_nopencl > 0)
			
 
				 	{
			
 
				-        	err = starpu_opencl_unload_opencl(&opencl_program);
			
 
				+        	err = starpu_opencl_unload_opencl(&_opencl_program);
			
 
				 		assert(err == 0);
			
 
				-		err = starpu_opencl_unload_opencl(&opencl_conversion_program);
			
 
				+		err = starpu_opencl_unload_opencl(&_opencl_conversion_program);
			
 
				 		assert(err == 0);
			
 
				 	}
			
 
				 #endif /* !STARPU_USE_OPENCL */
			
--- a/examples/filters/custom_mf/custom_opencl.c
+++ b/examples/filters/custom_mf/custom_opencl.c
@@ -18,7 +18,7 @@
 
				 #include "custom_types.h"
			
 
				 #include "custom_interface.h"
			
 
				 
			
 
				-extern struct starpu_opencl_program opencl_program;
			
 
				+extern struct starpu_opencl_program _opencl_program;
			
 
				 
			
 
				 void custom_scal_opencl_func(void *buffers[], void *args)
			
 
				 {
			
@@ -38,7 +38,7 @@ void custom_scal_opencl_func(void *buffers[], void *args)
 
				 
			
 
				 	err = starpu_opencl_load_kernel(&kernel,
			
 
				 					&queue,
			
 
				-					&opencl_program,
			
 
				+					&_opencl_program,
			
 
				 					"custom_scal_opencl",
			
 
				 					devid);
			
 
				 	if (err != CL_SUCCESS)
			
--- a/examples/heat/dw_sparse_cg.c
+++ b/examples/heat/dw_sparse_cg.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -38,18 +38,18 @@ static struct starpu_task *create_task(starpu_tag_t id)
 
				 static void create_data(float **_nzvalA, float **_vecb, float **_vecx, uint32_t *_nnz, uint32_t *_nrow, uint32_t **_colind, uint32_t **_rowptr)
			
 
				 {
			
 
				 	/* we need a sparse symetric (definite positive ?) matrix and a "dense" vector */
			
 
				-	
			
 
				+
			
 
				 	/* example of 3-band matrix */
			
 
				 	float *nzval;
			
 
				 	uint32_t nnz;
			
 
				 	uint32_t *colind;
			
 
				 	uint32_t *rowptr;
			
 
				 
			
 
				-	nnz = 3*size-2;
			
 
				+	nnz = 3*_size-2;
			
 
				 
			
 
				 	nzval = malloc(nnz*sizeof(float));
			
 
				 	colind = malloc(nnz*sizeof(uint32_t));
			
 
				-	rowptr = malloc(size*sizeof(uint32_t));
			
 
				+	rowptr = malloc(_size*sizeof(uint32_t));
			
 
				 
			
 
				 	assert(nzval);
			
 
				 	assert(colind);
			
@@ -59,7 +59,7 @@ static void create_data(float **_nzvalA, float **_vecb, float **_vecx, uint32_t
 
				 	/* fill the matrix */
			
 
				 	unsigned row;
			
 
				 	unsigned pos = 0;
			
 
				-	for (row = 0; row < size; row++)
			
 
				+	for (row = 0; row < _size; row++)
			
 
				 	{
			
 
				 		rowptr[row] = pos;
			
 
				 
			
@@ -69,12 +69,12 @@ static void create_data(float **_nzvalA, float **_vecb, float **_vecx, uint32_t
 
				 			colind[pos] = row-1;
			
 
				 			pos++;
			
 
				 		}
			
 
				-		
			
 
				+
			
 
				 		nzval[pos] = 5.0f;
			
 
				 		colind[pos] = row;
			
 
				 		pos++;
			
 
				 
			
 
				-		if (row < size - 1)
			
 
				+		if (row < _size - 1)
			
 
				 		{
			
 
				 			nzval[pos] = 1.0f;
			
 
				 			colind[pos] = row+1;
			
@@ -83,24 +83,24 @@ static void create_data(float **_nzvalA, float **_vecb, float **_vecx, uint32_t
 
				 	}
			
 
				 
			
 
				 	*_nnz = nnz;
			
 
				-	*_nrow = size;
			
 
				+	*_nrow = _size;
			
 
				 	*_nzvalA = nzval;
			
 
				 	*_colind = colind;
			
 
				 	*_rowptr = rowptr;
			
 
				 
			
 
				 	STARPU_ASSERT(pos == nnz);
			
 
				-	
			
 
				+
			
 
				 	/* initiate the 2 vectors */
			
 
				 	float *invec, *outvec;
			
 
				-	invec = malloc(size*sizeof(float));
			
 
				+	invec = malloc(_size*sizeof(float));
			
 
				 	assert(invec);
			
 
				 
			
 
				-	outvec = malloc(size*sizeof(float));
			
 
				+	outvec = malloc(_size*sizeof(float));
			
 
				 	assert(outvec);
			
 
				 
			
 
				 	/* fill those */
			
 
				 	unsigned ind;
			
 
				-	for (ind = 0; ind < size; ind++)
			
 
				+	for (ind = 0; ind < _size; ind++)
			
 
				 	{
			
 
				 		invec[ind] = 2.0f;
			
 
				 		outvec[ind] = 0.0f;
			
@@ -127,10 +127,10 @@ void init_problem(void)
 
				 }
			
 
				 
			
 
				 /*
			
 
				- *	cg initialization phase 
			
 
				+ *	cg initialization phase
			
 
				  */
			
 
				 
			
 
				-void init_cg(struct cg_problem *problem) 
			
 
				+void init_cg(struct cg_problem *problem)
			
 
				 {
			
 
				 	int ret;
			
 
				 
			
@@ -178,7 +178,7 @@ void init_cg(struct cg_problem *problem)
 
				 
			
 
				 	task3->callback_func = iteration_cg;
			
 
				 	task3->callback_arg = problem;
			
 
				-	
			
 
				+
			
 
				 	/* XXX 3 should only depend on 1 ... */
			
 
				 	starpu_tag_declare_deps((starpu_tag_t)3UL, 1, (starpu_tag_t)2UL);
			
 
				 
			
@@ -192,7 +192,7 @@ void init_cg(struct cg_problem *problem)
 
				 }
			
 
				 
			
 
				 /*
			
 
				- *	the inner iteration of the cg algorithm 
			
 
				+ *	the inner iteration of the cg algorithm
			
 
				  *		the codelet code launcher is its own callback !
			
 
				  */
			
 
				 
			
@@ -301,7 +301,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
				 
			
 
				 	task9->callback_func = iteration_cg;
			
 
				 	task9->callback_arg = problem;
			
 
				-	
			
 
				+
			
 
				 	/* launch the computation now */
			
 
				 	ret = starpu_task_submit(task4);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
@@ -323,7 +323,7 @@ void iteration_cg(void *problem)
 
				 
			
 
				 	FPRINTF(stdout, "i : %d (MAX %d)\n\tdelta_new %f (%f)\n", pb->i, MAXITER, pb->delta_new, sqrt(pb->delta_new / pb->size));
			
 
				 
			
 
				-	if ((pb->i < MAXITER) && 
			
 
				+	if ((pb->i < MAXITER) &&
			
 
				 		(pb->delta_new > pb->epsilon) )
			
 
				 	{
			
 
				 		if (pb->i % 1000 == 0)
			
@@ -344,7 +344,7 @@ void iteration_cg(void *problem)
 
				 }
			
 
				 
			
 
				 /*
			
 
				- *	initializing the problem 
			
 
				+ *	initializing the problem
			
 
				  */
			
 
				 
			
 
				 void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
			
@@ -354,10 +354,10 @@ void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 
				 
			
 
				 	starpu_data_handle_t ds_matrixA;
			
 
				 	starpu_data_handle_t ds_vecx, ds_vecb;
			
 
				-	starpu_data_handle_t ds_vecr, ds_vecd, ds_vecq; 
			
 
				+	starpu_data_handle_t ds_vecr, ds_vecd, ds_vecq;
			
 
				 
			
 
				 	/* first the user-allocated data */
			
 
				-	starpu_csr_data_register(&ds_matrixA, 0, nnz, nrow, 
			
 
				+	starpu_csr_data_register(&ds_matrixA, 0, nnz, nrow,
			
 
				 			(uintptr_t)nzvalA, colind, rowptr, 0, sizeof(float));
			
 
				 	starpu_vector_data_register(&ds_vecx, 0, (uintptr_t)vecx, nrow, sizeof(float));
			
 
				 	starpu_vector_data_register(&ds_vecb, 0, (uintptr_t)vecb, nrow, sizeof(float));
			
--- a/examples/heat/dw_sparse_cg.h
+++ b/examples/heat/dw_sparse_cg.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -40,10 +40,10 @@
 
				 #define EPSILON	0.0000001f
			
 
				 
			
 
				 /* code parameters */
			
 
				-static uint32_t size = 33554432;
			
 
				-static unsigned usecpu = 0;
			
 
				-static unsigned blocks = 512;
			
 
				-static unsigned grids  = 8;
			
 
				+static uint32_t _size = 33554432;
			
 
				+static unsigned _usecpu = 0;
			
 
				+static unsigned _blocks = 512;
			
 
				+static unsigned _grids  = 8;
			
 
				 
			
 
				 struct cg_problem
			
 
				 {
			
@@ -76,24 +76,24 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 
				 		if (strcmp(argv[i], "-size") == 0)
			
 
				 		{
			
 
				 			char *argptr;
			
 
				-			size = strtol(argv[++i], &argptr, 10);
			
 
				+			_size = strtol(argv[++i], &argptr, 10);
			
 
				 		}
			
 
				 
			
 
				 		if (strcmp(argv[i], "-block") == 0)
			
 
				 		{
			
 
				 			char *argptr;
			
 
				-			blocks = strtol(argv[++i], &argptr, 10);
			
 
				+			_blocks = strtol(argv[++i], &argptr, 10);
			
 
				 		}
			
 
				 
			
 
				 		if (strcmp(argv[i], "-grid") == 0)
			
 
				 		{
			
 
				 			char *argptr;
			
 
				-			grids = strtol(argv[++i], &argptr, 10);
			
 
				+			_grids = strtol(argv[++i], &argptr, 10);
			
 
				 		}
			
 
				 
			
 
				 		if (strcmp(argv[i], "-cpu") == 0)
			
 
				 		{
			
 
				-			usecpu = 1;
			
 
				+			_usecpu = 1;
			
 
				 		}
			
 
				 	}
			
 
				 }
			
--- a/examples/reductions/dot_product.c
+++ b/examples/reductions/dot_product.c
@@ -28,19 +28,19 @@
 
				 
			
 
				 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				 
			
 
				-static float *x;
			
 
				-static float *y;
			
 
				-static starpu_data_handle_t *x_handles;
			
 
				-static starpu_data_handle_t *y_handles;
			
 
				+static float *_x;
			
 
				+static float *_y;
			
 
				+static starpu_data_handle_t *_x_handles;
			
 
				+static starpu_data_handle_t *_y_handles;
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static struct starpu_opencl_program opencl_program;
			
 
				+static struct starpu_opencl_program _opencl_program;
			
 
				 #endif
			
 
				 
			
 
				-static unsigned nblocks = 4096;
			
 
				-static unsigned entries_per_block = 1024;
			
 
				+static unsigned _nblocks = 4096;
			
 
				+static unsigned _entries_per_block = 1024;
			
 
				 
			
 
				-static DOT_TYPE dot = 0.0f;
			
 
				-static starpu_data_handle_t dot_handle;
			
 
				+static DOT_TYPE _dot = 0.0f;
			
 
				+static starpu_data_handle_t _dot_handle;
			
 
				 
			
 
				 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
			
 
				 {
			
@@ -148,7 +148,7 @@ void redux_opencl_func(void *buffers[], void *args)
 
				 	id = starpu_worker_get_id();
			
 
				 	devid = starpu_worker_get_devid(id);
			
 
				 
			
 
				-	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_redux_opencl", devid);
			
 
				+	err = starpu_opencl_load_kernel(&kernel, &queue, &_opencl_program, "_redux_opencl", devid);
			
 
				 	if (err != CL_SUCCESS)
			
 
				 		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
@@ -262,7 +262,7 @@ void dot_opencl_func(void *buffers[], void *args)
 
				 	id = starpu_worker_get_id();
			
 
				 	devid = starpu_worker_get_devid(id);
			
 
				 
			
 
				-	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_dot_opencl", devid);
			
 
				+	err = starpu_opencl_load_kernel(&kernel, &queue, &_opencl_program, "_dot_opencl", devid);
			
 
				 	if (err != CL_SUCCESS)
			
 
				 		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
@@ -329,22 +329,22 @@ int main(int argc, char **argv)
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	ret = starpu_opencl_load_opencl_from_file("examples/reductions/dot_product_opencl_kernels.cl",
			
 
				-						  &opencl_program, NULL);
			
 
				+						  &_opencl_program, NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
			
 
				 #endif
			
 
				 
			
 
				 	starpu_cublas_init();
			
 
				 
			
 
				-	unsigned long nelems = nblocks*entries_per_block;
			
 
				+	unsigned long nelems = _nblocks*_entries_per_block;
			
 
				 	size_t size = nelems*sizeof(float);
			
 
				 
			
 
				-	x = (float *) malloc(size);
			
 
				-	y = (float *) malloc(size);
			
 
				+	_x = (float *) malloc(size);
			
 
				+	_y = (float *) malloc(size);
			
 
				 
			
 
				-	x_handles = (starpu_data_handle_t *) calloc(nblocks, sizeof(starpu_data_handle_t));
			
 
				-	y_handles = (starpu_data_handle_t *) calloc(nblocks, sizeof(starpu_data_handle_t));
			
 
				+	_x_handles = (starpu_data_handle_t *) calloc(_nblocks, sizeof(starpu_data_handle_t));
			
 
				+	_y_handles = (starpu_data_handle_t *) calloc(_nblocks, sizeof(starpu_data_handle_t));
			
 
				 
			
 
				-	assert(x && y);
			
 
				+	assert(_x && _y);
			
 
				 
			
 
				         starpu_srand48(0);
			
 
				 
			
@@ -353,67 +353,67 @@ int main(int argc, char **argv)
 
				 	unsigned long i;
			
 
				 	for (i = 0; i < nelems; i++)
			
 
				 	{
			
 
				-		x[i] = (float)starpu_drand48();
			
 
				-		y[i] = (float)starpu_drand48();
			
 
				+		_x[i] = (float)starpu_drand48();
			
 
				+		_y[i] = (float)starpu_drand48();
			
 
				 
			
 
				-		reference_dot += (DOT_TYPE)x[i]*(DOT_TYPE)y[i];
			
 
				+		reference_dot += (DOT_TYPE)_x[i]*(DOT_TYPE)_y[i];
			
 
				 	}
			
 
				 
			
 
				 	unsigned block;
			
 
				-	for (block = 0; block < nblocks; block++)
			
 
				+	for (block = 0; block < _nblocks; block++)
			
 
				 	{
			
 
				-		starpu_vector_data_register(&x_handles[block], 0,
			
 
				-			(uintptr_t)&x[entries_per_block*block], entries_per_block, sizeof(float));
			
 
				-		starpu_vector_data_register(&y_handles[block], 0,
			
 
				-			(uintptr_t)&y[entries_per_block*block], entries_per_block, sizeof(float));
			
 
				+		starpu_vector_data_register(&_x_handles[block], 0,
			
 
				+			(uintptr_t)&_x[_entries_per_block*block], _entries_per_block, sizeof(float));
			
 
				+		starpu_vector_data_register(&_y_handles[block], 0,
			
 
				+			(uintptr_t)&_y[_entries_per_block*block], _entries_per_block, sizeof(float));
			
 
				 	}
			
 
				 
			
 
				-	starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(DOT_TYPE));
			
 
				+	starpu_variable_data_register(&_dot_handle, 0, (uintptr_t)&_dot, sizeof(DOT_TYPE));
			
 
				 
			
 
				 	/*
			
 
				 	 *	Compute dot product with StarPU
			
 
				 	 */
			
 
				-	starpu_data_set_reduction_methods(dot_handle, &redux_codelet, &init_codelet);
			
 
				+	starpu_data_set_reduction_methods(_dot_handle, &redux_codelet, &init_codelet);
			
 
				 
			
 
				-	for (block = 0; block < nblocks; block++)
			
 
				+	for (block = 0; block < _nblocks; block++)
			
 
				 	{
			
 
				 		struct starpu_task *task = starpu_task_create();
			
 
				 
			
 
				 		task->cl = &dot_codelet;
			
 
				 		task->destroy = 1;
			
 
				 
			
 
				-		task->handles[0] = x_handles[block];
			
 
				-		task->handles[1] = y_handles[block];
			
 
				-		task->handles[2] = dot_handle;
			
 
				+		task->handles[0] = _x_handles[block];
			
 
				+		task->handles[1] = _y_handles[block];
			
 
				+		task->handles[2] = _dot_handle;
			
 
				 
			
 
				 		ret = starpu_task_submit(task);
			
 
				 		if (ret == -ENODEV) goto enodev;
			
 
				 		STARPU_ASSERT(!ret);
			
 
				 	}
			
 
				 
			
 
				-	for (block = 0; block < nblocks; block++)
			
 
				+	for (block = 0; block < _nblocks; block++)
			
 
				 	{
			
 
				-		starpu_data_unregister(x_handles[block]);
			
 
				-		starpu_data_unregister(y_handles[block]);
			
 
				+		starpu_data_unregister(_x_handles[block]);
			
 
				+		starpu_data_unregister(_y_handles[block]);
			
 
				 	}
			
 
				-	starpu_data_unregister(dot_handle);
			
 
				+	starpu_data_unregister(_dot_handle);
			
 
				 
			
 
				-	FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
			
 
				+	FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, _dot, reference_dot - _dot);
			
 
				 
			
 
				 	starpu_cublas_shutdown();
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-        ret = starpu_opencl_unload_opencl(&opencl_program);
			
 
				+        ret = starpu_opencl_unload_opencl(&_opencl_program);
			
 
				         STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
			
 
				 #endif
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	free(x);
			
 
				-	free(y);
			
 
				-	free(x_handles);
			
 
				-	free(y_handles);
			
 
				+	free(_x);
			
 
				+	free(_y);
			
 
				+	free(_x_handles);
			
 
				+	free(_y_handles);
			
 
				 
			
 
				-	if (fabs(reference_dot - dot) < reference_dot * 1e-6)
			
 
				+	if (fabs(reference_dot - _dot) < reference_dot * 1e-6)
			
 
				 		return EXIT_SUCCESS;
			
 
				 	else
			
 
				 		return EXIT_FAILURE;
			
--- a/examples/reductions/minmax_reduction.c
+++ b/examples/reductions/minmax_reduction.c
@@ -20,11 +20,11 @@
 
				 #include <starpu.h>
			
 
				 
			
 
				 #ifdef STARPU_QUICK_CHECK
			
 
				-static unsigned nblocks = 512;
			
 
				-static unsigned entries_per_bock = 64;
			
 
				+static unsigned _nblocks = 512;
			
 
				+static unsigned _entries_per_bock = 64;
			
 
				 #else
			
 
				-static unsigned nblocks = 8192;
			
 
				-static unsigned entries_per_bock = 1024;
			
 
				+static unsigned _nblocks = 8192;
			
 
				+static unsigned _entries_per_bock = 1024;
			
 
				 #endif
			
 
				 
			
 
				 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
@@ -33,12 +33,12 @@ static unsigned entries_per_bock = 1024;
 
				 #define TYPE_MAX	DBL_MAX
			
 
				 #define TYPE_MIN	DBL_MIN
			
 
				 
			
 
				-static TYPE *x;
			
 
				-static starpu_data_handle_t *x_handles;
			
 
				+static TYPE *_x;
			
 
				+static starpu_data_handle_t *_x_handles;
			
 
				 
			
 
				 /* The first element (resp. second) stores the min element (resp. max). */
			
 
				-static TYPE minmax[2];
			
 
				-static starpu_data_handle_t minmax_handle;
			
 
				+static TYPE _minmax[2];
			
 
				+static starpu_data_handle_t _minmax_handle;
			
 
				 
			
 
				 /*
			
 
				  *	Codelet to create a neutral element
			
@@ -136,46 +136,46 @@ int main(int argc, char **argv)
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-	unsigned long nelems = nblocks*entries_per_bock;
			
 
				+	unsigned long nelems = _nblocks*_entries_per_bock;
			
 
				 	size_t size = nelems*sizeof(TYPE);
			
 
				 
			
 
				-	x = (TYPE *) malloc(size);
			
 
				-	x_handles = (starpu_data_handle_t *) calloc(nblocks, sizeof(starpu_data_handle_t));
			
 
				-	
			
 
				-	assert(x && x_handles);
			
 
				+	_x = (TYPE *) malloc(size);
			
 
				+	_x_handles = (starpu_data_handle_t *) calloc(_nblocks, sizeof(starpu_data_handle_t));
			
 
				+
			
 
				+	assert(_x && _x_handles);
			
 
				 
			
 
				 	/* Initialize the vector with random values */
			
 
				         starpu_srand48(0);
			
 
				 	for (i = 0; i < nelems; i++)
			
 
				-		x[i] = (TYPE)starpu_drand48();
			
 
				-	
			
 
				+		_x[i] = (TYPE)starpu_drand48();
			
 
				+
			
 
				 	unsigned block;
			
 
				-	for (block = 0; block < nblocks; block++)
			
 
				+	for (block = 0; block < _nblocks; block++)
			
 
				 	{
			
 
				-		uintptr_t block_start = (uintptr_t)&x[entries_per_bock*block];
			
 
				-		starpu_vector_data_register(&x_handles[block], 0, block_start,
			
 
				-						entries_per_bock, sizeof(TYPE));
			
 
				+		uintptr_t block_start = (uintptr_t)&_x[_entries_per_bock*block];
			
 
				+		starpu_vector_data_register(&_x_handles[block], 0, block_start,
			
 
				+					    _entries_per_bock, sizeof(TYPE));
			
 
				 	}
			
 
				 
			
 
				 	/* Initialize current min */
			
 
				-	minmax[0] = TYPE_MAX;
			
 
				+	_minmax[0] = TYPE_MAX;
			
 
				 
			
 
				 	/* Initialize current max */
			
 
				-	minmax[1] = TYPE_MIN;
			
 
				+	_minmax[1] = TYPE_MIN;
			
 
				 
			
 
				-	starpu_variable_data_register(&minmax_handle, 0, (uintptr_t)minmax, 2*sizeof(TYPE));
			
 
				+	starpu_variable_data_register(&_minmax_handle, 0, (uintptr_t)_minmax, 2*sizeof(TYPE));
			
 
				 
			
 
				 	/* Set the methods to define neutral elements and to perform the reduction operation */
			
 
				-	starpu_data_set_reduction_methods(minmax_handle, &minmax_redux_codelet, &minmax_init_codelet);
			
 
				+	starpu_data_set_reduction_methods(_minmax_handle, &minmax_redux_codelet, &minmax_init_codelet);
			
 
				 
			
 
				-	for (block = 0; block < nblocks; block++)
			
 
				+	for (block = 0; block < _nblocks; block++)
			
 
				 	{
			
 
				 		struct starpu_task *task = starpu_task_create();
			
 
				 
			
 
				 		task->cl = &minmax_codelet;
			
 
				 
			
 
				-		task->handles[0] = x_handles[block];
			
 
				-		task->handles[1] = minmax_handle;
			
 
				+		task->handles[0] = _x_handles[block];
			
 
				+		task->handles[1] = _minmax_handle;
			
 
				 
			
 
				 		ret = starpu_task_submit(task);
			
 
				 		if (ret)
			
@@ -186,19 +186,19 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	for (block = 0; block < nblocks; block++)
			
 
				+	for (block = 0; block < _nblocks; block++)
			
 
				 	{
			
 
				-		starpu_data_unregister(x_handles[block]);
			
 
				+		starpu_data_unregister(_x_handles[block]);
			
 
				 	}
			
 
				-	starpu_data_unregister(minmax_handle);
			
 
				+	starpu_data_unregister(_minmax_handle);
			
 
				 
			
 
				-	FPRINTF(stderr, "Min : %e\n", minmax[0]);
			
 
				-	FPRINTF(stderr, "Max : %e\n", minmax[1]);
			
 
				+	FPRINTF(stderr, "Min : %e\n", _minmax[0]);
			
 
				+	FPRINTF(stderr, "Max : %e\n", _minmax[1]);
			
 
				 
			
 
				-	STARPU_ASSERT(minmax[0] <= minmax[1]);
			
 
				+	STARPU_ASSERT(_minmax[0] <= _minmax[1]);
			
 
				 
			
 
				-	free(x);
			
 
				-	free(x_handles);
			
 
				+	free(_x);
			
 
				+	free(_x_handles);
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				 	return 0;
			
--- a/examples/spmd/vector_scal_spmd.c
+++ b/examples/spmd/vector_scal_spmd.c
@@ -62,7 +62,7 @@ void scal_cpu_func(void *buffers[], void *_args)
 
				 
			
 
				 
			
 
				 	for (i = 0; i < nel_worker; i++) {
			
 
				-		int rank = i + begin;
			
 
				+		rank = i + begin;
			
 
				 
			
 
				 		float v = val[rank];
			
 
				 		int j;
			
--- a/examples/tag_example/tag_example.c
+++ b/examples/tag_example/tag_example.c
@@ -86,25 +86,25 @@ static void parse_args(int argc, char **argv)
 
				 void callback_cpu(void *argcb);
			
 
				 static void express_deps(unsigned i, unsigned j, unsigned iter);
			
 
				 
			
 
				-static void tag_cleanup_grid(unsigned ni, unsigned nj, unsigned iter)
			
 
				+static void tag_cleanup_grid(unsigned piter)
			
 
				 {
			
 
				 	unsigned i,j;
			
 
				 
			
 
				 	for (j = 0; j < nj; j++)
			
 
				 	for (i = 0; i < ni; i++)
			
 
				 	{
			
 
				-		starpu_tag_remove(TAG(i,j,iter));
			
 
				+		starpu_tag_remove(TAG(i,j,piter));
			
 
				 	}
			
 
				 
			
 
				 
			
 
				 } 
			
 
				 
			
 
				-static int create_task_grid(unsigned iter)
			
 
				+static int create_task_grid(unsigned piter)
			
 
				 {
			
 
				 	unsigned i, j;
			
 
				 	int ret;
			
 
				 
			
 
				-/*	FPRINTF(stderr, "start iter %d...\n", iter); */
			
 
				+/*	FPRINTF(stderr, "start iter %d...\n", piter); */
			
 
				 	callback_cnt = (ni*nj);
			
 
				 
			
 
				 	/* create non-entry tasks */
			
@@ -119,10 +119,10 @@ static int create_task_grid(unsigned iter)
 
				 		task->cl_arg = NULL;
			
 
				 
			
 
				 		task->use_tag = 1;
			
 
				-		task->tag_id = TAG(i, j, iter);
			
 
				+		task->tag_id = TAG(i, j, piter);
			
 
				 
			
 
				 		/* express deps : (i,j) depends on (i-1, j-1) & (i-1, j+1) */
			
 
				-		express_deps(i, j, iter);
			
 
				+		express_deps(i, j, piter);
			
 
				 
			
 
				 		ret = starpu_task_submit(task);
			
 
				 		if (ret == -ENODEV) return 77;
			
@@ -140,7 +140,7 @@ static int create_task_grid(unsigned iter)
 
				 
			
 
				 		task->use_tag = 1;
			
 
				 		/* this is an entry task */
			
 
				-		task->tag_id = TAG(0, j, iter);
			
 
				+		task->tag_id = TAG(0, j, piter);
			
 
				 
			
 
				 		ret = starpu_task_submit(task);
			
 
				 		if (ret == -ENODEV) return 77;
			
@@ -160,7 +160,7 @@ void callback_cpu(void *argcb __attribute__ ((unused)))
 
				 		{
			
 
				 			/* cleanup old grids ... */
			
 
				 			if (iter > 2)
			
 
				-				tag_cleanup_grid(ni, nj, iter-2);
			
 
				+				tag_cleanup_grid(iter-2);
			
 
				 
			
 
				 			/* create a new iteration */
			
 
				 			create_task_grid(iter);
			
@@ -174,7 +174,7 @@ void cpu_codelet(void *descr[] __attribute__((unused)),
 
				 /*	printf("execute task\n"); */
			
 
				 }
			
 
				 
			
 
				-static void express_deps(unsigned i, unsigned j, unsigned iter)
			
 
				+static void express_deps(unsigned i, unsigned j, unsigned piter)
			
 
				 {
			
 
				 	if (j > 0)
			
 
				 	{
			
@@ -182,12 +182,12 @@ static void express_deps(unsigned i, unsigned j, unsigned iter)
 
				 		if (j < nj - 1)
			
 
				 		{
			
 
				 			/* (i,j+1) exists */
			
 
				-			starpu_tag_declare_deps(TAG(i,j,iter), 2, TAG(i-1,j-1,iter), TAG(i-1,j+1,iter));
			
 
				+			starpu_tag_declare_deps(TAG(i,j,piter), 2, TAG(i-1,j-1,piter), TAG(i-1,j+1,piter));
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
 
				 			/* (i,j+1) does not exist */
			
 
				-			starpu_tag_declare_deps(TAG(i,j,iter), 1, TAG(i-1,j-1,iter));
			
 
				+			starpu_tag_declare_deps(TAG(i,j,piter), 1, TAG(i-1,j-1,piter));
			
 
				 		}
			
 
				 	}
			
 
				 	else
			
@@ -196,7 +196,7 @@ static void express_deps(unsigned i, unsigned j, unsigned iter)
 
				 		if (j < nj - 1)
			
 
				 		{
			
 
				 			/* (i,j+1) exists */
			
 
				-			starpu_tag_declare_deps(TAG(i,j,iter), 1, TAG(i-1,j+1,iter));
			
 
				+			starpu_tag_declare_deps(TAG(i,j,piter), 1, TAG(i-1,j+1,piter));
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
@@ -229,8 +229,8 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
				 	if (ret == 0)
			
 
				 	     starpu_task_wait_for_all();
			
 
				 
			
 
				-	tag_cleanup_grid(ni, nj, nk-2);
			
 
				-	tag_cleanup_grid(ni, nj, nk-1);
			
 
				+	tag_cleanup_grid(nk-2);
			
 
				+	tag_cleanup_grid(nk-1);
			
 
				 
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/examples/tag_example/tag_example2.c
+++ b/examples/tag_example/tag_example2.c
@@ -66,7 +66,7 @@ static void parse_args(int argc, char **argv)
 
				 
			
 
				 void callback_cpu(void *argcb);
			
 
				 
			
 
				-static void tag_cleanup_grid(unsigned ni, unsigned iter)
			
 
				+static void tag_cleanup_grid(unsigned iter)
			
 
				 {
			
 
				 	unsigned i;
			
 
				 
			
@@ -140,13 +140,13 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
				 
			
 
				 		/* cleanup old grids ... */
			
 
				 		if (i > 1)
			
 
				-			tag_cleanup_grid(ni, i-1);
			
 
				+			tag_cleanup_grid(i-1);
			
 
				 	}
			
 
				 
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				 enodev:
			
 
				-	tag_cleanup_grid(ni, nk-1);
			
 
				+	tag_cleanup_grid(nk-1);
			
 
				 
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/examples/tag_example/tag_example3.c
+++ b/examples/tag_example/tag_example3.c
@@ -68,7 +68,7 @@ static void parse_args(int argc, char **argv)
 
				 
			
 
				 void callback_cpu(void *argcb);
			
 
				 
			
 
				-static void tag_cleanup_grid(unsigned ni, unsigned iter)
			
 
				+static void tag_cleanup_grid(unsigned iter)
			
 
				 {
			
 
				 	unsigned i;
			
 
				 
			
@@ -142,7 +142,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
				 
			
 
				 		/* cleanup old grids ... */
			
 
				 		if (i > 1)
			
 
				-			tag_cleanup_grid(ni, i-1);
			
 
				+			tag_cleanup_grid(i-1);
			
 
				 	}
			
 
				 
			
 
				 enodev:
			
--- a/examples/tag_example/tag_restartable.c
+++ b/examples/tag_example/tag_restartable.c
@@ -159,9 +159,12 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
				 
			
 
				 	starpu_shutdown();
			
 
				 	FPRINTF(stderr, "TEST DONE ...\n");
			
 
				-	return EXIT_SUCCESS;
			
 
				 
			
 
				 enodev:
			
 
				-	starpu_shutdown();
			
 
				-	return 77;
			
 
				+	for (i = 0; i < Nrolls; i++)
			
 
				+	{
			
 
				+		free(tasks[i]);
			
 
				+	}
			
 
				+
			
 
				+	return ret;
			
 
				 }
			
--- a/include/starpu_sched_ctx.h
+++ b/include/starpu_sched_ctx.h
@@ -64,7 +64,7 @@ struct starpu_sched_ctx_performance_counters
 
				 	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
			
 
				 	void (*notify_idle_end)(unsigned sched_ctx_id, int worker);
			
 
				 	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
			
 
				-	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops, size_t data_size);
			
 
				+	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, struct starpu_task *task, size_t data_size, uint32_t footprint);
			
 
				 	void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid);
			
 
				 	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint);
			
 
				 	void (*notify_delete_context)(unsigned sched_ctx);
			
@@ -72,7 +72,6 @@ struct starpu_sched_ctx_performance_counters
 
				 
			
 
				 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				 void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters);
			
 
				-void starpu_sched_ctx_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops, size_t data_size);
			
 
				 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
			
 
				 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				 
			
--- a/include/starpu_stdlib.h
+++ b/include/starpu_stdlib.h
@@ -25,12 +25,16 @@ extern "C"
 
				 {
			
 
				 #endif
			
 
				 
			
 
				+#define STARPU_MALLOC_PINNED	((1ULL)<<1)
			
 
				+#define STARPU_MALLOC_COUNT	((1ULL)<<3)
			
 
				+
			
 
				 void starpu_malloc_set_align(size_t align);
			
 
				+
			
 
				 int starpu_malloc(void **A, size_t dim);
			
 
				 int starpu_free(void *A);
			
 
				 
			
 
				-int starpu_malloc_count(void **A, size_t dim);
			
 
				-int starpu_free_count(void *A, size_t dim);
			
 
				+int starpu_malloc_flags(void **A, size_t dim, int flags);
			
 
				+int starpu_free_flags(void *A, size_t dim, int flags);
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }
			
--- a/mpi/examples/stencil/stencil5.c
+++ b/mpi/examples/stencil/stencil5.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -144,6 +144,17 @@ int main(int argc, char **argv)
 
				 	fprintf(stderr, "Waiting ...\n");
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				+	for(x = 0; x < X; x++)
			
 
				+	{
			
 
				+		for (y = 0; y < Y; y++)
			
 
				+		{
			
 
				+			if (data_handles[x][y])
			
 
				+			{
			
 
				+				starpu_data_unregister(data_handles[x][y]);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/mpi/src/starpu_mpi_insert_task.c
+++ b/mpi/src/starpu_mpi_insert_task.c
@@ -70,7 +70,7 @@ void _starpu_mpi_cache_empty_tables(int world_size)
 
				 
			
 
				 	if (_cache_enabled == 0) return;
			
 
				 
			
 
				-	_STARPU_DEBUG("Clearing htable for cache\n");
			
 
				+	_STARPU_MPI_DEBUG("Clearing htable for cache\n");
			
 
				 
			
 
				 	for(i=0 ; i<world_size ; i++)
			
 
				 	{
			
@@ -121,15 +121,15 @@ void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 
				 		if (avail)
			
 
				 		{
			
 
				 			_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data_handle);
			
 
				-			free(avail);
			
 
				 			HASH_DEL(_cache_sent_data[i], avail);
			
 
				+			free(avail);
			
 
				 		}
			
 
				 		HASH_FIND_PTR(_cache_received_data[i], &data_handle, avail);
			
 
				 		if (avail)
			
 
				 		{
			
 
				 			_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data_handle);
			
 
				-			free(avail);
			
 
				 			HASH_DEL(_cache_received_data[i], avail);
			
 
				+			free(avail);
			
 
				 		}
			
 
				 	}
			
 
				 }
			
@@ -327,6 +327,7 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 
				 					{
			
 
				 						_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data);
			
 
				 						HASH_DEL(_cache_sent_data[n], already_sent);
			
 
				+						free(already_sent);
			
 
				 					}
			
 
				 				}
			
 
				 			}
			
@@ -342,6 +343,7 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 
				 #endif
			
 
				 					_STARPU_MPI_DEBUG("Clearing receive cache for data %p\n", data);
			
 
				 					HASH_DEL(_cache_received_data[mpi_rank], already_received);
			
 
				+					free(already_received);
			
 
				 					starpu_data_invalidate_submit(data);
			
 
				 				}
			
 
				 			}
			
--- a/mpi/src/starpu_mpi_private.h
+++ b/mpi/src/starpu_mpi_private.h
@@ -33,8 +33,12 @@ extern "C" {
 
				 //#define STARPU_MPI_VERBOSE	1
			
 
				 
			
 
				 #ifdef STARPU_MPI_VERBOSE
			
 
				+static int _debug_rank=-1;
			
 
				+#endif
			
 
				+
			
 
				+#ifdef STARPU_MPI_VERBOSE
			
 
				 #  define _STARPU_MPI_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) { \
			
 
				-    						int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);       \
			
 
				+	                                        if (_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank); \
			
 
				                                                 fprintf(stderr, "%*s[%d][starpu_mpi][%s] " fmt , (_debug_rank+1)*4, "", _debug_rank, __func__ ,##args); \
			
 
				                                                 fflush(stderr); }} while(0);
			
 
				 #else
			
@@ -42,17 +46,17 @@ extern "C" {
 
				 #endif
			
 
				 
			
 
				 #define _STARPU_MPI_DISP(fmt, args ...) do { if (!getenv("STARPU_SILENT")) { \
			
 
				-    						int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);       \
			
 
				-                                                fprintf(stderr, "%*s[%d][starpu_mpi][%s] " fmt , (_debug_rank+1)*4, "", _debug_rank, __func__ ,##args); \
			
 
				-                                                fflush(stderr); }} while(0);
			
 
				+	       				     if (_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank); \
			
 
				+                                             fprintf(stderr, "%*s[%d][starpu_mpi][%s] " fmt , (_debug_rank+1)*4, "", _debug_rank, __func__ ,##args); \
			
 
				+                                             fflush(stderr); }} while(0);
			
 
				 
			
 
				 #ifdef STARPU_MPI_VERBOSE0
			
 
				 #  define _STARPU_MPI_LOG_IN()             do { if (!getenv("STARPU_SILENT")) { \
			
 
				-                                               int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);                        \
			
 
				+                                               if (_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);                        \
			
 
				                                                fprintf(stderr, "%*s[%d][starpu_mpi][%s] -->\n", (_debug_rank+1)*4, "", _debug_rank, __func__ ); \
			
 
				                                                fflush(stderr); }} while(0)
			
 
				 #  define _STARPU_MPI_LOG_OUT()            do { if (!getenv("STARPU_SILENT")) { \
			
 
				-                                               int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);                        \
			
 
				+                                               if (_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);                        \
			
 
				                                                fprintf(stderr, "%*s[%d][starpu_mpi][%s] <--\n", (_debug_rank+1)*4, "", _debug_rank, __func__ ); \
			
 
				                                                fflush(stderr); }} while(0)
			
 
				 #else
			
--- a/mpi/tests/insert_task_owner2.c
+++ b/mpi/tests/insert_task_owner2.c
@@ -116,6 +116,7 @@ int main(int argc, char **argv)
 
				         FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d %d %d\n", rank, values[0], values[1], values[2], values[3]);
			
 
				         FPRINTF(stderr, "[%d][end] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
			
 
				 
			
 
				+	free(values);
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/mpi/tests/mpi_irecv.c
+++ b/mpi/tests/mpi_irecv.c
@@ -74,6 +74,8 @@ int main(int argc, char **argv)
 
				 	}
			
 
				 
			
 
				 	starpu_data_unregister(tab_handle);
			
 
				+	free(tab);
			
 
				+
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/mpi/tests/mpi_isend.c
+++ b/mpi/tests/mpi_isend.c
@@ -75,6 +75,8 @@ int main(int argc, char **argv)
 
				 	}
			
 
				 
			
 
				 	starpu_data_unregister(tab_handle);
			
 
				+	free(tab);
			
 
				+
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/mpi/tests/mpi_probe.c
+++ b/mpi/tests/mpi_probe.c
@@ -66,7 +66,6 @@ int main(int argc, char **argv)
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				 	tab = malloc(SIZE*sizeof(float));
			
 
				-
			
 
				 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
			
 
				 
			
 
				 	int nloops = NITER;
			
@@ -92,6 +91,8 @@ int main(int argc, char **argv)
 
				 	}
			
 
				 
			
 
				 	starpu_data_unregister(tab_handle);
			
 
				+	free(tab);
			
 
				+
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/mpi/tests/mpi_test.c
+++ b/mpi/tests/mpi_test.c
@@ -81,6 +81,9 @@ int main(int argc, char **argv)
 
				 		while (!finished);
			
 
				 	}
			
 
				 
			
 
				+	starpu_data_unregister(tab_handle);
			
 
				+	free(tab);
			
 
				+
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/sched_ctx_hypervisor/include/sched_ctx_hypervisor.h
+++ b/sched_ctx_hypervisor/include/sched_ctx_hypervisor.h
@@ -167,7 +167,7 @@ struct sched_ctx_hypervisor_policy
 
				 	void (*size_ctxs)(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers);
			
 
				 	void (*handle_idle_cycle)(unsigned sched_ctx, int worker);
			
 
				 	void (*handle_pushed_task)(unsigned sched_ctx, int worker);
			
 
				-	void (*handle_poped_task)(unsigned sched_ctx, int worker);
			
 
				+	void (*handle_poped_task)(unsigned sched_ctx, int worker,struct starpu_task *task, uint32_t footprint);
			
 
				 	void (*handle_idle_end)(unsigned sched_ctx, int worker);
			
 
				 
			
 
				 	void (*handle_post_exec_hook)(unsigned sched_ctx, int task_tag);
			
@@ -230,6 +230,8 @@ unsigned sched_ctx_hypervisor_can_resize(unsigned sched_ctx);
 
				 /* compute an average value of the cpu/cuda velocity */
			
 
				 double sched_ctx_hypervisor_get_velocity_per_worker_type(struct sched_ctx_hypervisor_wrapper* sc_w, enum starpu_archtype arch);
			
 
				 
			
 
				+double sched_ctx_hypervisor_get_velocity(struct sched_ctx_hypervisor_wrapper *sc_w, enum starpu_archtype arch);
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/debit_lp_policy.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/debit_lp_policy.c
@@ -38,16 +38,8 @@ static unsigned _compute_max_velocity(int ns, int nw, double w_in_s[ns][nw], int
 
				 			w_in_s[s][w] = 0.0;
			
 
				 			int worker = workers == NULL ? w : workers[w];
			
 
				 
			
 
				-			velocity[s][w] = _get_velocity_per_worker(sc_w, worker);
			
 
				-			if(velocity[s][w] == -1.0)
			
 
				-			{
			
 
				-				enum starpu_archtype arch = starpu_worker_get_type(worker);
			
 
				-				velocity[s][w] = _get_velocity_per_worker_type(sc_w, arch);
			
 
				-				if(velocity[s][w] == -1.0)
			
 
				-					velocity[s][w] = sc_w->ref_velocity[worker];
			
 
				-				if(velocity[s][w] < 1.0)
			
 
				-					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
			
 
				-			}
			
 
				+			enum starpu_archtype arch = starpu_worker_get_type(worker);
			
 
				+			velocity[s][w] = sched_ctx_hypervisor_get_velocity(sc_w, arch);
			
 
				 		}
			
 
				 	}
			
 
				 	
			
@@ -231,7 +223,7 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double w_in_
 
				 }
			
 
				 
			
 
				 
			
 
				-static void debit_lp_handle_poped_task(unsigned sched_ctx, int worker)
			
 
				+static void debit_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
			
 
				 {
			
 
				 	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
			
 
				 	_get_velocity_per_worker(sc_w, worker);
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/gflops_rate_policy.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/gflops_rate_policy.c
@@ -289,7 +289,7 @@ static void gflops_rate_resize(unsigned sched_ctx)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-void gflops_rate_handle_poped_task(unsigned sched_ctx, int worker)
			
 
				+static void gflops_rate_handle_poped_task(unsigned sched_ctx, int worker)
			
 
				 {
			
 
				 	gflops_rate_resize(sched_ctx);
			
 
				 }
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
@@ -19,8 +19,6 @@
 
				 #include <math.h>
			
 
				 
			
 
				 static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double flops[ns], double tmax, double flops_on_w[ns][nw], double w_in_s[ns][nw], int *workers, unsigned integer);
			
 
				-static double _find_tmax(double t1, double t2);
			
 
				-
			
 
				 
			
 
				 static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_in_s[ns][nw], double flops_on_w[ns][nw], int *in_sched_ctxs, int *workers)
			
 
				 {
			
@@ -50,11 +48,7 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 
				 			if(velocity[s][w] == -1.0)
			
 
				 			{
			
 
				 				enum starpu_archtype arch = starpu_worker_get_type(worker);
			
 
				-				velocity[s][w] = _get_velocity_per_worker_type(sc_w, arch);
			
 
				-				if(velocity[s][w] == -1.0)
			
 
				-					velocity[s][w] = sc_w->ref_velocity[worker];
			
 
				-				if(velocity[s][w] == -1.0)
			
 
				-					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
			
 
				+				velocity[s][w] = sched_ctx_hypervisor_get_velocity(sc_w, arch);
			
 
				 			}
			
 
				 			
			
 
				 //			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
			
@@ -348,13 +342,7 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double flops
 
				 }
			
 
				 
			
 
				 
			
 
				-static double _find_tmax(double t1, double t2)
			
 
				-{
			
 
				-	return t1 + ((t2 - t1)/2);
			
 
				-}
			
 
				-
			
 
				-
			
 
				-static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker)
			
 
				+static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
			
 
				 {
			
 
				 	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
			
 
				 	_get_velocity_per_worker(sc_w, worker);
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/ispeed_policy.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/ispeed_policy.c
@@ -141,7 +141,7 @@ static int* _get_slowest_workers(unsigned sched_ctx, int *nworkers, enum starpu_
 
				 	return curr_workers;
			
 
				 }			
			
 
				 
			
 
				-static void ispeed_handle_poped_task(unsigned sched_ctx, int worker)
			
 
				+static void ispeed_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
			
 
				 {
			
 
				 	int ret = pthread_mutex_trylock(&act_hypervisor_mutex);
			
 
				 	if(ret != EBUSY)
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/lp2_policy.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/lp2_policy.c
@@ -22,7 +22,6 @@ static struct bound_task_pool *task_pools = NULL;
 
				 
			
 
				 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
			
 
				 static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned interger);
			
 
				-static double _find_tmax(double t1, double t2);
			
 
				 static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, double w_in_s[ns][nw], double tasks[nw][nt], int *sched_ctxs, int *workers)
			
 
				 {
			
 
				 	double draft_tasks[nw][nt];
			
@@ -45,12 +44,12 @@ static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, dou
 
				 
			
 
				 	/* smallest possible tmax, difficult to obtain as we
			
 
				 	   compute the nr of flops and not the tasks */
			
 
				-	double smallest_tmax = _lp_get_tmax(nw, workers);
			
 
				-	double tmax = smallest_tmax * ns * 2;
			
 
				-
			
 
				+	double possible_tmax = _lp_get_tmax(nw, workers);
			
 
				+	double smallest_tmax = possible_tmax / 2;
			
 
				+	double tmax = possible_tmax * ns;
			
 
				 	double res = 1.0;
			
 
				 	unsigned has_sol = 0;
			
 
				-	double tmin = 0.0;
			
 
				+	double tmin = smallest_tmax;
			
 
				 	double old_tmax = 0.0;
			
 
				 	unsigned found_sol = 0;
			
 
				 
			
@@ -114,118 +113,16 @@ static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, dou
 
				 	float timing = (float)(diff_s*1000000 + diff_us)/1000;
			
 
				 
			
 
				 //        fprintf(stdout, "nd = %d total time: %f ms \n", nd, timing);
			
 
				-
			
 
				 	return found_sol;
			
 
				 }
			
 
				 
			
 
				-static void _redistribute_resources_in_ctxs(int ns, int nw, int nt, double w_in_s[ns][nw], unsigned first_time, int *in_sched_ctxs, int *workers)
			
 
				-{
			
 
				-	int *sched_ctxs = in_sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : in_sched_ctxs;
			
 
				-	int s, s2, w;
			
 
				-
			
 
				-	for(s = 0; s < ns; s++)
			
 
				-	{
			
 
				-		int workers_to_add[nw], workers_to_remove[nw];
			
 
				-		int destination_ctx[nw][ns];
			
 
				-
			
 
				-		for(w = 0; w < nw; w++)
			
 
				-		{
			
 
				-			workers_to_add[w] = -1;
			
 
				-			workers_to_remove[w] = -1;
			
 
				-			for(s2 = 0; s2 < ns; s2++)
			
 
				-				destination_ctx[w][s2] = -1;
			
 
				-		}
			
 
				-
			
 
				-		int nadd = 0, nremove = 0;
			
 
				-
			
 
				-		for(w = 0; w < nw; w++)
			
 
				-		{
			
 
				-			enum starpu_perf_archtype arch = workers == NULL ? starpu_worker_get_type(w) :
			
 
				-				starpu_worker_get_type(workers[w]);
			
 
				-
			
 
				-			if(arch == STARPU_CPU_WORKER)
			
 
				-			{
			
 
				-				if(w_in_s[s][w] >= 0.5)
			
 
				-				{
			
 
				-					workers_to_add[nadd++] = workers == NULL ? w : workers[w];
			
 
				-				}
			
 
				-				else
			
 
				-				{
			
 
				-					workers_to_remove[nremove++] = workers == NULL ? w : workers[w];
			
 
				-					for(s2 = 0; s2 < ns; s2++)
			
 
				-						if(s2 != s && w_in_s[s2][w] >= 0.5)
			
 
				-							destination_ctx[w][s2] = 1;
			
 
				-						else
			
 
				-							destination_ctx[w][s2] = 0;
			
 
				-				}
			
 
				-			}
			
 
				-			else
			
 
				-			{
			
 
				-				if(w_in_s[s][w] >= 0.3)
			
 
				-				{
			
 
				-					workers_to_add[nadd++] = workers == NULL ? w : workers[w];
			
 
				-				}
			
 
				-				else
			
 
				-				{
			
 
				-					workers_to_remove[nremove++] = workers == NULL ? w : workers[w];
			
 
				-					for(s2 = 0; s2 < ns; s2++)
			
 
				-						if(s2 != s && w_in_s[s2][w] >= 0.3)
			
 
				-							destination_ctx[w][s2] = 1;
			
 
				-						else
			
 
				-							destination_ctx[w][s2] = 0;
			
 
				-				}
			
 
				-			}
			
 
				-
			
 
				-		}
			
 
				-
			
 
				-		sched_ctx_hypervisor_add_workers_to_sched_ctx(workers_to_add, nadd, sched_ctxs[s]);
			
 
				-		struct sched_ctx_hypervisor_policy_config *new_config = sched_ctx_hypervisor_get_config(sched_ctxs[s]);
			
 
				-		int i;
			
 
				-		for(i = 0; i < nadd; i++)
			
 
				-			new_config->max_idle[workers_to_add[i]] = new_config->max_idle[workers_to_add[i]] != MAX_IDLE_TIME ? new_config->max_idle[workers_to_add[i]] :  new_config->new_workers_max_idle;
			
 
				-
			
 
				-		if(!first_time)
			
 
				-		{
			
 
				-			/* do not remove workers if they can't go anywhere */
			
 
				-			int w2;
			
 
				-			unsigned found_one_dest[nremove];
			
 
				-			unsigned all_have_dest = 1;
			
 
				-			for(w2 = 0; w2 < nremove; w2++)
			
 
				-				found_one_dest[w2] = 0;
			
 
				-
			
 
				-			for(w2 = 0; w2 < nremove; w2++)
			
 
				-				for(s2 = 0; s2 < ns; s2++)
			
 
				-				{
			
 
				-					/* if the worker has to be removed we should find a destination
			
 
				-					   otherwise we are not interested */
			
 
				-					if(destination_ctx[w2][s2] == -1)
			
 
				-						found_one_dest[w2] = -1;
			
 
				-					if(destination_ctx[w2][s2] == 1)// && sched_ctx_hypervisor_can_resize(sched_ctxs[s2]))
			
 
				-					{
			
 
				-						found_one_dest[w2] = 1;
			
 
				-						break;
			
 
				-					}
			
 
				-				}
			
 
				-			for(w2 = 0; w2 < nremove; w2++)
			
 
				-			{
			
 
				-				if(found_one_dest[w2] == 0)
			
 
				-				{
			
 
				-					all_have_dest = 0;
			
 
				-					break;
			
 
				-				}
			
 
				-			}
			
 
				-			if(all_have_dest)
			
 
				-				sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_to_remove, nremove, sched_ctxs[s], 0);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-}
			
 
				 
			
 
				 static void _size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
			
 
				 {
			
 
				 	int ns = sched_ctxs == NULL ? sched_ctx_hypervisor_get_nsched_ctxs() : nsched_ctxs;
			
 
				-	int nw = workers == NULL ? starpu_worker_get_count() : nworkers; /* Number of different workers */
			
 
				+	int nw = workers == NULL ? (int)starpu_worker_get_count() : nworkers; /* Number of different workers */
			
 
				 	int nt = 0; /* Number of different kinds of tasks */
			
 
				+	pthread_mutex_lock(&mutex);
			
 
				 	struct bound_task_pool * tp;
			
 
				 	for (tp = task_pools; tp; tp = tp->next)
			
 
				 		nt++;
			
@@ -233,63 +130,10 @@ static void _size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nwor
 
				 	double w_in_s[ns][nw];
			
 
				 	double tasks[nw][nt];
			
 
				 	unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks, sched_ctxs, workers);
			
 
				+	pthread_mutex_unlock(&mutex);
			
 
				 	/* if we did find at least one solution redistribute the resources */
			
 
				 	if(found_sol)
			
 
				-	{
			
 
				-		int w, s;
			
 
				-		double nworkers[ns][2];
			
 
				-		int nworkers_rounded[ns][2];
			
 
				-		for(s = 0; s < ns; s++)
			
 
				-		{
			
 
				-			nworkers[s][0] = 0.0;
			
 
				-			nworkers[s][1] = 0.0;
			
 
				-			nworkers_rounded[s][0] = 0;
			
 
				-			nworkers_rounded[s][1] = 0;
			
 
				-			
			
 
				-		}
			
 
				-		
			
 
				-		for(s = 0; s < ns; s++)
			
 
				-		{
			
 
				-			for(w = 0; w < nw; w++)
			
 
				-			{
			
 
				-				enum starpu_perf_archtype arch = starpu_worker_get_type(w);
			
 
				-				
			
 
				-				if(arch == STARPU_CUDA_WORKER)
			
 
				-				{
			
 
				-					nworkers[s][0] += w_in_s[s][w];
			
 
				-					if(w_in_s[s][w] >= 0.3)
			
 
				-						nworkers_rounded[s][0]++;
			
 
				-				}
			
 
				-				else
			
 
				-				{
			
 
				-					nworkers[s][1] += w_in_s[s][w];
			
 
				-					if(w_in_s[s][w] > 0.5)
			
 
				-						nworkers_rounded[s][1]++;
			
 
				-				}
			
 
				-			}
			
 
				-		}
			
 
				-
			
 
				-		int *current_sched_ctxs = sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : 
			
 
				-			sched_ctxs;
			
 
				-
			
 
				-		unsigned has_workers = 0;
			
 
				-		for(s = 0; s < ns; s++)
			
 
				-		{
			
 
				-			int nworkers_ctx = sched_ctx_hypervisor_get_nworkers_ctx(current_sched_ctxs[s], 
			
 
				-									     STARPU_ANY_WORKER);
			
 
				-			if(nworkers_ctx != 0)
			
 
				-			{
			
 
				-				has_workers = 1;
			
 
				-				break;
			
 
				-			}
			
 
				-		}
			
 
				-		if(has_workers)
			
 
				-			_lp_redistribute_resources_in_ctxs(nsched_ctxs, 2, nworkers_rounded, nworkers);
			
 
				-		else
			
 
				-			_lp_distribute_resources_in_ctxs(sched_ctxs, nsched_ctxs, 2, nworkers_rounded, nworkers, workers, nworkers);
			
 
				-	
			
 
				-//		_redistribute_resources_in_ctxs(ns, nw, nt, w_in_s, 1, sched_ctxs, workers);
			
 
				-	}
			
 
				+		_lp_place_resources_in_ctx(ns, nw, w_in_s, sched_ctxs, workers, 1);
			
 
				 }
			
 
				 
			
 
				 static void size_if_required()
			
@@ -325,7 +169,7 @@ static void lp2_handle_submitted_job(struct starpu_task *task, uint32_t footprin
 
				 
			
 
				 	for (tp = task_pools; tp; tp = tp->next)
			
 
				 	{
			
 
				-		if (tp->cl == task->cl && tp->footprint == footprint && tp->sched_ctx_id == task->sched_ctx)
			
 
				+		if (tp && tp->cl == task->cl && tp->footprint == footprint && tp->sched_ctx_id == task->sched_ctx)
			
 
				 			break;
			
 
				 	}
			
 
				 
			
@@ -347,7 +191,38 @@ static void lp2_handle_submitted_job(struct starpu_task *task, uint32_t footprin
 
				 	size_if_required();
			
 
				 }
			
 
				 
			
 
				-static void _starpu_get_tasks_times(int nw, int nt, double times[nw][nt], int *workers)
			
 
				+static void _remove_task_from_pool(struct starpu_task *task, uint32_t footprint)
			
 
				+{
			
 
				+	/* count the tasks of the same type */
			
 
				+	pthread_mutex_lock(&mutex);
			
 
				+	struct bound_task_pool *tp = NULL;
			
 
				+
			
 
				+	for (tp = task_pools; tp; tp = tp->next)
			
 
				+	{
			
 
				+		if (tp && tp->cl == task->cl && tp->footprint == footprint && tp->sched_ctx_id == task->sched_ctx)
			
 
				+			break;
			
 
				+	}
			
 
				+
			
 
				+	if (tp)
			
 
				+	{
			
 
				+		if(tp->n > 1)
			
 
				+			tp->n--;
			
 
				+		else
			
 
				+		{
			
 
				+			struct bound_task_pool *prev_tp = NULL;
			
 
				+			for (prev_tp = task_pools; prev_tp; prev_tp = prev_tp->next)
			
 
				+			{
			
 
				+				if (prev_tp->next == tp)
			
 
				+					prev_tp->next = tp->next;
			
 
				+			}
			
 
				+
			
 
				+			free(tp);
			
 
				+		}
			
 
				+	}
			
 
				+	pthread_mutex_unlock(&mutex);
			
 
				+}
			
 
				+
			
 
				+static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers)
			
 
				 {
			
 
				         struct bound_task_pool *tp;
			
 
				         int w, t;
			
@@ -374,6 +249,8 @@ static void _starpu_get_tasks_times(int nw, int nt, double times[nw][nt], int *w
 
				 #include <glpk.h>
			
 
				 static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned integer)
			
 
				 {
			
 
				+	if(task_pools == NULL)
			
 
				+		return 0.0;
			
 
				 	struct bound_task_pool * tp;
			
 
				 	int t, w, s;
			
 
				 	glp_prob *lp;
			
@@ -393,7 +270,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
				 		int ia[ne], ja[ne];
			
 
				 		double ar[ne];
			
 
				 
			
 
				-		_starpu_get_tasks_times(nw, nt, times, workers);
			
 
				+		_get_tasks_times(nw, nt, times, workers);
			
 
				 
			
 
				 		/* Variables: number of tasks i assigned to worker j, and tmax */
			
 
				 		glp_add_cols(lp, nw*nt+ns*nw);
			
@@ -408,7 +285,13 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
				 				char name[32];
			
 
				 				snprintf(name, sizeof(name), "w%dt%dn", w, t);
			
 
				 				glp_set_col_name(lp, colnum(w, t), name);
			
 
				-				glp_set_col_bnds(lp, colnum(w, t), GLP_LO, 0., 0.);
			
 
				+/* 				if (integer) */
			
 
				+/*                                 { */
			
 
				+/*                                         glp_set_col_kind(lp, colnum(w, t), GLP_IV); */
			
 
				+/* 					glp_set_col_bnds(lp, colnum(w, t), GLP_LO, 0, 0); */
			
 
				+/*                                 } */
			
 
				+/* 				else */
			
 
				+					glp_set_col_bnds(lp, colnum(w, t), GLP_LO, 0.0, 0.0);
			
 
				 			}
			
 
				 		for(s = 0; s < ns; s++)
			
 
				 			for(w = 0; w < nw; w++)
			
@@ -439,6 +322,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
				 			if (!someone)
			
 
				 			{
			
 
				 				/* This task does not have any performance model at all, abort */
			
 
				+				printf("NO PERF MODELS\n");
			
 
				 				glp_delete_prob(lp);
			
 
				 				return 0.0;
			
 
				 			}
			
@@ -454,7 +338,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
				 				glp_set_row_name(lp, curr_row_idx+s*nw+w+1, title);
			
 
				 				for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
			
 
				 				{
			
 
				-					if(tp->sched_ctx_id == sched_ctxs[s])
			
 
				+					if((int)tp->sched_ctx_id == sched_ctxs[s])
			
 
				 					{
			
 
				 						ia[n] = curr_row_idx+s*nw+w+1;
			
 
				 						ja[n] = colnum(w, t);
			
@@ -529,6 +413,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
				 	int ret = glp_simplex(lp, &parm);
			
 
				 	if (ret)
			
 
				 	{
			
 
				+		printf("error in simplex\n");
			
 
				 		glp_delete_prob(lp);
			
 
				 		lp = NULL;
			
 
				 		return 0.0;
			
@@ -539,6 +424,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
				 	if(stat == GLP_NOFEAS)
			
 
				 	{
			
 
				 		glp_delete_prob(lp);
			
 
				+//		printf("no_sol in tmax = %lf\n", tmax);
			
 
				 		lp = NULL;
			
 
				 		return 0.0;
			
 
				 	}
			
@@ -554,6 +440,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
				 		/* if we don't have a solution return */
			
 
				 		if(stat == GLP_NOFEAS)
			
 
				 		{
			
 
				+//			printf("no int sol in tmax = %lf\n", tmax);
			
 
				 			glp_delete_prob(lp);
			
 
				 			lp = NULL;
			
 
				 			return 0.0;
			
@@ -563,8 +450,12 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
				 	double res = glp_get_obj_val(lp);
			
 
				 	for (w = 0; w < nw; w++)
			
 
				 		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
			
 
				-			tasks[w][t] = glp_get_col_prim(lp, colnum(w, t));
			
 
				-
			
 
				+/* 			if (integer) */
			
 
				+/* 				tasks[w][t] = (double)glp_mip_col_val(lp, colnum(w, t)); */
			
 
				+/*                         else */
			
 
				+				tasks[w][t] = glp_get_col_prim(lp, colnum(w, t));
			
 
				+	
			
 
				+//	printf("for tmax %lf\n", tmax);
			
 
				 	for(s = 0; s < ns; s++)
			
 
				 		for(w = 0; w < nw; w++)
			
 
				 		{
			
@@ -572,21 +463,18 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 
				 				w_in_s[s][w] = (double)glp_mip_col_val(lp, nw*nt+s*nw+w+1);
			
 
				                         else
			
 
				 				w_in_s[s][w] = glp_get_col_prim(lp, nw*nt+s*nw+w+1);
			
 
				+//			printf("w_in_s[%d][%d]=%lf\n", s, w, w_in_s[s][w]);
			
 
				 		}
			
 
				+//	printf("\n");
			
 
				 
			
 
				 	glp_delete_prob(lp);
			
 
				 	return res;
			
 
				 }
			
 
				 
			
 
				 
			
 
				-static double _find_tmax(double t1, double t2)
			
 
				-{
			
 
				-	return t1 + ((t2 - t1)/2);
			
 
				-}
			
 
				-
			
 
				-
			
 
				-static void lp2_handle_poped_task(unsigned sched_ctx, int worker)
			
 
				+static void lp2_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
			
 
				 {
			
 
				+	_remove_task_from_pool(task, footprint);
			
 
				 	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
			
 
				 
			
 
				 	int ret = pthread_mutex_trylock(&act_hypervisor_mutex);
			
@@ -603,6 +491,7 @@ static void lp2_handle_poped_task(unsigned sched_ctx, int worker)
 
				 			int ns = sched_ctx_hypervisor_get_nsched_ctxs();
			
 
				 			int nw = starpu_worker_get_count(); /* Number of different workers */
			
 
				 			int nt = 0; /* Number of different kinds of tasks */
			
 
				+			pthread_mutex_lock(&mutex);
			
 
				 			struct bound_task_pool * tp;
			
 
				 			for (tp = task_pools; tp; tp = tp->next)
			
 
				 				nt++;
			
@@ -611,48 +500,12 @@ static void lp2_handle_poped_task(unsigned sched_ctx, int worker)
 
				 			double tasks_per_worker[nw][nt];
			
 
				 
			
 
				 			unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks_per_worker, NULL, NULL);
			
 
				+			pthread_mutex_unlock(&mutex);
			
 
				 			/* if we did find at least one solution redistribute the resources */
			
 
				 			if(found_sol)
			
 
				-			{
			
 
				-				int w, s;
			
 
				-				double nworkers[ns][2];
			
 
				-				int nworkers_rounded[ns][2];
			
 
				-				for(s = 0; s < ns; s++)
			
 
				-				{
			
 
				-					nworkers[s][0] = 0.0;
			
 
				-					nworkers[s][1] = 0.0;
			
 
				-					nworkers_rounded[s][0] = 0;
			
 
				-					nworkers_rounded[s][1] = 0;
			
 
				-
			
 
				-				}
			
 
				+				_lp_place_resources_in_ctx(ns, nw, w_in_s, NULL, NULL, 0);
			
 
				 
			
 
				-				for(s = 0; s < ns; s++)
			
 
				-				{
			
 
				-					for(w = 0; w < nw; w++)
			
 
				-					{
			
 
				-						enum starpu_perf_archtype arch = starpu_worker_get_type(w);
			
 
				-
			
 
				-						if(arch == STARPU_CUDA_WORKER)
			
 
				-						{
			
 
				-							nworkers[s][0] += w_in_s[s][w];
			
 
				-							if(w_in_s[s][w] >= 0.3)
			
 
				-								nworkers_rounded[s][0]++;
			
 
				-						}
			
 
				-						else
			
 
				-						{
			
 
				-							nworkers[s][1] += w_in_s[s][w];
			
 
				-							if(w_in_s[s][w] > 0.5)
			
 
				-								nworkers_rounded[s][1]++;
			
 
				-						}
			
 
				-					}
			
 
				-				}
			
 
				-/* 				for(s = 0; s < ns; s++) */
			
 
				-/* 					printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0], */
			
 
				-/* 					       nworkers_rounded[s][1], nworkers_rounded[s][0]); */
			
 
				-
			
 
				-				_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
			
 
				 
			
 
				-			}
			
 
				 		}
			
 
				 		pthread_mutex_unlock(&act_hypervisor_mutex);
			
 
				 	}
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/lp_policy.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/lp_policy.c
@@ -19,7 +19,7 @@
 
				 
			
 
				 
			
 
				 #ifdef STARPU_HAVE_GLPK_H
			
 
				-static void lp_handle_poped_task(unsigned sched_ctx, int worker)
			
 
				+static void lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
			
 
				 {
			
 
				 	if(_velocity_gap_btw_ctxs())
			
 
				 	{
			
@@ -67,20 +67,20 @@ static void lp_size_ctxs(int *sched_ctxs, int ns, int *workers, int nworkers)
 
				 	double vmax = _lp_get_nworkers_per_ctx(nsched_ctxs, 2, nworkers_per_type, total_nw);
			
 
				 	if(vmax != 0.0)
			
 
				 	{
			
 
				-		printf("********size\n");
			
 
				-		int i;
			
 
				-		for( i = 0; i < nsched_ctxs; i++)
			
 
				-		{
			
 
				-			printf("ctx %d/worker type %d: n = %lf \n", i, 0, nworkers_per_type[i][0]);
			
 
				-			printf("ctx %d/worker type %d: n = %lf \n", i, 1, nworkers_per_type[i][1]);
			
 
				-		}
			
 
				+/*  		printf("********size\n"); */
			
 
				+/* 		int i; */
			
 
				+/* 		for( i = 0; i < nsched_ctxs; i++) */
			
 
				+/* 		{ */
			
 
				+/* 			printf("ctx %d/worker type %d: n = %lf \n", i, 0, nworkers_per_type[i][0]); */
			
 
				+/* 			printf("ctx %d/worker type %d: n = %lf \n", i, 1, nworkers_per_type[i][1]); */
			
 
				+/* 		} */
			
 
				 		int nworkers_per_type_rounded[nsched_ctxs][2];
			
 
				 		_lp_round_double_to_int(nsched_ctxs, 2, nworkers_per_type, nworkers_per_type_rounded);
			
 
				-      		for( i = 0; i < nsched_ctxs; i++)
			
 
				-		{
			
 
				-			printf("ctx %d/worker type %d: n = %d \n", i, 0, nworkers_per_type_rounded[i][0]);
			
 
				-			printf("ctx %d/worker type %d: n = %d \n", i, 1, nworkers_per_type_rounded[i][1]);
			
 
				-		}
			
 
				+/*       		for( i = 0; i < nsched_ctxs; i++) */
			
 
				+/* 		{ */
			
 
				+/* 			printf("ctx %d/worker type %d: n = %d \n", i, 0, nworkers_per_type_rounded[i][0]); */
			
 
				+/* 			printf("ctx %d/worker type %d: n = %d \n", i, 1, nworkers_per_type_rounded[i][1]); */
			
 
				+/* 		} */
			
 
				 		int *current_sched_ctxs = sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : 
			
 
				 			sched_ctxs;
			
 
				 
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c
@@ -22,6 +22,7 @@
 
				 
			
 
				 double _lp_compute_nworkers_per_ctx(int ns, int nw, double v[ns][nw], double flops[ns], double res[ns][nw], int  total_nw[nw])
			
 
				 {
			
 
				+	int integer = 1;
			
 
				 	int s, w;
			
 
				 	glp_prob *lp;
			
 
				 
			
@@ -49,7 +50,13 @@ double _lp_compute_nworkers_per_ctx(int ns, int nw, double v[ns][nw], double flo
 
				 			char name[32];
			
 
				 			snprintf(name, sizeof(name), "worker%dctx%d", w, s);
			
 
				 			glp_set_col_name(lp, n, name);
			
 
				-			glp_set_col_bnds(lp, n, GLP_LO, 0.3, 0.0);
			
 
				+			if (integer)
			
 
				+			{
			
 
				+				glp_set_col_kind(lp, n, GLP_IV);
			
 
				+				glp_set_col_bnds(lp, n, GLP_LO, 0, 0);
			
 
				+			}
			
 
				+			else
			
 
				+				glp_set_col_bnds(lp, n, GLP_LO, 0.0, 0.0);
			
 
				 			n++;
			
 
				 		}
			
 
				 	}
			
@@ -154,7 +161,42 @@ double _lp_compute_nworkers_per_ctx(int ns, int nw, double v[ns][nw], double flo
 
				 	glp_smcp parm;
			
 
				 	glp_init_smcp(&parm);
			
 
				 	parm.msg_lev = GLP_MSG_OFF;
			
 
				-	glp_simplex(lp, &parm);
			
 
				+	int ret = glp_simplex(lp, &parm);
			
 
				+	if (ret)
			
 
				+        {
			
 
				+                printf("error in simplex\n");
			
 
				+		glp_delete_prob(lp);
			
 
				+                lp = NULL;
			
 
				+                return 0.0;
			
 
				+        }
			
 
				+
			
 
				+	int stat = glp_get_prim_stat(lp);
			
 
				+        /* if we don't have a solution return */
			
 
				+        if(stat == GLP_NOFEAS)
			
 
				+        {
			
 
				+                glp_delete_prob(lp);
			
 
				+//              printf("no_sol in tmax = %lf\n", tmax);                                                                                                                                                             
			
 
				+                lp = NULL;
			
 
				+                return 0.0;
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+	if (integer)
			
 
				+        {
			
 
				+                glp_iocp iocp;
			
 
				+                glp_init_iocp(&iocp);
			
 
				+                iocp.msg_lev = GLP_MSG_OFF;
			
 
				+                glp_intopt(lp, &iocp);
			
 
				+                int stat = glp_mip_status(lp);
			
 
				+                /* if we don't have a solution return */
			
 
				+                if(stat == GLP_NOFEAS)
			
 
				+                {
			
 
				+//                      printf("no int sol in tmax = %lf\n", tmax);                                                                                                                                                 
			
 
				+                        glp_delete_prob(lp);
			
 
				+                        lp = NULL;
			
 
				+                        return 0.0;
			
 
				+                }
			
 
				+        }
			
 
				 
			
 
				 	double vmax = glp_get_obj_val(lp);
			
 
				 
			
@@ -163,7 +205,11 @@ double _lp_compute_nworkers_per_ctx(int ns, int nw, double v[ns][nw], double flo
 
				 	{
			
 
				 		for(w = 0; w < nw; w++)
			
 
				 		{
			
 
				-			res[s][w] = glp_get_col_prim(lp, n);
			
 
				+			if (integer)
			
 
				+                                res[s][w] = (double)glp_mip_col_val(lp, n);
			
 
				+			else
			
 
				+				res[s][w] = glp_get_col_prim(lp, n);
			
 
				+//			printf("%d/%d: res %lf flops = %lf v = %lf\n", w,s, res[s][w], flops[s], v[s][w]);
			
 
				 			n++;
			
 
				 		}
			
 
				 	}
			
@@ -186,20 +232,11 @@ double _lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_workers, double r
 
				 	for(i = 0; i < nsched_ctxs; i++)
			
 
				 	{
			
 
				 		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[i]);
			
 
				-		v[i][0] = _get_velocity_per_worker_type(sc_w, STARPU_CUDA_WORKER);
			
 
				-		if(v[i][0] == -1.0)
			
 
				-			v[i][0] = _get_ref_velocity_per_worker_type(sc_w, STARPU_CUDA_WORKER);
			
 
				-		if(v[i][0] == -1.0)
			
 
				-			v[i][0] = 20.0;
			
 
				-		v[i][1] = _get_velocity_per_worker_type(sc_w, STARPU_CPU_WORKER);
			
 
				-
			
 
				-		if(v[i][1] == -1.0)
			
 
				-			v[i][0] = _get_ref_velocity_per_worker_type(sc_w, STARPU_CPU_WORKER);
			
 
				-		if(v[i][1] == -1.0)
			
 
				-			v[i][1] = 200.0;
			
 
				+		v[i][0] = sched_ctx_hypervisor_get_velocity(sc_w, STARPU_CUDA_WORKER);
			
 
				+		v[i][1] = sched_ctx_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
			
 
				 
			
 
				 		flops[i] = sc_w->remaining_flops/1000000000; //sc_w->total_flops/1000000000; /* in gflops*/
			
 
				-//			printf("%d: flops %lf\n", sched_ctxs[i], flops[i]);
			
 
				+//		printf("%d: flops %lf\n", sched_ctxs[i], flops[i]);
			
 
				 	}
			
 
				 
			
 
				 	return 1/_lp_compute_nworkers_per_ctx(nsched_ctxs, ntypes_of_workers, v, flops, res, total_nw);
			
@@ -272,6 +309,8 @@ void _lp_round_double_to_int(int ns, int nw, double res[ns][nw], int res_rounded
 
				 					}
			
 
				 				}
			
 
				 			}
			
 
				+			else 
			
 
				+				res_rounded[s][w] = x;
			
 
				 		}
			
 
				 	}
			
 
				 }
			
@@ -509,9 +548,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 
				 
			
 
				 void _lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *workers, int nworkers)
			
 
				 {
			
 
				-	int current_nworkers = workers == NULL ? starpu_worker_get_count() : nworkers;
			
 
				-	int *current_sched_ctxs = sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : sched_ctxs;
			
 
				-
			
 
				+	unsigned current_nworkers = workers == NULL ? starpu_worker_get_count() : (unsigned)nworkers;
			
 
				 	int s, w;
			
 
				 	int start[nw];
			
 
				 	for(w = 0; w < nw; w++)
			
@@ -546,29 +583,23 @@ void _lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_r
 
				 				if(diff == 0.0)
			
 
				 				{
			
 
				 					int *workers_to_add = _get_first_workers_in_list(&start[w], workers, current_nworkers, &x, arch);
			
 
				-					if(x > 0)
			
 
				-					{
			
 
				-						int i;
			
 
				-						for(i = 0; i < x; i++)
			
 
				-							workers_add[nw_add++] = workers_to_add[i];
			
 
				-					}
			
 
				+					int i;
			
 
				+					for(i = 0; i < x; i++)
			
 
				+						workers_add[nw_add++] = workers_to_add[i];
			
 
				 					free(workers_to_add);
			
 
				 				}
			
 
				 				else
			
 
				 				{
			
 
				 					x+=1;
			
 
				 					int *workers_to_add = _get_first_workers_in_list(&start[w], workers, current_nworkers, &x, arch);
			
 
				-					if(x > 0)
			
 
				-					{
			
 
				-						int i;
			
 
				-						if(diff >= 0.3)
			
 
				-							for(i = 0; i < x; i++)
			
 
				-								workers_add[nw_add++] = workers_to_add[i];
			
 
				-						else
			
 
				-							for(i = 0; i < x-1; i++)
			
 
				-								workers_add[nw_add++] = workers_to_add[i];
			
 
				+					int i;
			
 
				+					if(diff >= 0.3)
			
 
				+						for(i = 0; i < x; i++)
			
 
				+							workers_add[nw_add++] = workers_to_add[i];
			
 
				+					else
			
 
				+						for(i = 0; i < x-1; i++)
			
 
				+							workers_add[nw_add++] = workers_to_add[i];
			
 
				 
			
 
				-					}
			
 
				 					free(workers_to_add);
			
 
				 				}
			
 
				 			}
			
@@ -582,3 +613,72 @@ void _lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_r
 
				 //		sched_ctx_hypervisor_stop_resize(current_sched_ctxs[s]);
			
 
				 	}
			
 
				 }
			
 
				+
			
 
				+/* nw = all the workers (either in a list or on all machine) */
			
 
				+void _lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][nw], int *sched_ctxs_input, int *workers_input, unsigned do_size)
			
 
				+{
			
 
				+	int w, s;
			
 
				+	double nworkers[ns][2];
			
 
				+	int nworkers_rounded[ns][2];
			
 
				+	for(s = 0; s < ns; s++)
			
 
				+	{
			
 
				+		nworkers[s][0] = 0.0;
			
 
				+		nworkers[s][1] = 0.0;
			
 
				+		nworkers_rounded[s][0] = 0;
			
 
				+		nworkers_rounded[s][1] = 0;
			
 
				+		
			
 
				+	}
			
 
				+	
			
 
				+	for(s = 0; s < ns; s++)
			
 
				+	{
			
 
				+		for(w = 0; w < nw; w++)
			
 
				+		{
			
 
				+			enum starpu_archtype arch = starpu_worker_get_type(w);
			
 
				+			
			
 
				+			if(arch == STARPU_CUDA_WORKER)
			
 
				+			{
			
 
				+				nworkers[s][0] += w_in_s[s][w];
			
 
				+				if(w_in_s[s][w] >= 0.3)
			
 
				+					nworkers_rounded[s][0]++;
			
 
				+			}
			
 
				+			else
			
 
				+			{
			
 
				+				nworkers[s][1] += w_in_s[s][w];
			
 
				+				if(w_in_s[s][w] > 0.5)
			
 
				+					nworkers_rounded[s][1]++;
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+	
			
 
				+/* 	for(s = 0; s < ns; s++) */
			
 
				+/* 		printf("%d: cpus = %d gpus = %d \n", s, nworkers_rounded[s][1], nworkers_rounded[s][0]); */
			
 
				+
			
 
				+	if(!do_size)
			
 
				+		_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
			
 
				+	else
			
 
				+	{
			
 
				+		int *current_sched_ctxs = sched_ctxs_input == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : sched_ctxs_input;
			
 
				+
			
 
				+		unsigned has_workers = 0;
			
 
				+		for(s = 0; s < ns; s++)
			
 
				+		{
			
 
				+			int nworkers_ctx = sched_ctx_hypervisor_get_nworkers_ctx(current_sched_ctxs[s], 
			
 
				+										 STARPU_ANY_WORKER);
			
 
				+			if(nworkers_ctx != 0)
			
 
				+			{
			
 
				+				has_workers = 1;
			
 
				+				break;
			
 
				+			}
			
 
				+		}
			
 
				+		if(has_workers)
			
 
				+			_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
			
 
				+		else
			
 
				+			_lp_distribute_resources_in_ctxs(current_sched_ctxs, ns, 2, nworkers_rounded, nworkers, workers_input, nw);
			
 
				+	}
			
 
				+	return;
			
 
				+}
			
 
				+
			
 
				+double _find_tmax(double t1, double t2)
			
 
				+{
			
 
				+	return t1 + ((t2 - t1)/2);
			
 
				+}
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.h
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.h
@@ -42,3 +42,9 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 
				 
			
 
				 /* make the first distribution of ressource in contexts by assigning the first x available ressources to each one */
			
 
				 void _lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *workers, int nworkers);
			
 
				+
			
 
				+/* place resources in contexts dependig on whether they already have workers or not */
			
 
				+void _lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][nw], int *sched_ctxs, int *workers, unsigned do_size);
			
 
				+
			
 
				+/* dichotomy btw t1 & t2 */
			
 
				+double _find_tmax(double t1, double t2);
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c
@@ -75,7 +75,7 @@ unsigned _find_poor_sched_ctx(unsigned req_sched_ctx, int nworkers_to_move)
 
				 	return sched_ctx;
			
 
				 }
			
 
				 
			
 
				-int* _get_first_workers_in_list(int *start, int *workers, int nall_workers,  unsigned *nworkers, enum starpu_archtype arch)
			
 
				+int* _get_first_workers_in_list(int *start, int *workers, int nall_workers,  int *nworkers, enum starpu_archtype arch)
			
 
				 {
			
 
				 	int *curr_workers = (int*)malloc((*nworkers)*sizeof(int));
			
 
				 
			
@@ -83,6 +83,9 @@ int* _get_first_workers_in_list(int *start, int *workers, int nall_workers,  uns
 
				 	int nfound_workers = 0;
			
 
				 	for(w = 0; w < nall_workers; w++)
			
 
				 	{
			
 
				+		if(nfound_workers >= *nworkers)
			
 
				+			break;
			
 
				+
			
 
				 		worker = workers == NULL ? w : workers[w];
			
 
				 		enum starpu_archtype curr_arch = starpu_worker_get_type(worker);
			
 
				 		if(arch == STARPU_ANY_WORKER || curr_arch == arch)
			
@@ -93,8 +96,6 @@ int* _get_first_workers_in_list(int *start, int *workers, int nall_workers,  uns
 
				 				*start = w+1;
			
 
				 			}
			
 
				 		}
			
 
				-		if(nfound_workers == *nworkers)
			
 
				-			break;
			
 
				 	}
			
 
				 	if(nfound_workers < *nworkers)
			
 
				 		*nworkers = nfound_workers;
			
@@ -519,33 +520,6 @@ double _get_velocity_per_worker_type(struct sched_ctx_hypervisor_wrapper* sc_w,
 
				         return -1.0;
			
 
				 }
			
 
				 
			
 
				-/* compute an average value of the cpu/cuda old velocity */
			
 
				-double _get_ref_velocity_per_worker_type(struct sched_ctx_hypervisor_wrapper* sc_w, enum starpu_archtype arch)
			
 
				-{
			
 
				-	double ref_velocity = 0.0;
			
 
				-	unsigned nw = 0;
			
 
				-
			
 
				-	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
			
 
				-	int worker;
			
 
				-
			
 
				-	struct starpu_sched_ctx_iterator it;
			
 
				-	if(workers->init_iterator)
			
 
				-		workers->init_iterator(workers, &it);
			
 
				-
			
 
				-	while(workers->has_next(workers, &it))
			
 
				-	{
			
 
				-		worker = workers->get_next(workers, &it);
			
 
				-		if(sc_w->ref_velocity[worker] > 1.0)
			
 
				-		{
			
 
				-			ref_velocity += sc_w->ref_velocity[worker];
			
 
				-			nw++;
			
 
				-		}
			
 
				-	}
			
 
				-	
			
 
				-	if(nw > 0)
			
 
				-		return ref_velocity / nw;
			
 
				-	return -1.0;
			
 
				-}
			
 
				 
			
 
				 /* check if there is a big velocity gap between the contexts */
			
 
				 int _velocity_gap_btw_ctxs()
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.h
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.h
@@ -38,7 +38,7 @@ unsigned _find_poor_sched_ctx(unsigned req_sched_ctx, int nworkers_to_move);
 
				 
			
 
				 int* _get_first_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype arch);
			
 
				 
			
 
				-int* _get_first_workers_in_list(int *start, int *workers, int nall_workers,  unsigned *nworkers, enum starpu_archtype arch);
			
 
				+int* _get_first_workers_in_list(int *start, int *workers, int nall_workers,  int *nworkers, enum starpu_archtype arch);
			
 
				 
			
 
				 unsigned _get_potential_nworkers(struct sched_ctx_hypervisor_policy_config *config, unsigned sched_ctx, enum starpu_archtype arch);
			
 
				 
			
--- a/sched_ctx_hypervisor/src/sched_ctx_hypervisor.c
+++ b/sched_ctx_hypervisor/src/sched_ctx_hypervisor.c
@@ -23,7 +23,7 @@ struct starpu_sched_ctx_performance_counters* perf_counters = NULL;
 
				 
			
 
				 static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time);
			
 
				 static void notify_pushed_task(unsigned sched_ctx, int worker);
			
 
				-static void notify_poped_task(unsigned sched_ctx, int worker, double flops, size_t data_size);
			
 
				+static void notify_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, size_t data_size, uint32_t footprint);
			
 
				 static void notify_post_exec_hook(unsigned sched_ctx, int taskid);
			
 
				 static void notify_idle_end(unsigned sched_ctx, int  worker);
			
 
				 static void notify_submitted_job(struct starpu_task *task, unsigned footprint);
			
@@ -209,25 +209,23 @@ void sched_ctx_hypervisor_start_resize(unsigned sched_ctx)
 
				 
			
 
				 static void _print_current_time()
			
 
				 {
			
 
				-/* 	double curr_time = starpu_timing_now(); */
			
 
				-/* 	double elapsed_time = (curr_time - hypervisor.start_executing_time) / 1000000.0; /\* in seconds *\/ */
			
 
				-/* 	fprintf(stdout, "Time: %lf\n", elapsed_time); */
			
 
				-/* 	int i; */
			
 
				-/* 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++) */
			
 
				-/* 	{ */
			
 
				-/* 		if(hypervisor.sched_ctxs[i] != STARPU_NMAX_SCHED_CTXS) */
			
 
				-/* 		{ */
			
 
				-/* 			struct sched_ctx_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[hypervisor.sched_ctxs[i]]; */
			
 
				-
			
 
				-/* 			double cpu_speed = sched_ctx_hypervisor_get_velocity_per_worker_type(sc_w, STARPU_CPU_WORKER); */
			
 
				-/* 			double cuda_speed = sched_ctx_hypervisor_get_velocity_per_worker_type(sc_w, STARPU_CUDA_WORKER); */
			
 
				-/* 			int ncpus = sched_ctx_hypervisor_get_nworkers_ctx(sc_w->sched_ctx, STARPU_CPU_WORKER); */
			
 
				-/* 			int ncuda = sched_ctx_hypervisor_get_nworkers_ctx(sc_w->sched_ctx, STARPU_CUDA_WORKER); */
			
 
				-/* 			cpu_speed = cpu_speed == -1.0 ? 0.0 : cpu_speed; */
			
 
				-/* 			cuda_speed = cuda_speed == -1.0 ? 0.0 : cuda_speed; */
			
 
				-/* 			fprintf(stdout, "%d: cpu_v = %lf cuda_v = %lf ncpus = %d ncuda = %d\n", hypervisor.sched_ctxs[i], cpu_speed, cuda_speed, ncpus, ncuda); */
			
 
				-/* 		} */
			
 
				-/* 	} */
			
 
				+	double curr_time = starpu_timing_now();
			
 
				+	double elapsed_time = (curr_time - hypervisor.start_executing_time) / 1000000.0; /* in seconds */
			
 
				+	fprintf(stdout, "Time: %lf\n", elapsed_time);
			
 
				+	int i;
			
 
				+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
			
 
				+	{
			
 
				+		if(hypervisor.sched_ctxs[i] != STARPU_NMAX_SCHED_CTXS)
			
 
				+		{
			
 
				+			struct sched_ctx_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[hypervisor.sched_ctxs[i]];
			
 
				+
			
 
				+			double cpu_speed = sched_ctx_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
			
 
				+			double cuda_speed = sched_ctx_hypervisor_get_velocity(sc_w, STARPU_CUDA_WORKER);
			
 
				+			int ncpus = sched_ctx_hypervisor_get_nworkers_ctx(sc_w->sched_ctx, STARPU_CPU_WORKER);
			
 
				+			int ncuda = sched_ctx_hypervisor_get_nworkers_ctx(sc_w->sched_ctx, STARPU_CUDA_WORKER);
			
 
				+			fprintf(stdout, "%d: cpu_v = %lf cuda_v = %lf ncpus = %d ncuda = %d\n", hypervisor.sched_ctxs[i], cpu_speed, cuda_speed, ncpus, ncuda);
			
 
				+		}
			
 
				+	}
			
 
				 	return;
			
 
				 }
			
 
				 
			
@@ -277,7 +275,7 @@ void sched_ctx_hypervisor_register_ctx(unsigned sched_ctx, double total_flops)
 
				 	pthread_mutex_unlock(&act_hypervisor_mutex);
			
 
				 }
			
 
				 
			
 
				-static int _get_first_free_sched_ctx(int *sched_ctxs, unsigned nsched_ctxs)
			
 
				+static int _get_first_free_sched_ctx(int *sched_ctxs, int nsched_ctxs)
			
 
				 {
			
 
				 	int i;
			
 
				 	for(i = 0; i < nsched_ctxs; i++)
			
@@ -318,7 +316,7 @@ void sched_ctx_hypervisor_unregister_ctx(unsigned sched_ctx)
 
				 	unsigned i;
			
 
				 	for(i = 0; i < hypervisor.nsched_ctxs; i++)
			
 
				 	{
			
 
				-		if(hypervisor.sched_ctxs[i] == sched_ctx)
			
 
				+		if(hypervisor.sched_ctxs[i] == (int)sched_ctx)
			
 
				 		{
			
 
				 			hypervisor.sched_ctxs[i] = STARPU_NMAX_SCHED_CTXS;
			
 
				 			break;
			
@@ -384,6 +382,34 @@ double sched_ctx_hypervisor_get_velocity_per_worker_type(struct sched_ctx_hyperv
 
				         return -1.0;
			
 
				 }
			
 
				 
			
 
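				+/* compute the average of the reference (old) velocities above 1.0 over the workers of the context, or -1.0 if there is none; NOTE(review): arch is not used to filter workers here - confirm this is intended */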
			
 
				+double _get_ref_velocity_per_worker_type(struct sched_ctx_hypervisor_wrapper* sc_w, enum starpu_archtype arch)
			
 
				+{
			
 
				+	double ref_velocity = 0.0;
			
 
				+	unsigned nw = 0;
			
 
				+
			
 
				+	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
			
 
				+	int worker;
			
 
				+
			
 
				+	struct starpu_sched_ctx_iterator it;
			
 
				+	if(workers->init_iterator)
			
 
				+		workers->init_iterator(workers, &it);
			
 
				+
			
 
				+	while(workers->has_next(workers, &it))
			
 
				+	{
			
 
				+		worker = workers->get_next(workers, &it);
			
 
				+		if(sc_w->ref_velocity[worker] > 1.0)
			
 
				+		{
			
 
				+			ref_velocity += sc_w->ref_velocity[worker];
			
 
				+			nw++;
			
 
				+		}
			
 
				+	}
			
 
				+	
			
 
				+	if(nw > 0)
			
 
				+		return ref_velocity / nw;
			
 
				+	return -1.0;
			
 
				+}
			
 
				+
			
 
				 static int get_ntasks( int *tasks)
			
 
				 {
			
 
				 	int ntasks = 0;
			
@@ -471,11 +497,11 @@ void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sche
 
				 	
			
 
				 	double start_time =  starpu_timing_now();
			
 
				 	sender_sc_w->start_time = start_time;
			
 
				-	sender_sc_w->remaining_flops = sender_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sender_sc_w);
			
 
				+//	sender_sc_w->remaining_flops = sender_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sender_sc_w);
			
 
				 	_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
			
 
				 
			
 
				 	receiver_sc_w->start_time = start_time;
			
 
				-	receiver_sc_w->remaining_flops = receiver_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(receiver_sc_w);
			
 
				+//	receiver_sc_w->remaining_flops = receiver_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(receiver_sc_w);
			
 
				 	_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
			
 
				 }
			
 
				 
			
@@ -486,7 +512,7 @@ void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned recei
 
				 	if(nworkers_to_move > 0 && hypervisor.resize[sender_sched_ctx])// && hypervisor.resize[receiver_sched_ctx])
			
 
				 	{
			
 
				 		_print_current_time();
			
 
				-		int j;
			
 
				+		unsigned j;
			
 
				 		printf("resize ctx %d with %d workers", sender_sched_ctx, nworkers_to_move);
			
 
				 		for(j = 0; j < nworkers_to_move; j++)
			
 
				 			printf(" %d", workers_to_move[j]);
			
@@ -497,7 +523,7 @@ void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned recei
 
				 
			
 
				 		if(now)
			
 
				 		{
			
 
				-			int j;
			
 
				+			unsigned j;
			
 
				 			printf("remove now from ctx %d:", sender_sched_ctx);
			
 
				 			for(j = 0; j < nworkers_to_move; j++)
			
 
				 				printf(" %d", workers_to_move[j]);
			
@@ -518,7 +544,7 @@ void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned recei
 
				 				hypervisor.sched_ctx_w[sender_sched_ctx].resize_ack.acked_workers = (int*)malloc(nworkers_to_move * sizeof(int));
			
 
				 
			
 
				 
			
 
				-				int i;
			
 
				+				unsigned i;
			
 
				 				for(i = 0; i < nworkers_to_move; i++)
			
 
				 				{
			
 
				 					hypervisor.sched_ctx_w[sender_sched_ctx].current_idle_time[workers_to_move[i]] = 0.0;
			
@@ -533,7 +559,7 @@ void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned recei
 
				 			}
			
 
				 		}
			
 
				 		struct sched_ctx_hypervisor_policy_config *new_config = sched_ctx_hypervisor_get_config(receiver_sched_ctx);
			
 
				-		int i;
			
 
				+		unsigned i;
			
 
				 		for(i = 0; i < nworkers_to_move; i++)
			
 
				 			new_config->max_idle[workers_to_move[i]] = new_config->max_idle[workers_to_move[i]] !=MAX_IDLE_TIME ? new_config->max_idle[workers_to_move[i]] :  new_config->new_workers_max_idle;
			
 
				 
			
@@ -546,14 +572,14 @@ void sched_ctx_hypervisor_add_workers_to_sched_ctx(int* workers_to_add, unsigned
 
				 	if(nworkers_to_add > 0 && hypervisor.resize[sched_ctx])
			
 
				 	{
			
 
				 		_print_current_time();
			
 
				-		int j;
			
 
				+		unsigned j;
			
 
				 		printf("add to ctx %d:", sched_ctx);
			
 
				 		for(j = 0; j < nworkers_to_add; j++)
			
 
				 			printf(" %d", workers_to_add[j]);
			
 
				 		printf("\n");
			
 
				 		starpu_sched_ctx_add_workers(workers_to_add, nworkers_to_add, sched_ctx);
			
 
				 		struct sched_ctx_hypervisor_policy_config *new_config = sched_ctx_hypervisor_get_config(sched_ctx);
			
 
				-		int i;
			
 
				+		unsigned i;
			
 
				 		for(i = 0; i < nworkers_to_add; i++)
			
 
				 			new_config->max_idle[workers_to_add[i]] = new_config->max_idle[workers_to_add[i]] != MAX_IDLE_TIME ? new_config->max_idle[workers_to_add[i]] :  new_config->new_workers_max_idle;
			
 
				 
			
@@ -571,12 +597,12 @@ void sched_ctx_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove,
 
				 	if(nworkers_to_remove > 0 && hypervisor.resize[sched_ctx] && hypervisor.allow_remove[sched_ctx])
			
 
				 	{
			
 
				 		_print_current_time();
			
 
				-		int nworkers=0;
			
 
				+		unsigned nworkers = 0;
			
 
				 		int workers[nworkers_to_remove];
			
 
				 
			
 
				 		if(now)
			
 
				 		{
			
 
				-			int j;
			
 
				+			unsigned j;
			
 
				 			printf("remove explicitley now from ctx %d:", sched_ctx);
			
 
				 			for(j = 0; j < nworkers_to_remove; j++)
			
 
				 				printf(" %d", workers_to_remove[j]);
			
@@ -587,7 +613,7 @@ void sched_ctx_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove,
 
				 		else
			
 
				 		{
			
 
				 			printf("try to remove from ctx %d: ", sched_ctx);
			
 
				-			int j;
			
 
				+			unsigned j;
			
 
				 			for(j = 0; j < nworkers_to_remove; j++)
			
 
				 				printf(" %d", workers_to_remove[j]);
			
 
				 			printf("\n");
			
@@ -596,14 +622,14 @@ void sched_ctx_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove,
 
				 			if(ret != EBUSY)
			
 
				 			{
			
 
				 
			
 
				-				int i;
			
 
				+				unsigned i;
			
 
				 				for(i = 0; i < nworkers_to_remove; i++)
			
 
				 					if(starpu_sched_ctx_contains_worker(workers_to_remove[i], sched_ctx))
			
 
				 						workers[nworkers++] = workers_to_remove[i];
			
 
				 
			
 
				 				hypervisor.sched_ctx_w[sched_ctx].resize_ack.receiver_sched_ctx = -1;
			
 
				 				hypervisor.sched_ctx_w[sched_ctx].resize_ack.moved_workers = (int*)malloc(nworkers_to_remove * sizeof(int));
			
 
				-				hypervisor.sched_ctx_w[sched_ctx].resize_ack.nmoved_workers = nworkers;
			
 
				+				hypervisor.sched_ctx_w[sched_ctx].resize_ack.nmoved_workers = (int)nworkers;
			
 
				 				hypervisor.sched_ctx_w[sched_ctx].resize_ack.acked_workers = (int*)malloc(nworkers_to_remove * sizeof(int));
			
 
				 
			
 
				 
			
@@ -638,7 +664,7 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 
				 			struct sched_ctx_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[hypervisor.sched_ctxs[i]];
			
 
				 			pthread_mutex_lock(&sc_w->mutex);
			
 
				 			unsigned only_remove = 0;
			
 
				-			if(sc_w->resize_ack.receiver_sched_ctx == -1 && hypervisor.sched_ctxs[i] != sched_ctx &&
			
 
				+			if(sc_w->resize_ack.receiver_sched_ctx == -1 && hypervisor.sched_ctxs[i] != (int)sched_ctx &&
			
 
				 			   sc_w->resize_ack.nmoved_workers > 0 && starpu_sched_ctx_contains_worker(worker, hypervisor.sched_ctxs[i]))
			
 
				 			{
			
 
				 				int j;
			
@@ -650,7 +676,7 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 
				 					}
			
 
				 			}
			
 
				 			if(only_remove ||
			
 
				-			   (sc_w->resize_ack.receiver_sched_ctx != -1 && sc_w->resize_ack.receiver_sched_ctx == sched_ctx))
			
 
				+			   (sc_w->resize_ack.receiver_sched_ctx != -1 && sc_w->resize_ack.receiver_sched_ctx == (int)sched_ctx))
			
 
				 			{
			
 
				 				resize_ack = &sc_w->resize_ack;
			
 
				 				sender_sched_ctx = hypervisor.sched_ctxs[i];
			
@@ -795,22 +821,22 @@ static void notify_pushed_task(unsigned sched_ctx, int worker)
 
				 }
			
 
				 
			
 
				 /* notifies the hypervisor that a task was poped from the queue of the worker */
			
 
				-static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flops, size_t data_size)
			
 
				+static void notify_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, size_t data_size, uint32_t footprint)
			
 
				 {
			
 
				 	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
			
 
				-	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += elapsed_flops;
			
 
				+	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += task->flops;
			
 
				 	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size ;
			
 
				 	hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[worker]++ ;
			
 
				-	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += elapsed_flops;
			
 
				-	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= elapsed_flops; //sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
			
 
				+	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += task->flops;
			
 
				+	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= task->flops; //sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
			
 
				 
			
 
				 	if(hypervisor.resize[sched_ctx])
			
 
				 	{	
			
 
				 		if(hypervisor.policy.handle_poped_task)
			
 
				-			hypervisor.policy.handle_poped_task(sched_ctx, worker);
			
 
				+			hypervisor.policy.handle_poped_task(sched_ctx, worker, task, footprint);
			
 
				 	}
			
 
				 	_ack_resize_completed(sched_ctx, worker);
			
 
				-	if(hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker] % 100 == 0)
			
 
				+	if(hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker] % 200 == 0)
			
 
				 		_print_current_time();
			
 
				 }
			
 
				 
			
@@ -820,7 +846,7 @@ static void notify_post_exec_hook(unsigned sched_ctx, int task_tag)
 
				 	STARPU_ASSERT(task_tag > 0);
			
 
				 
			
 
				 	unsigned conf_sched_ctx;
			
 
				-	int i;
			
 
				+	unsigned i;
			
 
				 	pthread_mutex_lock(&act_hypervisor_mutex);
			
 
				 	unsigned ns = hypervisor.nsched_ctxs;
			
 
				 	pthread_mutex_unlock(&act_hypervisor_mutex);
			
@@ -887,10 +913,10 @@ static void notify_delete_context(unsigned sched_ctx)
 
				 void sched_ctx_hypervisor_size_ctxs(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
			
 
				 {
			
 
				 	pthread_mutex_lock(&act_hypervisor_mutex);
			
 
				-	int curr_nsched_ctxs = sched_ctxs == NULL ? hypervisor.nsched_ctxs : nsched_ctxs;
			
 
				+	unsigned curr_nsched_ctxs = sched_ctxs == NULL ? hypervisor.nsched_ctxs : nsched_ctxs;
			
 
				 	int *curr_sched_ctxs = sched_ctxs == NULL ? hypervisor.sched_ctxs : sched_ctxs;
			
 
				 	pthread_mutex_unlock(&act_hypervisor_mutex);
			
 
				-	int s;
			
 
				+	unsigned s;
			
 
				 	for(s = 0; s < curr_nsched_ctxs; s++)
			
 
				 		hypervisor.resize[curr_sched_ctxs[s]] = 1;
			
 
				 
			
@@ -945,3 +971,15 @@ void sched_ctx_hypervisor_free_size_req(void)
 
				 		hypervisor.sr = NULL;
			
 
				 	}
			
 
				 }
			
 
				+
			
 
				+double sched_ctx_hypervisor_get_velocity(struct sched_ctx_hypervisor_wrapper *sc_w, enum starpu_archtype arch)
			
 
				+{
			
 
				+
			
 
				+	double velocity = sched_ctx_hypervisor_get_velocity_per_worker_type(sc_w, arch);
			
 
				+	if(velocity == -1.0)
			
 
				+		velocity = _get_ref_velocity_per_worker_type(sc_w, arch);
			
 
				+	if(velocity == -1.0)
			
 
				+		velocity = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
			
 
				+       
			
 
				+	return velocity;
			
 
				+}
			
--- a/socl/examples/basic/basic.c
+++ b/socl/examples/basic/basic.c
@@ -175,9 +175,9 @@ int main(int UNUSED(argc), char** UNUSED(argv)) {
 
				 
			
 
				    printf("Data...\n");
			
 
				    {
			
 
				-      int i;
			
 
				-      for (i=0; i<SIZE; i++) {
			
 
				-        printf("%f ", d[i]);
			
 
				+      int j;
			
 
				+      for (j=0; j<SIZE; j++) {
			
 
				+        printf("%f ", d[j]);
			
 
				       }
			
 
				       printf("\n");
			
 
				    }
			
--- a/socl/examples/matmul/matmul.c
+++ b/socl/examples/matmul/matmul.c
@@ -25,7 +25,7 @@
 
				 #include <sys/time.h>

			
 
				 

			
 
				 #define error(...) do { fprintf(stderr, "Error: " __VA_ARGS__); exit(EXIT_FAILURE); } while(0)

			
 
				-#define check(exp) do { cl_int err = exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); }} while(0)

			
 
				+#define check(exp) do { err = exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); }} while(0)

			
 
				 #define check2(exp) exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); }

			
 
				 

			
 
				 // Thread block size

			
@@ -38,7 +38,7 @@
 
				 #define HA (512L * BLOCK_SIZE) // Matrix A height

			
 
				 #define WB (128L * BLOCK_SIZE) // Matrix B width

			
 
				 #define HB WA  // Matrix B height

			
 
				-#define WC WB  // Matrix C width 

			
 
				+#define WC WB  // Matrix C width

			
 
				 #define HC HA  // Matrix C height

			
 
				 #define BLOCKS (HA / WORK_SIZE)

			
 
				 

			
@@ -236,7 +236,7 @@ int main(int argc, const char** argv) {
 
				 	for (p=0; p<platform_count; p++) {

			
 
				 		cl_platform_id platform = platforms[p];

			
 
				 

			
 
				-		cl_int err = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]);

			
 
				+		err = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]);

			
 
				 		if (err == CL_DEVICE_NOT_FOUND) {

			
 
				 			devs[p] = 0;

			
 
				 			continue;

			
@@ -260,7 +260,7 @@ int main(int argc, const char** argv) {
 
				 		cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};

			
 
				 		check2(ctx[p] = clCreateContext(properties, devs[p], devices[p], NULL, NULL, &err));

			
 
				 

			
 
				-		for(i = 0; i < devs[p]; ++i) 

			
 
				+		for(i = 0; i < devs[p]; ++i)

			
 
				 		{

			
 
				 			cl_device_id device = devices[p][i];

			
 
				 			char name[2048];

			
@@ -281,7 +281,7 @@ int main(int argc, const char** argv) {
 
				 

			
 
				 	cl_kernel multiplicationKernel[platform_count];

			
 
				 

			
 
				-	printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n", 

			
 
				+	printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n",

			
 
				 			(unsigned long)WA, (unsigned long)HA, (unsigned long)WB, (unsigned long)HB, (unsigned long)WC, (unsigned long)HC);

			
 
				 

			
 
				 	// allocate host memory for matrices A, B and C

			
@@ -333,34 +333,34 @@ int main(int argc, const char** argv) {
 
				 		}

			
 
				 	}

			
 
				 

			
 
				-	for(i=0; i < BLOCKS; ++i) 

			
 
				+	for(i=0; i < BLOCKS; ++i)

			
 
				 	{

			
 
				 		int d = i % device_count;

			
 
				-		cl_uint p = 0;

			
 
				+		cl_uint platform = 0;

			
 
				 

			
 
				 		// determine device platform

			
 
				 		int dev = d;

			
 
				-		for (p = 0; p < platform_count; p++) {

			
 
				-			if ((cl_int)(dev - devs[p]) < 0)

			
 
				+		for (platform = 0; platform < platform_count; platform++) {

			
 
				+			if ((cl_int)(dev - devs[platform]) < 0)

			
 
				 				break;

			
 
				-			dev -= devs[p];

			
 
				+			dev -= devs[platform];

			
 
				 		}

			
 
				 

			
 
				-		workSize[i] = (i < sizeMod) ? sizePerGPU+1 : sizePerGPU;        

			
 
				+		workSize[i] = (i < sizeMod) ? sizePerGPU+1 : sizePerGPU;

			
 
				 

			
 
				-		check2(d_A[i] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY  | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err));

			
 
				-		check2(d_C[i] = clCreateBuffer(ctx[p], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err));

			
 
				+		check2(d_A[i] = clCreateBuffer(ctx[platform], CL_MEM_READ_ONLY  | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err));

			
 
				+		check2(d_C[i] = clCreateBuffer(ctx[platform], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err));

			
 
				 

			
 
				-		check(clSetKernelArg(multiplicationKernel[p], 0, sizeof(cl_int), &workSize[i]));

			
 
				-		check(clSetKernelArg(multiplicationKernel[p], 1, sizeof(cl_int), &workSize[i]));

			
 
				-		check(clSetKernelArg(multiplicationKernel[p], 2, sizeof(cl_int), &workSize[i]));

			
 
				-		check(clSetKernelArg(multiplicationKernel[p], 3, sizeof(cl_mem), (void *) &d_A[i]));

			
 
				-		check(clSetKernelArg(multiplicationKernel[p], 4, sizeof(cl_mem), (void *) &d_B[d]));

			
 
				-		check(clSetKernelArg(multiplicationKernel[p], 5, sizeof(cl_mem), (void *) &d_C[i]));

			
 
				+		check(clSetKernelArg(multiplicationKernel[platform], 0, sizeof(cl_int), &workSize[i]));

			
 
				+		check(clSetKernelArg(multiplicationKernel[platform], 1, sizeof(cl_int), &workSize[i]));

			
 
				+		check(clSetKernelArg(multiplicationKernel[platform], 2, sizeof(cl_int), &workSize[i]));

			
 
				+		check(clSetKernelArg(multiplicationKernel[platform], 3, sizeof(cl_mem), (void *) &d_A[i]));

			
 
				+		check(clSetKernelArg(multiplicationKernel[platform], 4, sizeof(cl_mem), (void *) &d_B[d]));

			
 
				+		check(clSetKernelArg(multiplicationKernel[platform], 5, sizeof(cl_mem), (void *) &d_C[i]));

			
 
				 

			
 
				 		size_t globalWorkSize[] = {roundUp(BLOCK_SIZE,WC), roundUp(BLOCK_SIZE,workSize[i])};

			
 
				 

			
 
				-		check(clEnqueueNDRangeKernel(commandQueue[p][dev], multiplicationKernel[p], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]));

			
 
				+		check(clEnqueueNDRangeKernel(commandQueue[platform][dev], multiplicationKernel[platform], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]));

			
 
				 

			
 
				 		// Non-blocking copy of result from device to host

			
 
				 		 cqs[i] = commandQueue[p][dev];

			
@@ -386,7 +386,7 @@ int main(int argc, const char** argv) {
 
				 	double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB;

			
 
				 	double gflops = 1.0e-9 * dNumOps/dSeconds;

			
 
				 

			
 
				-	printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Blocks = %ld, Workgroup = %zu\n", 

			
 
				+	printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Blocks = %ld, Workgroup = %zu\n",

			
 
				 			gflops, dSeconds, dNumOps, device_count, BLOCKS, localWorkSize[0] * localWorkSize[1]);

			
 
				 

			
 
				 	// compute reference solution

			
@@ -405,12 +405,12 @@ int main(int argc, const char** argv) {
 
				 		free(reference);

			
 
				 	}

			
 
				 

			
 
				-	for(i = 0; i < BLOCKS; i++) 

			
 
				+	for(i = 0; i < BLOCKS; i++)

			
 
				 	{

			
 
				 		clEnqueueUnmapMemObject(cqs[i], d_C[i], ptrs[i], 0, NULL, NULL);

			
 
				 	}

			
 
				 

			
 
				-	for(i = 0; i < BLOCKS; i++) 

			
 
				+	for(i = 0; i < BLOCKS; i++)

			
 
				 	{

			
 
				 		clFinish(cqs[i]);

			
 
				 	}

			
@@ -419,7 +419,7 @@ int main(int argc, const char** argv) {
 
				 		clReleaseMemObject(d_B[i]);

			
 
				 	}

			
 
				 

			
 
				-	for(i = 0; i < BLOCKS; i++) 

			
 
				+	for(i = 0; i < BLOCKS; i++)

			
 
				 	{

			
 
				 		clReleaseMemObject(d_A[i]);

			
 
				 		clReleaseMemObject(d_C[i]);

			
@@ -436,7 +436,7 @@ int main(int argc, const char** argv) {
 
				 		check(clReleaseProgram(program[p]));

			
 
				 		check(clReleaseContext(ctx[p]));

			
 
				 		cl_uint k;

			
 
				-		for(k = 0; k < devs[p]; ++k) 

			
 
				+		for(k = 0; k < devs[p]; ++k)

			
 
				 		{

			
 
				 			check(clReleaseCommandQueue(commandQueue[p][k]));

			
 
				 		}

			
@@ -460,7 +460,7 @@ void printDiff(TYPE *data1, TYPE *data2, int width, int height, int listLength,
 
				 		for (i = 0; i < width; i++) {

			
 
				 			k = j * width + i;

			
 
				 			float diff = fabs(data1[k] - data2[k]);

			
 
				-			if (diff > listTol) {                

			
 
				+			if (diff > listTol) {

			
 
				 				if (error_count < listLength) {

			
 
				 					printf("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], diff);

			
 
				 				}

			
@@ -493,4 +493,3 @@ void computeReference(TYPE* C, const TYPE* A, const TYPE* B, unsigned int hA, un
 
				 			C[i * wB + j] = (TYPE)sum;

			
 
				 		}

			
 
				 }

			
 
				-

			
--- a/socl/examples/testmap/testmap.c
+++ b/socl/examples/testmap/testmap.c
@@ -186,9 +186,9 @@ int main(int UNUSED(argc), char** UNUSED(argv)) {
 
				 
			
 
				    printf("Data...\n");
			
 
				    {
			
 
				-      int i;
			
 
				-      for (i=0; i<SIZE; i++) {
			
 
				-        printf("%f ", d[i]);
			
 
				+      int j;
			
 
				+      for (j=0; j<SIZE; j++) {
			
 
				+        printf("%f ", d[j]);
			
 
				       }
			
 
				       printf("\n");
			
 
				    }
			
--- a/src/common/fxt.h
+++ b/src/common/fxt.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009-2013  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -367,49 +367,49 @@ do {										\
 
				 #else // !STARPU_USE_FXT
			
 
				 
			
 
				 /* Dummy macros in case FxT is disabled */
			
 
				-#define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0);
			
 
				-#define _STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0);
			
 
				-#define _STARPU_TRACE_WORKER_INIT_END		do {} while(0);
			
 
				-#define _STARPU_TRACE_START_CODELET_BODY(job)	do {} while(0);
			
 
				-#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a)	do {} while(0);
			
 
				-#define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0);
			
 
				-#define _STARPU_TRACE_END_CALLBACK(job)		do {} while(0);
			
 
				-#define _STARPU_TRACE_JOB_PUSH(task, prio)	do {} while(0);
			
 
				-#define _STARPU_TRACE_JOB_POP(task, prio)	do {} while(0);
			
 
				-#define _STARPU_TRACE_UPDATE_TASK_CNT(counter)	do {} while(0);
			
 
				-#define _STARPU_TRACE_START_FETCH_INPUT(job)	do {} while(0);
			
 
				-#define _STARPU_TRACE_END_FETCH_INPUT(job)	do {} while(0);
			
 
				-#define _STARPU_TRACE_START_PUSH_OUTPUT(job)	do {} while(0);
			
 
				-#define _STARPU_TRACE_END_PUSH_OUTPUT(job)	do {} while(0);
			
 
				-#define _STARPU_TRACE_TAG(tag, job)	do {} while(0);
			
 
				-#define _STARPU_TRACE_TAG_DEPS(a, b)	do {} while(0);
			
 
				-#define _STARPU_TRACE_TASK_DEPS(a, b)		do {} while(0);
			
 
				-#define _STARPU_TRACE_GHOST_TASK_DEPS(a, b)	do {} while(0);
			
 
				-#define _STARPU_TRACE_TASK_DONE(a)		do {} while(0);
			
 
				-#define _STARPU_TRACE_TAG_DONE(a)		do {} while(0);
			
 
				-#define _STARPU_TRACE_DATA_COPY(a, b, c)		do {} while(0);
			
 
				-#define _STARPU_TRACE_START_DRIVER_COPY(a,b,c,d)	do {} while(0);
			
 
				-#define _STARPU_TRACE_END_DRIVER_COPY(a,b,c,d)	do {} while(0);
			
 
				-#define _STARPU_TRACE_START_DRIVER_COPY_ASYNC(a,b)	do {} while(0);
			
 
				-#define _STARPU_TRACE_END_DRIVER_COPY_ASYNC(a,b)	do {} while(0);
			
 
				-#define _STARPU_TRACE_WORK_STEALING(a, b)	do {} while(0);
			
 
				-#define _STARPU_TRACE_WORKER_DEINIT_START	do {} while(0);
			
 
				-#define _STARPU_TRACE_WORKER_DEINIT_END(a)	do {} while(0);
			
 
				-#define _STARPU_TRACE_WORKER_SLEEP_START		do {} while(0);
			
 
				-#define _STARPU_TRACE_WORKER_SLEEP_END		do {} while(0);
			
 
				-#define _STARPU_TRACE_USER_DEFINED_START		do {} while(0);
			
 
				-#define _STARPU_TRACE_USER_DEFINED_END		do {} while(0);
			
 
				-#define _STARPU_TRACE_START_ALLOC(memnode)	do {} while(0);
			
 
				-#define _STARPU_TRACE_END_ALLOC(memnode)		do {} while(0);
			
 
				-#define _STARPU_TRACE_START_ALLOC_REUSE(a)	do {} while(0);
			
 
				-#define _STARPU_TRACE_END_ALLOC_REUSE(a)		do {} while(0);
			
 
				-#define _STARPU_TRACE_START_MEMRECLAIM(memnode)	do {} while(0);
			
 
				-#define _STARPU_TRACE_END_MEMRECLAIM(memnode)	do {} while(0);
			
 
				-#define _STARPU_TRACE_START_PROGRESS(memnode)	do {} while(0);
			
 
				-#define _STARPU_TRACE_END_PROGRESS(memnode)	do {} while(0);
			
 
				-#define _STARPU_TRACE_USER_EVENT(code)		do {} while(0);
			
 
				-#define _STARPU_TRACE_SET_PROFILING(status)	do {} while(0);
			
 
				-#define _STARPU_TRACE_TASK_WAIT_FOR_ALL		do {} while(0);
			
 
				+#define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0)
			
 
				+#define _STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0)
			
 
				+#define _STARPU_TRACE_WORKER_INIT_END		do {} while(0)
			
 
				+#define _STARPU_TRACE_START_CODELET_BODY(job)	do {} while(0)
			
 
				+#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a)	do {} while(0)
			
 
				+#define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0)
			
 
				+#define _STARPU_TRACE_END_CALLBACK(job)		do {} while(0)
			
 
				+#define _STARPU_TRACE_JOB_PUSH(task, prio)	do {} while(0)
			
 
				+#define _STARPU_TRACE_JOB_POP(task, prio)	do {} while(0)
			
 
				+#define _STARPU_TRACE_UPDATE_TASK_CNT(counter)	do {} while(0)
			
 
				+#define _STARPU_TRACE_START_FETCH_INPUT(job)	do {} while(0)
			
 
				+#define _STARPU_TRACE_END_FETCH_INPUT(job)	do {} while(0)
			
 
				+#define _STARPU_TRACE_START_PUSH_OUTPUT(job)	do {} while(0)
			
 
				+#define _STARPU_TRACE_END_PUSH_OUTPUT(job)	do {} while(0)
			
 
				+#define _STARPU_TRACE_TAG(tag, job)	do {} while(0)
			
 
				+#define _STARPU_TRACE_TAG_DEPS(a, b)	do {} while(0)
			
 
				+#define _STARPU_TRACE_TASK_DEPS(a, b)		do {} while(0)
			
 
				+#define _STARPU_TRACE_GHOST_TASK_DEPS(a, b)	do {} while(0)
			
 
				+#define _STARPU_TRACE_TASK_DONE(a)		do {} while(0)
			
 
				+#define _STARPU_TRACE_TAG_DONE(a)		do {} while(0)
			
 
				+#define _STARPU_TRACE_DATA_COPY(a, b, c)		do {} while(0)
			
 
				+#define _STARPU_TRACE_START_DRIVER_COPY(a,b,c,d)	do {} while(0)
			
 
				+#define _STARPU_TRACE_END_DRIVER_COPY(a,b,c,d)	do {} while(0)
			
 
				+#define _STARPU_TRACE_START_DRIVER_COPY_ASYNC(a,b)	do {} while(0)
			
 
				+#define _STARPU_TRACE_END_DRIVER_COPY_ASYNC(a,b)	do {} while(0)
			
 
				+#define _STARPU_TRACE_WORK_STEALING(a, b)	do {} while(0)
			
 
				+#define _STARPU_TRACE_WORKER_DEINIT_START	do {} while(0)
			
 
				+#define _STARPU_TRACE_WORKER_DEINIT_END(a)	do {} while(0)
			
 
				+#define _STARPU_TRACE_WORKER_SLEEP_START		do {} while(0)
			
 
				+#define _STARPU_TRACE_WORKER_SLEEP_END		do {} while(0)
			
 
				+#define _STARPU_TRACE_USER_DEFINED_START		do {} while(0)
			
 
				+#define _STARPU_TRACE_USER_DEFINED_END		do {} while(0)
			
 
				+#define _STARPU_TRACE_START_ALLOC(memnode)	do {} while(0)
			
 
				+#define _STARPU_TRACE_END_ALLOC(memnode)		do {} while(0)
			
 
				+#define _STARPU_TRACE_START_ALLOC_REUSE(a)	do {} while(0)
			
 
				+#define _STARPU_TRACE_END_ALLOC_REUSE(a)		do {} while(0)
			
 
				+#define _STARPU_TRACE_START_MEMRECLAIM(memnode)	do {} while(0)
			
 
				+#define _STARPU_TRACE_END_MEMRECLAIM(memnode)	do {} while(0)
			
 
				+#define _STARPU_TRACE_START_PROGRESS(memnode)	do {} while(0)
			
 
				+#define _STARPU_TRACE_END_PROGRESS(memnode)	do {} while(0)
			
 
				+#define _STARPU_TRACE_USER_EVENT(code)		do {} while(0)
			
 
				+#define _STARPU_TRACE_SET_PROFILING(status)	do {} while(0)
			
 
				+#define _STARPU_TRACE_TASK_WAIT_FOR_ALL		do {} while(0)
			
 
				 
			
 
				 #endif // STARPU_USE_FXT
			
 
				 
			
--- a/src/core/dependencies/tags.c
+++ b/src/core/dependencies/tags.c
@@ -141,10 +141,7 @@ void starpu_tag_remove(starpu_tag_t id)
 
				 
			
 
				 #ifdef HAVE_AYUDAME_H
			
 
				 	if (AYU_event)
			
 
				-	{
			
 
				-		int id = -1;
			
 
				 		AYU_event(AYU_REMOVETASK, id + AYUDAME_OFFSET, NULL);
			
 
				-	}
			
 
				 #endif
			
 
				 
			
 
				 	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
			
@@ -154,7 +151,11 @@ void starpu_tag_remove(starpu_tag_t id)
 
				 
			
 
				 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
			
 
				 
			
 
				-	if (entry)_starpu_tag_free(entry->tag);
			
 
				+	if (entry)
			
 
				+	{
			
 
				+		_starpu_tag_free(entry->tag);
			
 
				+		free(entry);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 void _starpu_tag_clear(void)
			
@@ -171,6 +172,7 @@ void _starpu_tag_clear(void)
 
				 	{
			
 
				 		HASH_DEL(tag_htbl, entry);
			
 
				 		_starpu_tag_free(entry->tag);
			
 
				+		free(entry);
			
 
				 	}
			
 
				 
			
 
				 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
			
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -219,7 +219,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
				 	{
			
 
				 		_starpu_sched_post_exec_hook(task);
			
 
				 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				-		starpu_sched_ctx_call_poped_task_cb(workerid, task->sched_ctx, task->flops, data_size);
			
 
				+		_starpu_sched_ctx_call_poped_task_cb(workerid, task, data_size, j->footprint);
			
 
				 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				 	}
			
 
				 
			
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
@@ -1037,12 +1037,12 @@ void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id)
 
				 
			
 
				 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				 
			
 
				-void starpu_sched_ctx_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops, size_t data_size)
			
 
				+void _starpu_sched_ctx_call_poped_task_cb(int workerid, struct starpu_task *task, size_t data_size, uint32_t footprint)
			
 
				 {
			
 
				-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				-	if(sched_ctx != NULL && sched_ctx_id != 0 && sched_ctx_id != STARPU_NMAX_SCHED_CTXS
			
 
				+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
			
 
				+	if(sched_ctx != NULL && task->sched_ctx != 0 && task->sched_ctx != STARPU_NMAX_SCHED_CTXS
			
 
				 	   && sched_ctx->perf_counters != NULL)
			
 
				-		sched_ctx->perf_counters->notify_poped_task(sched_ctx_id, workerid, flops, data_size);
			
 
				+		sched_ctx->perf_counters->notify_poped_task(task->sched_ctx, workerid, task, data_size, footprint);
			
 
				 }
			
 
				 
			
 
				 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
			
--- a/src/core/sched_ctx.h
+++ b/src/core/sched_ctx.h
@@ -30,7 +30,7 @@
 
				 
			
 
				 
			
 
				 /* used when changes (delete, modify) are applyed to contexts */
			
 
				-extern _starpu_pthread_mutex_t changing_ctx_mutex[];
			
 
				+//extern _starpu_pthread_mutex_t _changing_ctx_mutex[];
			
 
				 
			
 
				 struct _starpu_sched_ctx
			
 
				 {
			
@@ -141,6 +141,11 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 
				 /* Check if the worker belongs to another sched_ctx */
			
 
				 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id);
			
 
				 
			
 
				+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				+/* Notifies the hypervisor that a tasks was poped from the workers' list */
			
 
				+void _starpu_sched_ctx_call_poped_task_cb(int workerid, struct starpu_task *task, size_t data_size, uint32_t footprint);
			
 
				+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				+
			
 
				 #if defined(_MSC_VER) || defined(STARPU_SIMGRID)
			
 
				 _starpu_pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
			
 
				 #endif
			
--- a/src/datawizard/malloc.c
+++ b/src/datawizard/malloc.c
@@ -23,13 +23,13 @@
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				 #include <datawizard/memory_manager.h>
			
 
				 
			
 
				-static size_t malloc_align = sizeof(void*);
			
 
				+static size_t _malloc_align = sizeof(void*);
			
 
				 
			
 
				 void starpu_malloc_set_align(size_t align)
			
 
				 {
			
 
				 	STARPU_ASSERT_MSG(!(align & (align - 1)), "Alignment given to starpu_malloc_set_align must be a power of two");
			
 
				-	if (malloc_align < align)
			
 
				-		malloc_align = align;
			
 
				+	if (_malloc_align < align)
			
 
				+		_malloc_align = align;
			
 
				 }
			
 
				 
			
 
				 #if (defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER))// || defined(STARPU_USE_OPENCL)
			
@@ -81,98 +81,124 @@ static struct starpu_codelet malloc_pinned_cl =
 
				 };
			
 
				 #endif
			
 
				 
			
 
				-int starpu_malloc(void **A, size_t dim)
			
 
				+int starpu_malloc_flags(void **A, size_t dim, int flags)
			
 
				 {
			
 
				 	int ret=0;
			
 
				 
			
 
				-	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
			
 
				-		return -EDEADLK;
			
 
				-
			
 
				 	STARPU_ASSERT(A);
			
 
				 
			
 
				+	if (flags & STARPU_MALLOC_COUNT)
			
 
				+	{
			
 
				+		if (_starpu_memory_manager_can_allocate_size(dim, 0) == 0)
			
 
				+		{
			
 
				+			size_t freed;
			
 
				+			size_t reclaim = 2 * dim;
			
 
				+			_STARPU_DEBUG("There is not enough memory left, we are going to reclaim %ld\n", reclaim);
			
 
				+			_STARPU_TRACE_START_MEMRECLAIM(0);
			
 
				+			freed = _starpu_memory_reclaim_generic(0, 0, reclaim);
			
 
				+			_STARPU_TRACE_END_MEMRECLAIM(0);
			
 
				+			if (freed < dim)
			
 
				+			{
			
 
				+				// We could not reclaim enough memory
			
 
				+				*A = NULL;
			
 
				+				return -ENOMEM;
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 #ifndef STARPU_SIMGRID
			
 
				-	if (_starpu_can_submit_cuda_task())
			
 
				+	if (flags & STARPU_MALLOC_PINNED)
			
 
				 	{
			
 
				+		if (_starpu_can_submit_cuda_task())
			
 
				+		{
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 #ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				-		cudaError_t cures;
			
 
				-		cures = cudaHostAlloc(A, dim, cudaHostAllocPortable);
			
 
				-		if (STARPU_UNLIKELY(cures))
			
 
				-			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+			cudaError_t cures;
			
 
				+			cures = cudaHostAlloc(A, dim, cudaHostAllocPortable);
			
 
				+			if (STARPU_UNLIKELY(cures))
			
 
				+				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+			goto end;
			
 
				 #else
			
 
				-		int push_res;
			
 
				+			int push_res;
			
 
				 
			
 
				-		struct malloc_pinned_codelet_struct s =
			
 
				-		{
			
 
				-			.ptr = A,
			
 
				-			.dim = dim
			
 
				-		};
			
 
				+			if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
			
 
				+				return -EDEADLK;
			
 
				+
			
 
				+			struct malloc_pinned_codelet_struct s =
			
 
				+			{
			
 
				+				.ptr = A,
			
 
				+				.dim = dim
			
 
				+			};
			
 
				 
			
 
				-                malloc_pinned_cl.where = STARPU_CUDA;
			
 
				-		struct starpu_task *task = starpu_task_create();
			
 
				-		task->callback_func = NULL;
			
 
				-		task->cl = &malloc_pinned_cl;
			
 
				-		task->cl_arg = &s;
			
 
				+			malloc_pinned_cl.where = STARPU_CUDA;
			
 
				+			struct starpu_task *task = starpu_task_create();
			
 
				+			task->callback_func = NULL;
			
 
				+			task->cl = &malloc_pinned_cl;
			
 
				+			task->cl_arg = &s;
			
 
				 
			
 
				-		task->synchronous = 1;
			
 
				+			task->synchronous = 1;
			
 
				 
			
 
				-		_starpu_exclude_task_from_dag(task);
			
 
				+			_starpu_exclude_task_from_dag(task);
			
 
				 
			
 
				-		push_res = _starpu_task_submit_internally(task);
			
 
				-		STARPU_ASSERT(push_res != -ENODEV);
			
 
				+			push_res = _starpu_task_submit_internally(task);
			
 
				+			STARPU_ASSERT(push_res != -ENODEV);
			
 
				+			goto end;
			
 
				 #endif /* HAVE_CUDA_MEMCPY_PEER */
			
 
				 #endif /* STARPU_USE_CUDA */
			
 
				-	}
			
 
				-//	else if (_starpu_can_submit_opencl_task())
			
 
				-//	{
			
 
				+		}
			
 
				+//		else if (_starpu_can_submit_opencl_task())
			
 
				+//		{
			
 
				 //#ifdef STARPU_USE_OPENCL
			
 
				-//		int push_res;
			
 
				+//			int push_res;
			
 
				 //
			
 
				-//		struct malloc_pinned_codelet_struct s =
			
 
				-//		{
			
 
				-//			.ptr = A,
			
 
				-//			.dim = dim
			
 
				-//		};
			
 
				+//			if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
			
 
				+//				return -EDEADLK;
			
 
				 //
			
 
				-//                malloc_pinned_cl.where = STARPU_OPENCL;
			
 
				-//		struct starpu_task *task = starpu_task_create();
			
 
				+//			struct malloc_pinned_codelet_struct s =
			
 
				+//				{
			
 
				+//					.ptr = A,
			
 
				+//					.dim = dim
			
 
				+//				};
			
 
				+//
			
 
				+//			malloc_pinned_cl.where = STARPU_OPENCL;
			
 
				+//			struct starpu_task *task = starpu_task_create();
			
 
				 //			task->callback_func = NULL;
			
 
				 //			task->cl = &malloc_pinned_cl;
			
 
				 //			task->cl_arg = &s;
			
 
				+//			task->synchronous = 1;
			
 
				 //
			
 
				-//		task->synchronous = 1;
			
 
				-//
			
 
				-//		_starpu_exclude_task_from_dag(task);
			
 
				+//			_starpu_exclude_task_from_dag(task);
			
 
				 //
			
 
				-//		push_res = _starpu_task_submit_internally(task);
			
 
				-//		STARPU_ASSERT(push_res != -ENODEV);
			
 
				+//			push_res = _starpu_task_submit_internally(task);
			
 
				+//			STARPU_ASSERT(push_res != -ENODEV);
			
 
				+//			goto end;
			
 
				 //#endif /* STARPU_USE_OPENCL */
			
 
				-//        }
			
 
				-        else
			
 
				+//		}
			
 
				+	}
			
 
				 #endif /* STARPU_SIMGRID */
			
 
				-	{
			
 
				+
			
 
				 #ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				-		if (malloc_align != sizeof(void*))
			
 
				+	if (_malloc_align != sizeof(void*))
			
 
				+	{
			
 
				+		if (posix_memalign(A, _malloc_align, dim))
			
 
				 		{
			
 
				-			if (posix_memalign(A, malloc_align, dim))
			
 
				-			{
			
 
				-				ret = -ENOMEM;
			
 
				-				*A = NULL;
			
 
				-			}
			
 
				+			ret = -ENOMEM;
			
 
				+			*A = NULL;
			
 
				 		}
			
 
				-		else
			
 
				+	}
			
 
				+	else
			
 
				 #elif defined(STARPU_HAVE_MEMALIGN)
			
 
				-		if (malloc_align != sizeof(void*))
			
 
				+		if (_malloc_align != sizeof(void*))
			
 
				 		{
			
 
				-			*A = memalign(malloc_align, dim);
			
 
				+			*A = memalign(_malloc_align, dim);
			
 
				 		}
			
 
				 		else
			
 
				 #endif /* STARPU_HAVE_POSIX_MEMALIGN */
			
 
				 		{
			
 
				 			*A = malloc(dim);
			
 
				 		}
			
 
				-	}
			
 
				 
			
 
				+end:
			
 
				 	if (ret == 0)
			
 
				 	{
			
 
				 		STARPU_ASSERT(*A);
			
@@ -181,6 +207,11 @@ int starpu_malloc(void **A, size_t dim)
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+int starpu_malloc(void **A, size_t dim)
			
 
				+{
			
 
				+	return starpu_malloc_flags(A, dim, STARPU_MALLOC_PINNED);
			
 
				+}
			
 
				+
			
 
				 #if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
			
 
				 static void free_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
			
 
				 {
			
@@ -218,100 +249,87 @@ static struct starpu_codelet free_pinned_cl =
 
				 };
			
 
				 #endif
			
 
				 
			
 
				-int starpu_free(void *A)
			
 
				+int starpu_free_flags(void *A, size_t dim, int flags)
			
 
				 {
			
 
				-	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
			
 
				-		return -EDEADLK;
			
 
				+	if (flags & STARPU_MALLOC_COUNT)
			
 
				+	{
			
 
				+		_starpu_memory_manager_deallocate_size(dim, 0);
			
 
				+	}
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	if (_starpu_can_submit_cuda_task())
			
 
				+	if (flags & STARPU_MALLOC_PINNED)
			
 
				 	{
			
 
				+		if (_starpu_can_submit_cuda_task())
			
 
				+		{
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				 #ifndef HAVE_CUDA_MEMCPY_PEER
			
 
				-	if (!_starpu_is_initialized())
			
 
				-	{
			
 
				+			if (!_starpu_is_initialized())
			
 
				+			{
			
 
				 #endif
			
 
				-		/* This is especially useful when starpu_free is called from
			
 
				- 		 * the GCC-plugin. starpu_shutdown will probably have already
			
 
				-		 * been called, so we will not be able to submit a task. */
			
 
				-		cudaError_t err = cudaFreeHost(A);
			
 
				-		if (STARPU_UNLIKELY(err))
			
 
				-			STARPU_CUDA_REPORT_ERROR(err);
			
 
				+				/* This is especially useful when starpu_free is called from
			
 
				+				 * the GCC-plugin. starpu_shutdown will probably have already
			
 
				+				 * been called, so we will not be able to submit a task. */
			
 
				+				cudaError_t err = cudaFreeHost(A);
			
 
				+				if (STARPU_UNLIKELY(err))
			
 
				+					STARPU_CUDA_REPORT_ERROR(err);
			
 
				+				return 0;
			
 
				 #ifndef HAVE_CUDA_MEMCPY_PEER
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		int push_res;
			
 
				+			}
			
 
				+			else
			
 
				+			{
			
 
				+				int push_res;
			
 
				 
			
 
				-                free_pinned_cl.where = STARPU_CUDA;
			
 
				-		struct starpu_task *task = starpu_task_create();
			
 
				-		task->callback_func = NULL;
			
 
				-		task->cl = &free_pinned_cl;
			
 
				-		task->cl_arg = A;
			
 
				+				if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
			
 
				+					return -EDEADLK;
			
 
				 
			
 
				-		task->synchronous = 1;
			
 
				+				free_pinned_cl.where = STARPU_CUDA;
			
 
				+				struct starpu_task *task = starpu_task_create();
			
 
				+				task->callback_func = NULL;
			
 
				+				task->cl = &free_pinned_cl;
			
 
				+				task->cl_arg = A;
			
 
				+				task->synchronous = 1;
			
 
				 
			
 
				-		_starpu_exclude_task_from_dag(task);
			
 
				+				_starpu_exclude_task_from_dag(task);
			
 
				 
			
 
				-		push_res = _starpu_task_submit_internally(task);
			
 
				-		STARPU_ASSERT(push_res != -ENODEV);
			
 
				-	}
			
 
				-#endif
			
 
				+				push_res = _starpu_task_submit_internally(task);
			
 
				+				STARPU_ASSERT(push_res != -ENODEV);
			
 
				+				return 0;
			
 
				+			}
			
 
				+#endif /* HAVE_CUDA_MEMCPY_PEER */
			
 
				+#endif /* STARPU_USE_CUDA */
			
 
				+		}
			
 
				 //	else if (_starpu_can_submit_opencl_task())
			
 
				 //	{
			
 
				 //#ifdef STARPU_USE_OPENCL
			
 
				 //		int push_res;
			
 
				 //
			
 
				+//		if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
			
 
				+//			return -EDEADLK;
			
 
				+//
			
 
				 //                free_pinned_cl.where = STARPU_OPENCL;
			
 
				 //		struct starpu_task *task = starpu_task_create();
			
 
				-//			task->callback_func = NULL;
			
 
				-//			task->cl = &free_pinned_cl;
			
 
				-//			task->cl_arg = A;
			
 
				-//
			
 
				+//		task->callback_func = NULL;
			
 
				+//		task->cl = &free_pinned_cl;
			
 
				+//		task->cl_arg = A;
			
 
				 //		task->synchronous = 1;
			
 
				 //
			
 
				 //		_starpu_exclude_task_from_dag(task);
			
 
				 //
			
 
				 //		push_res = starpu_task_submit(task);
			
 
				 //		STARPU_ASSERT(push_res != -ENODEV);
			
 
				-//#endif
			
 
				+//		return 0;
			
 
				 //	}
			
 
				-	} else
			
 
				-#endif
			
 
				-#endif
			
 
				-	{
			
 
				-		free(A);
			
 
				+//#endif
			
 
				 	}
			
 
				+#endif /* STARPU_SIMGRID */
			
 
				 
			
 
				+	free(A);
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-
			
 
				-int starpu_malloc_count(void **A, size_t dim)
			
 
				-{
			
 
				-	if (_starpu_memory_manager_can_allocate_size(dim, 0) == 0)
			
 
				-	{
			
 
				-		size_t freed;
			
 
				-		size_t reclaim = 2 * dim;
			
 
				-		_STARPU_DEBUG("There is not enough memory left, we are going to reclaim %ld\n", reclaim);
			
 
				-		_STARPU_TRACE_START_MEMRECLAIM(0);
			
 
				-		freed = _starpu_memory_reclaim_generic(0, 0, reclaim);
			
 
				-		_STARPU_TRACE_END_MEMRECLAIM(0);
			
 
				-		if (freed < dim)
			
 
				-		{
			
 
				-			// We could not reclaim enough memory
			
 
				-			*A = NULL;
			
 
				-			return -ENOMEM;
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	return starpu_malloc(A, dim);
			
 
				-}
			
 
				-
			
 
				-int starpu_free_count(void *A, size_t dim)
			
 
				+int starpu_free(void *A)
			
 
				 {
			
 
				-	_starpu_memory_manager_deallocate_size(dim, 0);
			
 
				-	starpu_free(A);
			
 
				+	return starpu_free_flags(A, 0, STARPU_MALLOC_PINNED);
			
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_SIMGRID
			
@@ -404,12 +422,8 @@ starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 
				 	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
			
 
				 	switch(kind)
			
 
				 	{
			
 
				-#ifdef STARPU_DEVEL
			
 
				-#warning TODO we need to call starpu_free
			
 
				-#endif
			
 
				 		case STARPU_CPU_RAM:
			
 
				 			free((void*)addr);
			
 
				-			_starpu_memory_manager_deallocate_size(size, dst_node);
			
 
				 			break;
			
 
				 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 		case STARPU_CUDA_RAM:
			
@@ -424,7 +438,6 @@ starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 
				 			err = cudaFree((void*)addr);
			
 
				 			if (STARPU_UNLIKELY(err != cudaSuccess))
			
 
				 				STARPU_CUDA_REPORT_ERROR(err);
			
 
				-			_starpu_memory_manager_deallocate_size(size, dst_node);
			
 
				 #endif
			
 
				 			break;
			
 
				 		}
			
@@ -442,7 +455,6 @@ starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 
				                         err = clReleaseMemObject((void*)addr);
			
 
				 			if (STARPU_UNLIKELY(err != CL_SUCCESS))
			
 
				 				STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-			_starpu_memory_manager_deallocate_size(size, dst_node);
			
 
				 #endif
			
 
				                         break;
			
 
				 		}
			
@@ -450,5 +462,7 @@ starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 
				 		default:
			
 
				 			STARPU_ABORT();
			
 
				 	}
			
 
				+	_starpu_memory_manager_deallocate_size(size, dst_node);
			
 
				+
			
 
				 }
			
 
				 
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -191,8 +191,8 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 
				 
			
 
				 static size_t _starpu_cpu_get_global_mem_size(int devid, struct _starpu_machine_config *config)
			
 
				 {
			
 
				-	ssize_t global_mem;
			
 
				-	ssize_t limit;
			
 
				+	size_t global_mem;
			
 
				+	int limit;
			
 
				 
			
 
				 	limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
			
 
				 #ifdef STARPU_DEVEL
			
@@ -350,7 +350,7 @@ int _starpu_cpu_driver_run_once(struct starpu_driver *d STARPU_ATTRIBUTE_UNUSED)
 
				 
			
 
				 int _starpu_cpu_driver_deinit(struct starpu_driver *d STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				-	_STARPU_TRACE_WORKER_DEINIT_START
			
 
				+	_STARPU_TRACE_WORKER_DEINIT_START;
			
 
				 
			
 
				 	struct _starpu_worker *cpu_worker;
			
 
				 	cpu_worker = _starpu_get_local_worker_key();
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -73,7 +73,7 @@ _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
 
				  */
			
 
				 static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
			
 
				 {
			
 
				-	ssize_t limit;
			
 
				+	int limit;
			
 
				 	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
			
 
				 	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
			
 
				 	char name[30];
			
@@ -101,8 +101,8 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 
				 	props[devid].totalGlobalMem -= to_waste;
			
 
				 #endif /* STARPU_USE_CUDA */
			
 
				 
			
 
				-	_STARPU_DEBUG("CUDA device %u: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
			
 
				-			devid, (long) to_waste/(1024*1024), (long) limit, (long) totalGlobalMem/(1024*1024),
			
 
				+	_STARPU_DEBUG("CUDA device %u: Wasting %ld MB / Limit %d MB / Total %ld MB / Remains %ld MB\n",
			
 
				+			devid, (long) to_waste/(1024*1024), limit, (long) totalGlobalMem/(1024*1024),
			
 
				 			(long) (totalGlobalMem - to_waste)/(1024*1024));
			
 
				 }
			
 
				 
			
@@ -423,7 +423,7 @@ int _starpu_cuda_driver_init(struct starpu_driver *d)
 
				 	snprintf(args->short_name, sizeof(args->short_name), "CUDA %u", devid);
			
 
				 	_STARPU_DEBUG("cuda (%s) dev id %u thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
			
 
				 
			
 
				-	_STARPU_TRACE_WORKER_INIT_END
			
 
				+	_STARPU_TRACE_WORKER_INIT_END;
			
 
				 
			
 
				 	/* tell the main thread that this one is ready */
			
 
				 	_STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
			
@@ -496,7 +496,7 @@ int _starpu_cuda_driver_deinit(struct starpu_driver *d)
 
				 	STARPU_ASSERT(args);
			
 
				 	unsigned memnode = args->memory_node;
			
 
				 
			
 
				-	_STARPU_TRACE_WORKER_DEINIT_START
			
 
				+	_STARPU_TRACE_WORKER_DEINIT_START;
			
 
				 
			
 
				 	_starpu_handle_all_pending_node_data_requests(memnode);
			
 
				 
			
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -167,7 +167,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
				 
			
 
				 		if (_starpu_worker_get_status(workerid) != STATUS_SLEEPING)
			
 
				 		{
			
 
				-			_STARPU_TRACE_WORKER_SLEEP_START
			
 
				+			_STARPU_TRACE_WORKER_SLEEP_START;
			
 
				 			_starpu_worker_restart_sleeping(workerid);
			
 
				 			_starpu_worker_set_status(workerid, STATUS_SLEEPING);
			
 
				 		}
			
@@ -199,7 +199,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
				 
			
 
				 	if (_starpu_worker_get_status(workerid) == STATUS_SLEEPING)
			
 
				 	{
			
 
				-		_STARPU_TRACE_WORKER_SLEEP_END
			
 
				+		_STARPU_TRACE_WORKER_SLEEP_END;
			
 
				 		_starpu_worker_stop_sleeping(workerid);
			
 
				 		_starpu_worker_set_status(workerid, STATUS_UNKNOWN);
			
 
				 	}
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -61,7 +61,7 @@ _starpu_opencl_discover_devices(struct _starpu_machine_config *config)
 
				 
			
 
				 static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
			
 
				 {
			
 
				-	ssize_t limit;
			
 
				+	int limit;
			
 
				 	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
			
 
				 	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
			
 
				 	char name[30];
			
@@ -90,8 +90,8 @@ static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
 
				 	to_waste = totalGlobalMem - global_mem[devid];
			
 
				 #endif
			
 
				 
			
 
				-	_STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
			
 
				-                      devid, (size_t)to_waste/(1024*1024), (size_t)limit, (size_t)totalGlobalMem/(1024*1024),
			
 
				+	_STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %d MB / Total %ld MB / Remains %ld MB\n",
			
 
				+                      devid, (size_t)to_waste/(1024*1024), limit, (size_t)totalGlobalMem/(1024*1024),
			
 
				                       (size_t)(totalGlobalMem - to_waste)/(1024*1024));
			
 
				 
			
 
				 }
			
@@ -621,7 +621,7 @@ int _starpu_opencl_driver_init(struct starpu_driver *d)
 
				 
			
 
				 	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
			
 
				 
			
 
				-	_STARPU_TRACE_WORKER_INIT_END
			
 
				+	_STARPU_TRACE_WORKER_INIT_END;
			
 
				 
			
 
				 	/* tell the main thread that this one is ready */
			
 
				 	_STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
			
@@ -692,7 +692,7 @@ int _starpu_opencl_driver_run_once(struct starpu_driver *d)
 
				 
			
 
				 int _starpu_opencl_driver_deinit(struct starpu_driver *d)
			
 
				 {
			
 
				-	_STARPU_TRACE_WORKER_DEINIT_START
			
 
				+	_STARPU_TRACE_WORKER_DEINIT_START;
			
 
				 
			
 
				 	struct _starpu_worker* args;
			
 
				 	args = _starpu_opencl_get_worker_from_driver(d);
			
--- a/starpufft/examples/testx.c
+++ b/starpufft/examples/testx.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -48,8 +48,8 @@ static void check_fftw(STARPUFFT(complex) *out, STARPUFFT(complex) *out_fftw, in
 
				 	{
			
 
				 		double diff = cabs(out[i]-out_fftw[i]);
			
 
				 		double diff2 = diff * diff;
			
 
				-		double size = cabs(out_fftw[i]);
			
 
				-		double size2 = size * size;
			
 
				+		double dsize = cabs(out_fftw[i]);
			
 
				+		double size2 = dsize * dsize;
			
 
				 		if (diff > max)
			
 
				 			max = diff;
			
 
				 		tot += diff;
			
--- a/tests/datawizard/allocate.c
+++ b/tests/datawizard/allocate.c
@@ -50,35 +50,35 @@ int main(int argc, char **argv)
 
				 		starpu_shutdown();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				-	STARPU_CHECK_RETURN_VALUE_IS((int)global_size, 1*1024*1024, "get_global_memory_size");
			
 
				+	STARPU_CHECK_RETURN_VALUE_IS((int)global_size, 1*1024*1024, "_starpu_memory_manager_get_global_memory_size");
			
 
				 	FPRINTF(stderr, "Available memory size on node 0: %ld\n", global_size);
			
 
				 
			
 
				-	ret = starpu_malloc_count((void **)&buffer, 1);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc_count");
			
 
				+	ret = starpu_malloc_flags((void **)&buffer, 1, STARPU_MALLOC_COUNT);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc_flags");
			
 
				 	FPRINTF(stderr, "Allocation succesfull for 1 b\n");
			
 
				 
			
 
				-	ret = starpu_malloc_count((void **)&buffer2, 1*1024*512);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc_count");
			
 
				+	ret = starpu_malloc_flags((void **)&buffer2, 1*1024*512, STARPU_MALLOC_COUNT);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc_flags");
			
 
				 	FPRINTF(stderr, "Allocation succesfull for %d b\n", 1*1024*512);
			
 
				 
			
 
				-	ret = starpu_malloc_count((void **)&buffer3, 1*1024*512);
			
 
				-	STARPU_CHECK_RETURN_VALUE_IS(ret, -ENOMEM, "starpu_malloc_count");
			
 
				+	ret = starpu_malloc_flags((void **)&buffer3, 1*1024*512, STARPU_MALLOC_COUNT);
			
 
				+	STARPU_CHECK_RETURN_VALUE_IS(ret, -ENOMEM, "starpu_malloc_flags");
			
 
				 	FPRINTF(stderr, "Allocation failed for %d b\n", 1*1024*512);
			
 
				 
			
 
				-	ret = starpu_malloc((void **)&buffer3, 1*1024*512);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
			
 
				+	ret = starpu_malloc_flags((void **)&buffer3, 1*1024*512, 0);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc_flags");
			
 
				 	FPRINTF(stderr, "Allocation successful for %d b\n", 1*1024*512);
			
 
				-	starpu_free(buffer3);
			
 
				+	starpu_free_flags(buffer3, 1*1024*512, 0);
			
 
				 
			
 
				-	starpu_free_count(buffer2, 1*1024*512);
			
 
				+	starpu_free_flags(buffer2, 1*1024*512, STARPU_MALLOC_COUNT);
			
 
				 	FPRINTF(stderr, "Freeing %d b\n", 1*1024*512);
			
 
				 
			
 
				-	ret = starpu_malloc_count((void **)&buffer3, 1*1024*512);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc_count");
			
 
				+	ret = starpu_malloc_flags((void **)&buffer3, 1*1024*512, STARPU_MALLOC_COUNT);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc_flags");
			
 
				 	FPRINTF(stderr, "Allocation succesfull for %d b\n", 1*1024*512);
			
 
				 
			
 
				-	starpu_free_count(buffer3, 1*1024*512);
			
 
				-	starpu_free_count(buffer, 1);
			
 
				+	starpu_free_flags(buffer3, 1*1024*512, STARPU_MALLOC_COUNT);
			
 
				+	starpu_free_flags(buffer, 1, STARPU_MALLOC_COUNT);
			
 
				 
			
 
				 	starpu_shutdown();
			
 
				 	return 0;
			
--- a/tests/datawizard/interfaces/block/block_interface.c
+++ b/tests/datawizard/interfaces/block/block_interface.c
@@ -34,8 +34,8 @@ extern void test_block_opencl_func(void *buffers[], void *args);
 
				 #endif
			
 
				 
			
 
				 
			
 
				-static starpu_data_handle_t block_handle;
			
 
				-static starpu_data_handle_t block2_handle;
			
 
				+static starpu_data_handle_t _block_handle;
			
 
				+static starpu_data_handle_t _block2_handle;
			
 
				 
			
 
				 struct test_config block_config =
			
 
				 {
			
@@ -46,14 +46,14 @@ struct test_config block_config =
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	.opencl_func   = test_block_opencl_func,
			
 
				 #endif
			
 
				-	.handle        = &block_handle,
			
 
				-	.dummy_handle  = &block2_handle,
			
 
				+	.handle        = &_block_handle,
			
 
				+	.dummy_handle  = &_block2_handle,
			
 
				 	.copy_failed   = SUCCESS,
			
 
				 	.name          = "block_interface"
			
 
				 };
			
 
				 
			
 
				-static int block[NX*NY*NZ];
			
 
				-static int block2[NX*NY*NZ];
			
 
				+static int _block[NX*NY*NZ];
			
 
				+static int _block2[NX*NY*NZ];
			
 
				 
			
 
				 static void
			
 
				 register_data(void)
			
@@ -64,34 +64,34 @@ register_data(void)
 
				 	for (k = 0; k < NZ; k++)
			
 
				 		for (j = 0; j < NY; j++)
			
 
				 			for (i = 0; i < NX; i++)
			
 
				-                                block[(k*NX*NY)+(j*NX)+i] = val++;
			
 
				+                                _block[(k*NX*NY)+(j*NX)+i] = val++;
			
 
				 
			
 
				 	/* Registering data */
			
 
				-	starpu_block_data_register(&block_handle,
			
 
				+	starpu_block_data_register(&_block_handle,
			
 
				                                     0,
			
 
				-                                    (uintptr_t)block,
			
 
				+                                    (uintptr_t)_block,
			
 
				 				    NX,
			
 
				 				    NX * NY,
			
 
				 				    NX,
			
 
				 				    NY,
			
 
				 				    NZ,
			
 
				-				    sizeof(block[0]));
			
 
				-	starpu_block_data_register(&block2_handle,
			
 
				+				    sizeof(_block[0]));
			
 
				+	starpu_block_data_register(&_block2_handle,
			
 
				                                     0,
			
 
				-                                    (uintptr_t)block2,
			
 
				+                                    (uintptr_t)_block2,
			
 
				 				    NX,
			
 
				 				    NX * NY,
			
 
				 				    NX,
			
 
				 				    NY,
			
 
				 				    NZ,
			
 
				-				    sizeof(block2[0]));
			
 
				+				    sizeof(_block2[0]));
			
 
				 }
			
 
				 
			
 
				 static void
			
 
				 unregister_data(void)
			
 
				 {
			
 
				-	starpu_data_unregister(block_handle);
			
 
				-	starpu_data_unregister(block2_handle);
			
 
				+	starpu_data_unregister(_block_handle);
			
 
				+	starpu_data_unregister(_block2_handle);
			
 
				 }
			
 
				 
			
 
				 static void test_block_cpu_func(void *buffers[], void *args)
			
--- a/tests/microbenchs/tasks_overhead.c
+++ b/tests/microbenchs/tasks_overhead.c
@@ -130,6 +130,7 @@ int main(int argc, char **argv)
 
				 			tasks[i].handles[buffer] = data_handles[buffer];
			
 
				 		}
			
 
				 	}
			
 
				+	tasks[ntasks-1].detach = 0;
			
 
				 
			
 
				 	gettimeofday(&start_submit, NULL);
			
 
				 	for (i = 1; i < ntasks; i++)
			
@@ -150,14 +151,15 @@ int main(int argc, char **argv)
 
				 
			
 
				 	/* wait for the execution of the tasks */
			
 
				 	gettimeofday(&start_exec, NULL);
			
 
				-	ret = starpu_tag_wait((starpu_tag_t)(ntasks - 1));
			
 
				+	ret = starpu_task_wait(&tasks[ntasks-1]);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_tag_wait");
			
 
				 	gettimeofday(&end_exec, NULL);
			
 
				 
			
 
				+	for (i = 1; i < ntasks; i++)
			
 
				+		starpu_task_clean(&tasks[i]);
			
 
				+
			
 
				 	for (buffer = 0; buffer < nbuffers; buffer++)
			
 
				-	{
			
 
				 		starpu_data_unregister(data_handles[buffer]);
			
 
				-	}
			
 
				 
			
 
				 	timing_submit = (double)((end_submit.tv_sec - start_submit.tv_sec)*1000000 + (end_submit.tv_usec - start_submit.tv_usec));
			
 
				 	timing_exec = (double)((end_exec.tv_sec - start_exec.tv_sec)*1000000 + (end_exec.tv_usec - start_exec.tv_usec));
			
--- a/tests/microbenchs/tasks_size_overhead.c
+++ b/tests/microbenchs/tasks_size_overhead.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -190,6 +190,9 @@ int main(int argc, char **argv)
 
				 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
			
 
				 			gettimeofday(&end, NULL);
			
 
				 
			
 
				+			for (i = 0; i < ntasks; i++)
			
 
				+				starpu_task_clean(&tasks[i]);
			
 
				+
			
 
				 			timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				 
			
 
				 			FPRINTF(stdout, "%u\t%f\t", size, timing/1000000);
			
--- a/tools/valgrind/openmpi.suppr
+++ b/tools/valgrind/openmpi.suppr
@@ -140,3 +140,24 @@
 
				    fun:ompi_ddt_create_vector
			
 
				    fun:PMPI_Type_vector
			
 
				 }
			
 
				+
			
 
				+{
			
 
				+   suppr19
			
 
				+   Memcheck:Leak
			
 
				+   fun:malloc
			
 
				+   fun:ompi_ddt_create
			
 
				+   fun:ompi_ddt_create_vector
			
 
				+   fun:PMPI_Type_vector
			
 
				+}
			
 
				+
			
 
				+{
			
 
				+   suppr20
			
 
				+   Memcheck:Leak
			
 
				+   fun:malloc
			
 
				+   fun:ompi_free_list_grow
			
 
				+   obj:*
			
 
				+   obj:*
			
 
				+   obj:*
			
 
				+   fun:PMPI_Isend
			
 
				+}
			
 
				+
			
--- a/tools/valgrind/starpu.suppr
+++ b/tools/valgrind/starpu.suppr
@@ -90,3 +90,20 @@
 
				    fun:_starpu_load_bus_performance_files
			
 
				    ...
			
 
				 }
			
 
				+
			
 
				+{
			
 
				+   This is racy, but we don't care, if the function was called a bit earlier we would have had a different value
			
 
				+   Helgrind:Race
			
 
				+   fun:_starpu_fifo_empty
			
 
				+   fun:pop_task_eager_policy
			
 
				+   ...
			
 
				+}
			
 
				+
			
 
				+{
			
 
				+   This is the counterpart of the suppression above
			
 
				+   Helgrind:Race
			
 
				+   fun:_starpu_fifo_push_task
			
 
				+   fun:push_task_eager_policy
			
 
				+   ...
			
 
				+}
			
 
				+