13 years ago · 058743e72a
--- a/.gitignore
+++ b/.gitignore
@@ -248,3 +248,40 @@ starpu.log
 
				 /tools/starpu_perfmodel_plot.1
			
 
				 /starpu-1.0.pc
			
 
				 /gcc-plugin/examples/cholesky/cholesky
			
 
				+/gcc-plugin/tests/*.log
			
 
				+/test/*.log
			
 
				+/examples/*.log
			
 
				+/tests/main/declare_deps_after_submission
			
 
				+/tests/main/declare_deps_after_submission_synchronous
			
 
				+/tests/main/declare_deps_in_callback
			
 
				+/tests/main/deprecated
			
 
				+/tests/main/deprecated_buffer
			
 
				+/tests/main/deprecated_func
			
 
				+/tests/main/empty_task
			
 
				+/tests/main/empty_task_chain
			
 
				+/tests/main/empty_task_sync_point
			
 
				+/tests/main/empty_task_sync_point_tasks
			
 
				+/tests/main/execute_on_a_specific_worker
			
 
				+/tests/main/get_current_task
			
 
				+/tests/main/insert_task
			
 
				+/tests/main/multiformat_data_release
			
 
				+/tests/main/multiformat_handle_conversion
			
 
				+/tests/main/multithreaded
			
 
				+/tests/main/multithreaded_init
			
 
				+/tests/main/regenerate
			
 
				+/tests/main/restart
			
 
				+/tests/main/starpu_init
			
 
				+/tests/main/starpu_task_bundle
			
 
				+/tests/main/starpu_task_wait
			
 
				+/tests/main/starpu_task_wait_for_all
			
 
				+/tests/main/starpu_worker_exists
			
 
				+/tests/main/static_restartable
			
 
				+/tests/main/static_restartable_tag
			
 
				+/tests/main/static_restartable_using_initializer
			
 
				+/tests/main/subgraph_repeat
			
 
				+/tests/main/subgraph_repeat_regenerate
			
 
				+/tests/main/tag_wait_api
			
 
				+/tests/main/task_wait_api
			
 
				+/tests/main/wait_all_regenerable_tasks
			
 
				+/tools/starpu_workers_activity
			
 
				+/tests/datawizard/interfaces/copy_interfaces
			
--- a/doc/chapters/advanced-examples.texi
+++ b/doc/chapters/advanced-examples.texi
@@ -597,7 +597,8 @@ parallel CPU implementation of the computation to be achieved. This can also be
 
				 useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
			
 
				 work collectively on a single task, the completion time of tasks on CPUs become
			
 
				 comparable to the completion time on GPUs, thus relieving from granularity
			
 
				-discrepancy concerns.
			
 
				+discrepancy concerns. Hwloc support needs to be enabled to get good performance,
			
 
				+otherwise StarPU will not know how to better group cores.
			
 
				 
			
 
				 Two modes of execution exist to accomodate with existing usages.
			
 
				 
			
--- a/examples/basic_examples/block.c
+++ b/examples/basic_examples/block.c
@@ -60,6 +60,8 @@ int execute_on(uint32_t where, device_func func, float *block, int pnx, int pny,
 
				         if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				 	{
			
 
				                 FPRINTF(stderr, "No worker may execute this task\n");
			
 
				+		task->destroy = 0;
			
 
				+                starpu_task_destroy(task);
			
 
				                 return 1;
			
 
				 	}
			
 
				 
			
--- a/examples/basic_examples/vector_scal_c.c
+++ b/examples/basic_examples/vector_scal_c.c
@@ -53,13 +53,14 @@ static struct starpu_codelet cl =
 
				 	.model = &vector_scal_model
			
 
				 };
			
 
				 
			
 
				-void compute_(int *F_NX, float *vector)
			
 
				+int compute_(int *F_NX, float *vector)
			
 
				 {
			
 
				         int NX = *F_NX;
			
 
				 	int ret;
			
 
				 
			
 
				 	/* Initialize StarPU with default configuration */
			
 
				 	ret = starpu_init(NULL);
			
 
				+	if (ret == -ENODEV) return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				 	/* Tell StaPU to associate the "vector" vector with the "vector_handle"
			
@@ -98,7 +99,7 @@ void compute_(int *F_NX, float *vector)
 
				 
			
 
				 	/* execute the task on any eligible computational ressource */
			
 
				 	ret = starpu_task_submit(task);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 
			
 
				 	/* StarPU does not need to manipulate the array anymore so we can stop
			
 
				  	 * monitoring it */
			
@@ -106,4 +107,6 @@ void compute_(int *F_NX, float *vector)
 
				 
			
 
				 	/* terminate StarPU, no task can be submitted after */
			
 
				 	starpu_shutdown();
			
 
				+
			
 
				+	return ret;
			
 
				 }
			
--- a/examples/lu/lu_example.c
+++ b/examples/lu/lu_example.c
@@ -297,6 +297,11 @@ int main(int argc, char **argv)
 
				 
			
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				+#ifdef STARPU_SLOW_MACHINE
			
 
				+	size /= 4;
			
 
				+	nblocks /= 4;
			
 
				+#endif
			
 
				+
			
 
				 	ret = starpu_init(NULL);
			
 
				 	if (ret == -ENODEV)
			
 
				 		return 77;
			
--- a/examples/mult/xgemm.c
+++ b/examples/mult/xgemm.c
@@ -275,6 +275,10 @@ int main(int argc, char **argv)
 
				 
			
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				+#ifdef STARPU_SLOW_MACHINE
			
 
				+	niter /= 10;
			
 
				+#endif
			
 
				+
			
 
				 	ret = starpu_init(NULL);
			
 
				 	if (ret == -ENODEV)
			
 
				 		return 77;
			
--- a/examples/opt/pi/pi.c
+++ b/examples/opt/pi/pi.c
@@ -88,6 +88,7 @@ int main(int argc, char **argv)
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				+	if (ret == -ENODEV) return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				 	/* Initialize the random number generator */
			
--- a/examples/opt/pi/pi_redux.c
+++ b/examples/opt/pi/pi_redux.c
@@ -189,19 +189,6 @@ static void pi_func_cuda(void *descr[], void *cl_arg __attribute__ ((unused)))
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-/* The amount of work does not depend on the data size at all :) */
			
 
				-static size_t size_base(struct starpu_task *task, unsigned nimpl)
			
 
				-{
			
 
				-	return NSHOT_PER_TASK;
			
 
				-}
			
 
				-
			
 
				-static struct starpu_perfmodel model =
			
 
				-{
			
 
				-	.type = STARPU_HISTORY_BASED,
			
 
				-	.size_base = size_base,
			
 
				-	.symbol = "monte_carlo_pi_redux"
			
 
				-};
			
 
				-
			
 
				 static struct starpu_codelet pi_cl =
			
 
				 {
			
 
				 	.where =
			
@@ -319,6 +306,7 @@ int main(int argc, char **argv)
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				+	if (ret == -ENODEV) return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				 	/* Launch a Random Number Generator (RNG) on each worker */
			
--- a/gcc-plugin/src/starpu.c
+++ b/gcc-plugin/src/starpu.c
@@ -1735,6 +1735,12 @@ build_codelet_initializer (tree task_decl)
 
				     return init;
			
 
				   }
			
 
				 
			
 
				+  tree codelet_name ()
			
 
				+  {
			
 
				+    const char *name = IDENTIFIER_POINTER (DECL_NAME (task_decl));
			
 
				+    return build_string_literal (strlen (name) + 1, name);
			
 
				+  }
			
 
				+
			
 
				   tree where_init (tree impls)
			
 
				   {
			
 
				     tree impl;
			
@@ -1841,7 +1847,8 @@ build_codelet_initializer (tree task_decl)
 
				   impls = task_implementation_list (task_decl);
			
 
				 
			
 
				   inits =
			
 
				-    chain_trees (field_initializer ("where", where_init (impls)),
			
 
				+    chain_trees (field_initializer ("name", codelet_name ()),
			
 
				+		 field_initializer ("where", where_init (impls)),
			
 
				 		 field_initializer ("nbuffers", pointer_arg_count ()),
			
 
				 		 field_initializer ("modes", access_mode_array ()),
			
 
				 		 field_initializer ("cpu_funcs",
			
--- a/gcc-plugin/tests/Makefile.am
+++ b/gcc-plugin/tests/Makefile.am
@@ -74,4 +74,4 @@ check-hook:
 
				 endif !HAVE_GUILE
			
 
				 
			
 
				 showcheck:
			
 
				-	-cat $(TEST_LOGS) /dev/null
			
 
				+	-cat $(TEST_LOGS) /dev/null
			
--- a/gcc-plugin/tests/mocks.h
+++ b/gcc-plugin/tests/mocks.h
@@ -57,6 +57,7 @@ const struct insert_task_argument *expected_insert_task_arguments;
 
				 int
			
 
				 starpu_insert_task (struct starpu_codelet *cl, ...)
			
 
				 {
			
 
				+  assert (cl->name != NULL && strlen (cl->name) > 0);
			
 
				   assert (cl->where == (STARPU_CPU | STARPU_OPENCL));
			
 
				 
			
 
				   /* TODO: Call `cpu_func' & co. and check whether they do the right
			
--- a/src/core/dependencies/data_concurrency.c
+++ b/src/core/dependencies/data_concurrency.c
@@ -211,9 +211,9 @@ static unsigned _submit_job_enforce_data_deps(struct _starpu_job *j, unsigned st
 
				 			 * _starpu_compar_handles.  */
			
 
				 			continue;
			
 
				 
			
 
				+                j->task->status = STARPU_TASK_BLOCKED_ON_DATA;
			
 
				                 if (attempt_to_submit_data_request_from_job(j, buf))
			
 
				 		{
			
 
				-                        j->task->status = STARPU_TASK_BLOCKED_ON_DATA;
			
 
				 			return 1;
			
 
				                 }
			
 
				 	}
			
--- a/src/core/dependencies/tags.c
+++ b/src/core/dependencies/tags.c
@@ -106,6 +106,7 @@ static void _starpu_tag_free(void *_tag)
 
				 #endif
			
 
				 
			
 
				 		_starpu_spin_unlock(&tag->lock);
			
 
				+		_starpu_spin_destroy(&tag->lock);
			
 
				 
			
 
				 		free(tag);
			
 
				 	}
			
@@ -128,6 +129,10 @@ void _starpu_tag_clear(void)
 
				 {
			
 
				 	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
			
 
				 
			
 
				+	/* XXX: _starpu_tag_free takes the tag spinlocks while we are keeping
			
 
				+	 * the global rwlock. This contradicts the lock order of
			
 
				+	 * starpu_tag_wait_array. Should not be a problem in practice since
			
 
				+	 * _starpu_tag_clear is called at shutdown only. */
			
 
				 	_starpu_htbl_clear_tags(&tag_htbl, 0, _starpu_tag_free);
			
 
				 
			
 
				 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -498,8 +498,10 @@ static void benchmark_all_gpu_devices(void)
 
				 	hwloc_topology_load(hwtopology);
			
 
				 #endif
			
 
				 
			
 
				-	/* TODO: use hwloc */
			
 
				-#ifdef __linux__
			
 
				+#ifdef STARPU_HAVE_HWLOC
			
 
				+	hwloc_cpuset_t former_cpuset = hwloc_bitmap_alloc();
			
 
				+	hwloc_get_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
			
 
				+#elif __linux__
			
 
				 	/* Save the current cpu binding */
			
 
				 	cpu_set_t former_process_affinity;
			
 
				 	int ret;
			
@@ -545,8 +547,9 @@ static void benchmark_all_gpu_devices(void)
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				-	/* FIXME: use hwloc */
			
 
				-#ifdef __linux__
			
 
				+#ifdef STARPU_HAVE_HWLOC
			
 
				+	hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
			
 
				+#elif __linux__
			
 
				 	/* Restore the former affinity */
			
 
				 	ret = sched_setaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
			
 
				 	if (ret)
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -966,10 +966,11 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
				 	{
			
 
				 		uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
			
 
				 		struct starpu_per_arch_perfmodel *per_arch_model = &model->per_arch[arch][nimpl];
			
 
				-		struct starpu_htbl32_node *history = per_arch_model->history;
			
 
				+		struct starpu_htbl32_node *history;
			
 
				 		struct starpu_history_entry *entry;
			
 
				 
			
 
				 		_STARPU_PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
			
 
				+		history = per_arch_model->history;
			
 
				 		entry = (struct starpu_history_entry *) _starpu_htbl_search_32(history, key);
			
 
				 		_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
			
 
				 
			
@@ -997,11 +998,13 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 
				 
			
 
				 	per_arch_model = &model->per_arch[arch][nimpl];
			
 
				 
			
 
				+	_STARPU_PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
			
 
				 	history = per_arch_model->history;
			
 
				-	if (!history)
			
 
				+	if (!history) {
			
 
				+		_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
			
 
				 		return NAN;
			
 
				+	}
			
 
				 
			
 
				-	_STARPU_PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
			
 
				 	entry = (struct starpu_history_entry *) _starpu_htbl_search_32(history, key);
			
 
				 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
			
 
				 
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -290,7 +290,7 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config,
 
				 	_starpu_initialize_workers_bindid(config);
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	int ncuda = -1;
			
 
				+	int ncuda;
			
 
				 	ncuda = starpu_get_env_number("STARPU_NCUDA");
			
 
				 
			
 
				 	/* STARPU_NCUDA is not set. Did the user specify anything ? */
			
@@ -303,14 +303,30 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config,
 
				 		/* The user did not disable CUDA. We need to initialize CUDA
			
 
				  		 * early to count the number of devices */
			
 
				 		_starpu_init_cuda();
			
 
				+		int nb_devices = _starpu_get_cuda_device_count();
			
 
				 
			
 
				 		if (ncuda == -1)
			
 
				 		{
			
 
				 			/* Nothing was specified, so let's choose ! */
			
 
				-			ncuda = STARPU_MIN(_starpu_get_cuda_device_count(), STARPU_MAXCUDADEVS);
			
 
				+			ncuda = nb_devices;
			
 
				+			if (ncuda > STARPU_MAXCUDADEVS)
			
 
				+			{
			
 
				+				fprintf(stderr,
			
 
				+					"# Warning: %d CUDA devices available. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n",
			
 
				+					nb_devices, STARPU_MAXCUDADEVS);
			
 
				+				ncuda = STARPU_MAXCUDADEVS;
			
 
				+			}
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
 
				+			if (ncuda > nb_devices)
			
 
				+			{
			
 
				+				/* The user requested more CUDA devices than are available */
			
 
				+				fprintf(stderr,
			
 
				+					"# Warning: %d CUDA devices requested. Only %d available.\n",
			
 
				+					ncuda, nb_devices);
			
 
				+				ncuda = nb_devices;
			
 
				+			}
			
 
				 			/* Let's make sure this value is OK. */
			
 
				 			if (ncuda > STARPU_MAXCUDADEVS)
			
 
				 			{
			
@@ -319,14 +335,6 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config,
 
				 					ncuda, STARPU_MAXCUDADEVS);
			
 
				 				ncuda = STARPU_MAXCUDADEVS;
			
 
				 			}
			
 
				-
			
 
				-			if ((unsigned) ncuda > _starpu_get_cuda_device_count())
			
 
				-			{
			
 
				-				fprintf(stderr,
			
 
				-					"# Warning: %d CUDA devices requested. Only %d available.\n",
			
 
				-					ncuda, _starpu_get_cuda_device_count());
			
 
				-				ncuda = _starpu_get_cuda_device_count();
			
 
				-			}
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -369,12 +377,19 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config,
 
				  		 * early to count the number of devices */
			
 
				 		_starpu_opencl_init();
			
 
				 		int nb_devices;
			
 
				-		nb_devices = STARPU_MIN(_starpu_opencl_get_device_count(), STARPU_MAXOPENCLDEVS);
			
 
				+		nb_devices = _starpu_opencl_get_device_count();
			
 
				 
			
 
				 		if (nopencl == -1)
			
 
				 		{
			
 
				 			/* Nothing was specified, so let's choose ! */
			
 
				 			nopencl = nb_devices;
			
 
				+			if (nopencl > STARPU_MAXOPENCLDEVS)
			
 
				+			{
			
 
				+				fprintf(stderr,
			
 
				+					"# Warning: %d OpenCL devices available. Only %d enabled. Use configure option --enable-maxopencldev=xxx to update the maximum value of supported OpenCL devices.\n",
			
 
				+					nb_devices, STARPU_MAXOPENCLDEVS);
			
 
				+				nopencl = STARPU_MAXOPENCLDEVS;
			
 
				+			}
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
@@ -385,8 +400,9 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config,
 
				 				fprintf(stderr,
			
 
				 					"# Warning: %d OpenCL devices requested. Only %d available.\n",
			
 
				 					nopencl, nb_devices);
			
 
				-					topology->nopenclgpus = nb_devices;
			
 
				+				nopencl = nb_devices;
			
 
				 			}
			
 
				+			/* Let's make sure this value is OK. */
			
 
				 			if (nopencl > STARPU_MAXOPENCLDEVS)
			
 
				 			{
			
 
				 				fprintf(stderr,
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -169,7 +169,11 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				-		if ((cl->type == STARPU_SPMD) || (cl->type == STARPU_FORKJOIN))
			
 
				+		if ((cl->type == STARPU_SPMD)
			
 
				+#ifdef STARPU_HAVE_HWLOC
			
 
				+				|| (cl->type == STARPU_FORKJOIN)
			
 
				+#endif
			
 
				+				)
			
 
				 		{
			
 
				 			/* TODO we should add other types of constraints */
			
 
				 
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -443,8 +443,10 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
				 
			
 
				 		if (hop != nhops - 1)
			
 
				 		{
			
 
				-			if (!reused_requests[hop + 1])
			
 
				+			if (!reused_requests[hop + 1]) {
			
 
				 				r->next_req[r->next_req_count++] = requests[hop + 1];
			
 
				+				STARPU_ASSERT(r->next_req_count <= STARPU_MAXNODES);
			
 
				+			}
			
 
				 		}
			
 
				 		else
			
 
				 			_starpu_data_request_append_callback(r, callback_func, callback_arg);
			
--- a/src/datawizard/filters.c
+++ b/src/datawizard/filters.c
@@ -382,6 +382,12 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, uint32_t gatherin
 
				 			still_valid[node]?newstate:STARPU_INVALID;
			
 
				 	}
			
 
				 
			
 
				+	for (child = 0; child < root_handle->nchildren; child++)
			
 
				+	{
			
 
				+		struct _starpu_data_state *child_handle = &root_handle->children[child];
			
 
				+		_starpu_spin_unlock(&child_handle->header_lock);
			
 
				+	}
			
 
				+
			
 
				 	/* there is no child anymore */
			
 
				 	free(root_handle->children);
			
 
				 	root_handle->children = NULL;
			
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -630,8 +630,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 
				 		else
			
 
				 		{
			
 
				 			/* Are all plans contiguous */
			
 
				-                        /* XXX non contiguous buffers are not properly supported yet. (TODO) */
			
 
				-                        STARPU_ASSERT(0);
			
 
				+                        STARPU_ASSERT_MSG(0, "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
			
 
				                 }
			
 
				         }
			
 
				 	else
			
@@ -697,8 +696,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 
				                 else
			
 
				 		{
			
 
				 			/* Are all plans contiguous */
			
 
				-                        /* XXX non contiguous buffers are not properly supported yet. (TODO) */
			
 
				-                        STARPU_ASSERT(0);
			
 
				+                        STARPU_ASSERT_MSG(0, "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
			
 
				                 }
			
 
				         }
			
 
				 	else
			
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -605,8 +605,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 
				 	struct starpu_matrix_interface *dst_matrix = dst_interface;
			
 
				         int err,ret;
			
 
				 
			
 
				-	/* XXX non contiguous matrices are not supported with OpenCL yet ! (TODO) */
			
 
				-	STARPU_ASSERT((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx));
			
 
				+	STARPU_ASSERT_MSG((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx), "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
			
 
				 
			
 
				 	err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_matrix->ptr, src_node, (cl_mem)dst_matrix->dev_handle, dst_node,
			
 
				                                                            src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
			
@@ -625,8 +624,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 
				 	struct starpu_matrix_interface *dst_matrix = dst_interface;
			
 
				         int err, ret;
			
 
				 
			
 
				-	/* XXX non contiguous matrices are not supported with OpenCL yet ! (TODO) */
			
 
				-	STARPU_ASSERT((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx));
			
 
				+	STARPU_ASSERT_MSG((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx), "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
			
 
				 
			
 
				         err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_matrix->dev_handle, src_node, (void*)dst_matrix->ptr, dst_node,
			
 
				                                                            src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
			
--- a/src/datawizard/interfaces/multiformat_interface.c
+++ b/src/datawizard/interfaces/multiformat_interface.c
@@ -267,7 +267,7 @@ static void free_multiformat_buffer_on_node(void *data_interface, uint32_t node)
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 		case STARPU_OPENCL_RAM:
			
 
				-			/* TODO */
			
 
				+			STARPU_ASSERT_MSG(0, "XXX multiformat not supported on OpenCL yet (TODO)");
			
 
				 			break;
			
 
				 #endif
			
 
				 		default:
			
@@ -711,7 +711,8 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
 
				 	(void) dst_interface;
			
 
				 	(void) src_node;
			
 
				 	(void) dst_node;
			
 
				-/* TODO */
			
 
				+
			
 
				+	STARPU_ASSERT_MSG(0, "XXX multiformat copy OpenCL-OpenCL not supported yet (TODO)");
			
 
				 	return 0;
			
 
				 }
			
 
				 #endif
			
--- a/src/datawizard/user_interactions.c
+++ b/src/datawizard/user_interactions.c
@@ -282,6 +282,7 @@ int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_access_mode mod
 
				 		while (!wrapper.finished)
			
 
				 			_STARPU_PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
			
 
				 		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
			
 
				+		_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper.lock);
			
 
				 	}
			
 
				 
			
 
				 	/* At that moment, the caller holds a reference to the piece of data.
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -20,6 +20,7 @@
 
				 
			
 
				 #include <math.h>
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_profiling.h>
			
 
				 #include <drivers/driver_common/driver_common.h>
			
 
				 #include <common/utils.h>
			
 
				 #include <core/debug.h>
			
@@ -30,6 +31,7 @@
 
				 static int execute_job_on_cpu(struct _starpu_job *j, struct _starpu_worker *cpu_args, int is_parallel_task, int rank, enum starpu_perf_archtype perf_arch)
			
 
				 {
			
 
				 	int ret;
			
 
				+	int profiling = starpu_profiling_status_get();
			
 
				 	struct timespec codelet_start, codelet_end;
			
 
				 
			
 
				 	struct starpu_task *task = j->task;
			
@@ -51,7 +53,8 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct _starpu_worker *cpu_
 
				 	if (is_parallel_task)
			
 
				 		_STARPU_PTHREAD_BARRIER_WAIT(&j->before_work_barrier);
			
 
				 
			
 
				-	_starpu_driver_start_job(cpu_args, j, &codelet_start, rank);
			
 
				+	/* Pass down the profiling status read at function entry */
			
 
				+	_starpu_driver_start_job(cpu_args, j, &codelet_start, rank, profiling);
			
 
				 
			
 
				 	/* In case this is a Fork-join parallel task, the worker does not
			
 
				 	 * execute the kernel at all. */
			
@@ -68,7 +71,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct _starpu_worker *cpu_
 
				 			_starpu_bind_thread_on_cpu(cpu_args->config, cpu_args->bindid);
			
 
				 	}
			
 
				 
			
 
				-	_starpu_driver_end_job(cpu_args, j, perf_arch, &codelet_end, rank);
			
 
				+	_starpu_driver_end_job(cpu_args, j, perf_arch, &codelet_end, rank, profiling);
			
 
				 
			
 
				 	if (is_parallel_task)
			
 
				 		_STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
			
@@ -76,7 +79,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct _starpu_worker *cpu_
 
				 	if (rank == 0)
			
 
				 	{
			
 
				 		_starpu_driver_update_job_feedback(j, cpu_args,
			
 
				-				perf_arch, &codelet_start, &codelet_end);
			
 
				+				perf_arch, &codelet_start, &codelet_end, profiling);
			
 
				 		_starpu_push_task_output(j, 0);
			
 
				 	}
			
 
				 
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -20,6 +20,7 @@
 
				 
			
 
				 #include <starpu.h>
			
 
				 #include <starpu_cuda.h>
			
 
				+#include <starpu_profiling.h>
			
 
				 #include <common/utils.h>
			
 
				 #include <common/config.h>
			
 
				 #include <core/debug.h>
			
@@ -198,6 +199,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 
				 
			
 
				 	struct timespec codelet_start, codelet_end;
			
 
				 
			
 
				+	int profiling = starpu_profiling_status_get();
			
 
				 	unsigned calibrate_model = 0;
			
 
				 
			
 
				 	STARPU_ASSERT(task);
			
@@ -223,7 +225,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 
				 			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 	}
			
 
				 
			
 
				-	_starpu_driver_start_job(args, j, &codelet_start, 0);
			
 
				+	_starpu_driver_start_job(args, j, &codelet_start, 0, profiling);
			
 
				 
			
 
				 #ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				 	/* We make sure we do manipulate the proper device */
			
@@ -236,9 +238,9 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 
				 	STARPU_ASSERT(func);
			
 
				 	func(task->interfaces, task->cl_arg);
			
 
				 
			
 
				-	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0);
			
 
				+	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);
			
 
				 
			
 
				-	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end);
			
 
				+	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end, profiling);
			
 
				 
			
 
				 	_starpu_push_task_output(j, mask);
			
 
				 
			
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
@@ -27,12 +27,11 @@
 
				 #include <core/sched_policy.h>
			
 
				 #include <top/starpu_top_core.h>
			
 
				 
			
 
				-void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct timespec *codelet_start, int rank)
			
 
				+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct timespec *codelet_start, int rank, int profiling)
			
 
				 {
			
 
				 	struct starpu_task *task = j->task;
			
 
				 	struct starpu_codelet *cl = task->cl;
			
 
				 	struct starpu_task_profiling_info *profiling_info;
			
 
				-	int profiling = starpu_profiling_status_get();
			
 
				 	int starpu_top=_starpu_top_status_get();
			
 
				 	int workerid = args->workerid;
			
 
				 	unsigned calibrate_model = 0;
			
@@ -65,12 +64,11 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 
				 	_STARPU_TRACE_START_CODELET_BODY(j);
			
 
				 }
			
 
				 
			
 
				-void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perf_archtype perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank)
			
 
				+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perf_archtype perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
			
 
				 {
			
 
				 	struct starpu_task *task = j->task;
			
 
				 	struct starpu_codelet *cl = task->cl;
			
 
				 	struct starpu_task_profiling_info *profiling_info = task->profiling_info;
			
 
				-	int profiling = starpu_profiling_status_get();
			
 
				 	int starpu_top=_starpu_top_status_get();
			
 
				 	int workerid = args->workerid;
			
 
				 	unsigned calibrate_model = 0;
			
@@ -93,7 +91,7 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 
				 }
			
 
				 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
			
 
				 					enum starpu_perf_archtype perf_arch,
			
 
				-					struct timespec *codelet_start, struct timespec *codelet_end)
			
 
				+					struct timespec *codelet_start, struct timespec *codelet_end, int profiling)
			
 
				 {
			
 
				 	struct starpu_task_profiling_info *profiling_info = j->task->profiling_info;
			
 
				 	struct timespec measured_ts;
			
@@ -101,13 +99,12 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 
				 	int workerid = worker_args->workerid;
			
 
				 	struct starpu_codelet *cl = j->task->cl;
			
 
				 	int calibrate_model = 0;
			
 
				-	int profiling = starpu_profiling_status_get();
			
 
				 	int updated = 0;
			
 
				 
			
 
				-	if (cl->model && _starpu_get_calibrate_flag())
			
 
				+	if (cl->model && cl->model->benchmarking)
			
 
				 		calibrate_model = 1;
			
 
				 
			
 
				-	if (profiling_info || calibrate_model)
			
 
				+	if ((profiling && profiling_info) || calibrate_model)
			
 
				 	{
			
 
				 		starpu_timespec_sub(codelet_end, codelet_start, &measured_ts);
			
 
				 		measured = starpu_timing_timespec_to_us(&measured_ts);
			
--- a/src/drivers/driver_common/driver_common.h
+++ b/src/drivers/driver_common/driver_common.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -24,12 +24,12 @@
 
				 #include <common/utils.h>
			
 
				 
			
 
				 void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j,
			
 
				-			      struct timespec *codelet_start, int rank);
			
 
				+			      struct timespec *codelet_start, int rank, int profiling);
			
 
				 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perf_archtype perf_arch,
			
 
				-			    struct timespec *codelet_end, int rank);
			
 
				+			    struct timespec *codelet_end, int rank, int profiling);
			
 
				 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
			
 
				 					enum starpu_perf_archtype perf_arch,
			
 
				-					struct timespec *codelet_start, struct timespec *codelet_end);
			
 
				+					struct timespec *codelet_start, struct timespec *codelet_end, int profiling);
			
 
				 
			
 
				 void _starpu_block_worker(int workerid, pthread_cond_t *cond, pthread_mutex_t *mutex);
			
 
				 
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -20,6 +20,7 @@
 
				 
			
 
				 #include <math.h>
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_profiling.h>
			
 
				 #include <common/config.h>
			
 
				 #include <common/utils.h>
			
 
				 #include <core/debug.h>
			
@@ -579,6 +580,7 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 
				 	STARPU_ASSERT(j);
			
 
				 	struct starpu_task *task = j->task;
			
 
				 
			
 
				+	int profiling = starpu_profiling_status_get();
			
 
				 	struct timespec codelet_start, codelet_end;
			
 
				 
			
 
				 	STARPU_ASSERT(task);
			
@@ -594,16 +596,16 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 
				 		return -EAGAIN;
			
 
				 	}
			
 
				 
			
 
				-	_starpu_driver_start_job(args, j, &codelet_start, 0);
			
 
				+	_starpu_driver_start_job(args, j, &codelet_start, 0, profiling);
			
 
				 
			
 
				 	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
			
 
				 	STARPU_ASSERT(func);
			
 
				 	func(task->interfaces, task->cl_arg);
			
 
				 
			
 
				-	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0);
			
 
				+	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);
			
 
				 
			
 
				 	_starpu_driver_update_job_feedback(j, args, args->perf_arch,
			
 
				-							&codelet_start, &codelet_end);
			
 
				+					   &codelet_start, &codelet_end, profiling);
			
 
				 
			
 
				 	_starpu_push_task_output(j, mask);
			
 
				 
			
--- a/src/sched_policies/heft.c
+++ b/src/sched_policies/heft.c
@@ -307,11 +307,15 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				 		for (nimpl = 0; nimpl <STARPU_MAXIMPLEMENTATIONS; nimpl++) 
			
 
				 		{
			
 
				 			/* Sometimes workers didn't take the tasks as early as we expected */
			
 
				+			pthread_mutex_t *sched_mutex;
			
 
				+			pthread_cond_t *sched_cond;
			
 
				+			starpu_worker_get_sched_condition(sched_ctx_id, workerid, &sched_mutex, &sched_cond);
			
 
				+			_STARPU_PTHREAD_MUTEX_LOCK(&sched_mutex[worker]);
			
 
				 			exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
			
 
				 			exp_end[worker_ctx][nimpl] = exp_start[worker] + exp_len[worker];
			
 
				 			if (exp_end[worker_ctx][nimpl] > max_exp_end)
			
 
				  				max_exp_end = exp_end[worker_ctx][nimpl];
			
 
				-			
			
 
				+			_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_mutex[worker]);
			
 
				 			if (!starpu_worker_can_execute_task(worker, task, nimpl))
			
 
				 			{
			
 
				 				/* no one on that queue may execute this task */
			
--- a/src/sched_policies/parallel_heft.c
+++ b/src/sched_policies/parallel_heft.c
@@ -34,8 +34,6 @@
 
				 #define DBL_MAX __DBL_MAX__
			
 
				 #endif
			
 
				 
			
 
				-static pthread_mutex_t big_lock;
			
 
				-
			
 
				 static unsigned nworkers, ncombinedworkers;
			
 
				 //static enum starpu_perf_archtype applicable_perf_archtypes[STARPU_NARCH_VARIATIONS];
			
 
				 //static unsigned napplicable_perf_archtypes = 0;
			
@@ -93,18 +91,18 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
				 
			
 
				 	int ret = 0;
			
 
				 
			
 
				-	_STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
			
 
				-
			
 
				 	if (is_basic_worker)
			
 
				 	{
			
 
				 		task->predicted = exp_end_predicted - worker_exp_end[best_workerid];
			
 
				 		/* TODO */
			
 
				 		task->predicted_transfer = 0;
			
 
				+		_STARPU_PTHREAD_MUTEX_LOCK(sched_ctx->sched_mutex[best_workerid]);
			
 
				 		worker_exp_len[best_workerid] += task->predicted;
			
 
				 		worker_exp_end[best_workerid] = exp_end_predicted;
			
 
				 		worker_exp_start[best_workerid] = exp_end_predicted - worker_exp_len[best_workerid];
			
 
				 
			
 
				 		ntasks[best_workerid]++;
			
 
				+		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_ctx->sched_mutex[best_workerid]);
			
 
				 
			
 
				 		ret = starpu_push_local_task(best_workerid, task, prio);
			
 
				 	}
			
@@ -135,19 +133,19 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
				 			/* TODO */
			
 
				 			alias->predicted_transfer = 0;
			
 
				 
			
 
				+			_STARPU_PTHREAD_MUTEX_LOCK(sched_ctx->sched_mutex[local_worker]);
			
 
				 			worker_exp_len[local_worker] += alias->predicted;
			
 
				 			worker_exp_end[local_worker] = exp_end_predicted;
			
 
				 			worker_exp_start[local_worker] = exp_end_predicted - worker_exp_len[local_worker];
			
 
				 
			
 
				 			ntasks[local_worker]++;
			
 
				+			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_ctx->sched_mutex[local_worker]);
			
 
				 
			
 
				 			ret |= starpu_push_local_task(local_worker, alias, prio);
			
 
				 		}
			
 
				 
			
 
				 	}
			
 
				 
			
 
				-	_STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
			
 
				-
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -245,10 +243,12 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 
				 	{
			
 
				 		worker = sched_ctx->workerids[worker_ctx];
			
 
				 		/* Sometimes workers didn't take the tasks as early as we expected */
			
 
				+		_STARPU_PTHREAD_MUTEX_LOCK(sched_ctx->sched_mutex[worker]);
			
 
				 		worker_exp_start[worker] = STARPU_MAX(worker_exp_start[worker], starpu_timing_now());
			
 
				 		worker_exp_end[worker] = worker_exp_start[worker] + worker_exp_len[worker];
			
 
				 		if (worker_exp_end[worker] > max_exp_end)
			
 
				 			max_exp_end = worker_exp_end[worker];
			
 
				+		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_ctx->sched_mutex[worker]);
			
 
				 	}
			
 
				 
			
 
				 	unsigned nimpl;
			
@@ -325,8 +325,7 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 
				 		} //end for
			
 
				 	}
			
 
				 
			
 
				-	if (unknown)
			
 
				-	{
			
 
				+	if (unknown) {
			
 
				 		forced_best = ntasks_best;
			
 
				 		forced_best_ctx = ntasks_best_ctx;
			
 
				 		forced_nimpl = nimpl_best;
			
@@ -483,7 +482,6 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 
				 		_STARPU_PTHREAD_COND_INIT(sched_ctx->sched_cond[workerid], NULL);
			
 
				 	}
			
 
				 
			
 
				-	_STARPU_PTHREAD_MUTEX_INIT(&big_lock, NULL);
			
 
				 
			
 
				 	/* We pre-compute an array of all the perfmodel archs that are applicable */
			
 
				 	unsigned total_worker_count = nworkers_ctx + ncombinedworkers;
			
--- a/src/util/malloc.c
+++ b/src/util/malloc.c
@@ -24,7 +24,7 @@
 
				 #include <starpu_cuda.h>
			
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				 
			
 
				-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
 
				+#if (defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER))// || defined(STARPU_USE_OPENCL)
			
 
				 struct malloc_pinned_codelet_struct
			
 
				 {
			
 
				 	void **ptr;
			
@@ -41,7 +41,7 @@ struct malloc_pinned_codelet_struct
 
				 //}
			
 
				 //#endif
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER)
			
 
				 static void malloc_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
			
 
				 {
			
 
				 	struct malloc_pinned_codelet_struct *s = arg;
			
@@ -53,7 +53,7 @@ static void malloc_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED,
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-#if defined(STARPU_USE_CUDA)// || defined(STARPU_USE_OPENCL)
			
 
				+#if (defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER))// || defined(STARPU_USE_OPENCL)
			
 
				 static struct starpu_perfmodel malloc_pinned_model =
			
 
				 {
			
 
				 	.type = STARPU_HISTORY_BASED,
			
@@ -81,12 +81,14 @@ int starpu_malloc(void **A, size_t dim)
 
				 	if (_starpu_can_submit_cuda_task())
			
 
				 	{
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+		cudaError_t cures;
			
 
				+		cures = cudaHostAlloc(A, dim, cudaHostAllocPortable);
			
 
				+		if (STARPU_UNLIKELY(cures))
			
 
				+			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+#else
			
 
				 		int push_res;
			
 
				 
			
 
				-#ifdef STARPU_DEVEL
			
 
				-#warning TODO: CUDA4 is able to directly allocate from any thread without having to launch a task
			
 
				-#endif
			
 
				-
			
 
				 		struct malloc_pinned_codelet_struct s =
			
 
				 		{
			
 
				 			.ptr = A,
			
@@ -106,6 +108,7 @@ int starpu_malloc(void **A, size_t dim)
 
				 		push_res = starpu_task_submit(task);
			
 
				 		STARPU_ASSERT(push_res != -ENODEV);
			
 
				 #endif
			
 
				+#endif
			
 
				 	}
			
 
				 //	else if (_starpu_can_submit_opencl_task())
			
 
				 //	{
			
@@ -142,7 +145,7 @@ int starpu_malloc(void **A, size_t dim)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER)
			
 
				 static void free_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
			
 
				 {
			
 
				 	cudaError_t cures;
			
@@ -161,7 +164,7 @@ static void free_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, vo
 
				 //}
			
 
				 //#endif
			
 
				 
			
 
				-#if defined(STARPU_USE_CUDA) // || defined(STARPU_USE_OPENCL)
			
 
				+#if (defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER)) // || defined(STARPU_USE_OPENCL)
			
 
				 static struct starpu_perfmodel free_pinned_model =
			
 
				 {
			
 
				 	.type = STARPU_HISTORY_BASED,
			
@@ -185,16 +188,21 @@ int starpu_free(void *A)
 
				 		return -EDEADLK;
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				+	if (_starpu_can_submit_cuda_task())
			
 
				+	{
			
 
				+#ifndef HAVE_CUDA_MEMCPY_PEER
			
 
				 	if (!_starpu_is_initialized())
			
 
				 	{
			
 
				+#endif
			
 
				 		/* This is especially useful when starpu_free is called from
			
 
				  		 * the GCC-plugin. starpu_shutdown will probably have already
			
 
				 		 * been called, so we will not be able to submit a task. */
			
 
				 		cudaError_t err = cudaFreeHost(A);
			
 
				 		if (STARPU_UNLIKELY(err))
			
 
				 			STARPU_CUDA_REPORT_ERROR(err);
			
 
				+#ifndef HAVE_CUDA_MEMCPY_PEER
			
 
				 	}
			
 
				-	else if (_starpu_can_submit_cuda_task())
			
 
				+	else
			
 
				 	{
			
 
				 		int push_res;
			
 
				 
			
@@ -211,6 +219,7 @@ int starpu_free(void *A)
 
				 		push_res = starpu_task_submit(task);
			
 
				 		STARPU_ASSERT(push_res != -ENODEV);
			
 
				 	}
			
 
				+#endif
			
 
				 //	else if (_starpu_can_submit_opencl_task())
			
 
				 //	{
			
 
				 //#ifdef STARPU_USE_OPENCL
			
@@ -230,7 +239,7 @@ int starpu_free(void *A)
 
				 //		STARPU_ASSERT(push_res != -ENODEV);
			
 
				 //#endif
			
 
				 //	}
			
 
				-	else
			
 
				+	} else
			
 
				 #endif
			
 
				 	{
			
 
				 		free(A);
			
--- a/src/util/starpu_insert_task.c
+++ b/src/util/starpu_insert_task.c
@@ -78,5 +78,12 @@ int starpu_insert_task(struct starpu_codelet *cl, ...)
 
				 
			
 
				 	va_start(varg_list, cl);
			
 
				         struct starpu_task *task = starpu_task_create();
			
 
				-        return _starpu_insert_task_create_and_submit(arg_buffer, cl, &task, varg_list);
			
 
				+	int ret = _starpu_insert_task_create_and_submit(arg_buffer, cl, &task, varg_list);
			
 
				+
			
 
				+	if (ret == -ENODEV)
			
 
				+	{
			
 
				+		task->destroy = 0;
			
 
				+		starpu_task_destroy(task);
			
 
				+	}
			
 
				+        return ret;
			
 
				 }
			
--- a/tests/main/subgraph_repeat.c
+++ b/tests/main/subgraph_repeat.c
@@ -62,19 +62,19 @@ static struct starpu_codelet dummy_codelet =
 
				 
			
 
				 static void callback_task_D(void *arg __attribute__((unused)))
			
 
				 {
			
 
				+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				 	loop_cnt++;
			
 
				 
			
 
				 	if (loop_cnt == niter)
			
 
				 	{
			
 
				 		/* We are done */
			
 
				-		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				 		_STARPU_PTHREAD_COND_SIGNAL(&cond);
			
 
				 		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				 		int ret;
			
 
				-
			
 
				+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
			
 
				 		/* Let's go for another iteration */
			
 
				 		ret = starpu_task_submit(&taskA); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 		ret = starpu_task_submit(&taskB); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
--- a/tests/overlap/overlap.c
+++ b/tests/overlap/overlap.c
@@ -25,7 +25,11 @@
 
				 #include <pthread.h>
			
 
				 #include "../helper.h"
			
 
				 
			
 
				+#ifdef STARPU_SLOW_MACHINE
			
 
				+#define NTASKS	1000
			
 
				+#else
			
 
				 #define NTASKS	10000
			
 
				+#endif
			
 
				 #define VECTORSIZE	1024
			
 
				 #define TASKDURATION	24U
			
 
				 
			
--- a/tools/dev/check_register.sh
+++ b/tools/dev/check_register.sh
@@ -1,37 +0,0 @@
 
				-#!/bin/bash
			
 
				-
			
 
				-# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				-#
			
 
				-# Copyright (C) 2011  Centre National de la Recherche Scientifique
			
 
				-#
			
 
				-# StarPU is free software; you can redistribute it and/or modify
			
 
				-# it under the terms of the GNU Lesser General Public License as published by
			
 
				-# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				-# your option) any later version.
			
 
				-#
			
 
				-# StarPU is distributed in the hope that it will be useful, but
			
 
				-# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				-#
			
 
				-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				-
			
 
				-stcolor=$(tput sgr0)
			
 
				-datacolor=$(tput setaf 2)
			
 
				-filecolor=$(tput setaf 1)
			
 
				-
			
 
				-process_file()
			
 
				-{
			
 
				-    datas=$(grep "data_register(" $f| awk -F',' '{print $1}' | awk -F'(' '{print $2}' | tr -d '&' | sed 's/\[/\\\[/g' | sed 's/\]/\\\]/g' | sed 's/\*/\\\*/g')
			
 
				-    for data in $datas ; do
			
 
				-	x=$(grep "data_unregister($data" $1)
			
 
				-	if test "$x" == "" ; then
			
 
				-	    x=$(grep "data_unregister_no_coherency($data" $1)
			
 
				-	    if test "$x" == "" ; then
			
 
				-		echo "Error. File <${filecolor}$1${stcolor}>. Handle <${datacolor}$data${stcolor}> is not unregistered"
			
 
				-	    fi
			
 
				-	fi
			
 
				-    done
			
 
				-}
			
 
				-
			
 
				-for f in $(find tests -type f -not -path "*svn*") ; do process_file $f ; done
			
 
				-for f in $(find examples -type f -not -path "*svn*") ; do process_file $f ; done
			
--- a/tools/dev/experimental/cuda_check_return_values.cocci
+++ b/tools/dev/experimental/cuda_check_return_values.cocci
@@ -51,7 +51,9 @@ E@p = cuda_func(...);
 
				 
			
 
				 
			
 
				 @initialize:python depends on report || org@
			
 
				+from re import sub
			
 
				 msg = "Ignoring the return value of %s."
			
 
				+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
			
 
				 
			
 
				 @no_assignment@
			
 
				 identifier cuda_func =~ "^cuda";
			
@@ -70,7 +72,7 @@ position no_assignment.p;
 
				 p << no_assignment.p;
			
 
				 func << no_assignment.cuda_func;
			
 
				 @@
			
 
				-coccilib.org.print_todo(p[0], msg % func)
			
 
				+coccilib.org.print_todo(p[0], orgmsg % func)
			
 
				 
			
 
				 @depends on no_assignment && patch@
			
 
				 identifier no_assignment.cuda_func;
			
--- a/tools/dev/experimental/destroy_task_on_error.cocci
+++ b/tools/dev/experimental/destroy_task_on_error.cocci
@@ -0,0 +1,123 @@
 
				+/*
			
 
				+ * StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2012 inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * When the submission of a task fails, StarPU cannot destroy the task, even if
			
 
				+ * the destroy flag is set. So we have to destroy it ourselves while handling
			
 
				+ * the error.
			
 
				+ *
			
 
				+ * TODO: match if statements without braces.
			
 
				+ */
			
 
				+
			
 
				+virtual context
			
 
				+virtual org
			
 
				+virtual patch
			
 
				+virtual report
			
 
				+
			
 
				+@initialize:python depends on org || report@
			
 
				+msg = "Warning: in %s(): "
			
 
				+msg+= "\"%s\" should probably be destroyed in the body of the if statement"
			
 
				+from re import sub
			
 
				+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
			
 
				+
			
 
				+@r@
			
 
				+local idexpression t;
			
 
				+identifier err;
			
 
				+identifier f;
			
 
				+position p;
			
 
				+@@
			
 
				+f(...)
			
 
				+{
			
 
				+<+...
			
 
				+(
			
 
				+err = starpu_task_submit(t);
			
 
				+|
			
 
				+int err = starpu_task_submit(t);
			
 
				+)
			
 
				+if@p(
			
 
				+(
			
 
				+err == -ENODEV
			
 
				+|
			
 
				+err != 0
			
 
				+|
			
 
				+STARPU_UNLIKELY(err == -ENODEV)
			
 
				+|
			
 
				+STARPU_UNLIKELY(err != 0)
			
 
				+)
			
 
				+ )
			
 
				+{
			
 
				+... when != starpu_task_destroy(t);
			
 
				+    when != exit(...);
			
 
				+    when != STARPU_ASSERT(...);
			
 
				+    when != return 77;
			
 
				+}
			
 
				+...+>
			
 
				+}
			
 
				+
			
 
				+// Context mode.
			
 
				+@depends on r && context@
			
 
				+position r.p;
			
 
				+@@
			
 
				+*if@p(...) { ... }
			
 
				+
			
 
				+// Org mode.
			
 
				+@script:python depends on r && org@
			
 
				+p << r.p;
			
 
				+t << r.t;
			
 
				+f << r.f;
			
 
				+@@
			
 
				+coccilib.org.print_todo(p[0], orgmsg % (f,t))
			
 
				+
			
 
				+// Patch mode.
			
 
				+// XXX: Instead of "..." we could use a statement list (statement list SS). But
			
 
				+// it does not seem to work if there is a "return" statement in the body
			
 
				+// of the if statement.
			
 
				+// Using "..." makes the patch ugly, but this may be fixed in a future version
			
 
				+// of spatch.
			
 
				+@depends on r && patch@
			
 
				+local idexpression r.t;
			
 
				+position r.p;
			
 
				+identifier r.f;
			
 
				+@@
			
 
				+f(...)
			
 
				+{
			
 
				+<+...
			
 
				+if@p (...)
			
 
				+(
			
 
				+{
			
 
				+...
			
 
				++ t->destroy = 0;
			
 
				++ starpu_task_destroy(t);
			
 
				+return ...;
			
 
				+}
			
 
				+|
			
 
				+{
			
 
				+...
			
 
				++ t->destroy = 0;
			
 
				++ starpu_task_destroy(t);
			
 
				+}
			
 
				+)
			
 
				+...+>
			
 
				+}
			
 
				+
			
 
				+// Report mode.
			
 
				+@script:python depends on r && report@
			
 
				+p << r.p;
			
 
				+t << r.t;
			
 
				+f << r.f;
			
 
				+@@
			
 
				+coccilib.report.print_report(p[0], msg % (f,t))
			
--- a/tools/dev/experimental/destroy_task_on_error_test.c
+++ b/tools/dev/experimental/destroy_task_on_error_test.c
@@ -0,0 +1,65 @@
 
				+/*
			
 
				+ * StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2012 inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+static void
			
 
				+good_0(void)
			
 
				+{
			
 
				+	struct starpu_task *task;
			
 
				+	task = starpu_task_create();
			
 
				+	int ret = starpu_task_submit(task);
			
 
				+	if (ret == -ENODEV)
			
 
				+	{
			
 
				+		fprintf(stderr, "fail\n");
			
 
				+		starpu_task_destroy(task);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+bad_0(void)
			
 
				+{
			
 
				+	struct starpu_task *task1, *task2;
			
 
				+
			
 
				+	task1 = starpu_task_create();
			
 
				+	int ret = starpu_task_submit(task1);
			
 
				+	if (ret == -ENODEV)
			
 
				+	{
			
 
				+		fprintf(stderr, "Fail\n");
			
 
				+	}
			
 
				+
			
 
				+	task2 = starpu_task_create();
			
 
				+	ret = starpu_task_submit(task2);
			
 
				+	if (ret == -ENODEV)
			
 
				+	{
			
 
				+		fprintf(stderr, "Fail\n");
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+bad_unlikely(void)
			
 
				+{
			
 
				+	struct starpu_task *task;
			
 
				+
			
 
				+	task = starpu_task_create();
			
 
				+
			
 
				+	int ret = starpu_task_submit(task);
			
 
				+	if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				+	{
			
 
				+		error();
			
 
				+		return 1;
			
 
				+	}
			
 
				+
			
 
				+	starpu_task_destroy(task);
			
 
				+}
			
--- a/tools/dev/experimental/function_call_termination_condition.cocci
+++ b/tools/dev/experimental/function_call_termination_condition.cocci
@@ -34,6 +34,8 @@ virtual report
 
				 
			
 
				 @initialize:python depends on report || org@
			
 
				 msg="Function call in the termination condition of a for loop"
			
 
				+from re import sub
			
 
				+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
			
 
				 
			
 
				 @r@
			
 
				 type t;
			
@@ -62,17 +64,16 @@ expression r.E1;
 
				 @script:python depends on r && org@
			
 
				 p << r.p;
			
 
				 @@
			
 
				-coccilib.org.print_todo(p[0], msg)
			
 
				+coccilib.org.print_todo(p[0], orgmsg)
			
 
				 
			
 
				 @depends on r && patch@
			
 
				-type r.t;
			
 
				 expression r.E1, E2, E3;
			
 
				 identifier r.it;
			
 
				 position r.p;
			
 
				 @@
			
 
				 -for@p(it = E1; it < E3; E2) 
			
 
				-+t max = E3;
			
 
				-+for(it = E1; i < max; E2) 
			
 
				++max = E3;
			
 
				++for(it = E1; it < max; E2)
			
 
				 {
			
 
				 ...
			
 
				 }
			
--- a/tools/dev/experimental/name_codelets.cocci
+++ b/tools/dev/experimental/name_codelets.cocci
@@ -31,6 +31,8 @@ virtual report
 
				 
			
 
				 @initialize:python depends on org || report@
			
 
				 msg = "Warning: %s has no attribute name"
			
 
				+from re import sub
			
 
				+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
			
 
				 
			
 
				 @found@
			
 
				 identifier cl;
			
@@ -61,7 +63,7 @@ position found.p;
 
				 cl << found.cl;
			
 
				 p << found.p;
			
 
				 @@
			
 
				-coccilib.org.print_todo(p[0], msg % cl)
			
 
				+coccilib.org.print_todo(p[0], orgmsg % cl)
			
 
				 
			
 
				 // Patch mode.
			
 
				 @script:python stringify depends on found && !named && patch@
			
--- a/tools/dev/experimental/not_unlocked_mutex.cocci
+++ b/tools/dev/experimental/not_unlocked_mutex.cocci
@@ -21,6 +21,8 @@ virtual report
 
				 
			
 
				 @initialize:python depends on report || org@
			
 
				 msg="The mutex \"%s\" is not unlocked when leaving \"%s\""
			
 
				+from re import sub
			
 
				+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
			
 
				 
			
 
				 @r@
			
 
				 expression E;
			
@@ -53,7 +55,7 @@ f << r.func;
 
				 E << r.E;
			
 
				 @@
			
 
				 for p in ps:
			
 
				-	coccilib.org.print_todo(p, msg % (E, f))
			
 
				+	coccilib.org.print_todo(p, orgmsg % (E, f))
			
 
				 
			
 
				 
			
 
				 @depends on r && patch@
			
--- a/tools/dev/experimental/opencl_check_return_values.cocci
+++ b/tools/dev/experimental/opencl_check_return_values.cocci
@@ -51,5 +51,5 @@ coccilib.report.print_report(p[0], msg)
 
				 p << ignored_return_value.p;
			
 
				 func << ignored_return_value.opencl_func;
			
 
				 @@
			
 
				-msg = "Ignoring the return value of %s." % func
			
 
				+msg = "Ignoring the return value of =%s=." % func
			
 
				 coccilib.org.print_todo(p[0], msg)
			
--- a/tools/dev/experimental/skip_valgrind.cocci
+++ b/tools/dev/experimental/skip_valgrind.cocci
@@ -21,6 +21,8 @@ virtual report
 
				 
			
 
				 @initialize:python depends on report || org@
			
 
				 msg="Should you add STARPU_SKIP_IF_VALGRIND; at the beginning of this function ?"
			
 
				+from re import sub
			
 
				+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
			
 
				 
			
 
				 @find_codelet@
			
 
				 identifier a, b;
			
@@ -66,7 +68,7 @@ position find_codelet.p;
 
				 @script:python depends on find_codelet && !is_empty_codelet && !is_already_ok && org@
			
 
				 p << find_codelet.p;
			
 
				 @@
			
 
				-coccilib.org.print_todo(p[0], msg)
			
 
				+coccilib.org.print_todo(p[0], orgmsg)
			
 
				 
			
 
				 @depends on find_codelet && !is_empty_codelet && !is_already_ok && patch@
			
 
				 identifier find_codelet.a, find_codelet.b;
			
--- a/tools/dev/experimental/unchecked_starpu_function_calls.cocci
+++ b/tools/dev/experimental/unchecked_starpu_function_calls.cocci
@@ -27,6 +27,8 @@ virtual report
 
				 
			
 
				 @initialize:python depends on report || org@
			
 
				 msg = "Unchecked call to %s"
			
 
				+from re import sub
			
 
				+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
			
 
				 
			
 
				 @unchecked_starpu_func_call@
			
 
				 identifier f;
			
@@ -59,7 +61,7 @@ f(...)
 
				 f << unchecked_starpu_func_call.starpu_function;
			
 
				 p << unchecked_starpu_func_call.p;
			
 
				 @@
			
 
				-coccilib.org.print_todo(p[0], msg % f)
			
 
				+coccilib.org.print_todo(p[0], orgmsg % f)
			
 
				 
			
 
				 // Patch mode.
			
 
				 @has_ret depends on unchecked_starpu_func_call@
			
--- a/tools/dev/experimental/use_starpu_macros.cocci
+++ b/tools/dev/experimental/use_starpu_macros.cocci
@@ -24,6 +24,8 @@ virtual report
 
				 @initialize:python depends on report || org@
			
 
				 d = { 'abort':'STARPU_ABORT', 'assert':'STARPU_ASSERT'}
			
 
				 msg = "Please use %s rather than %s."
			
 
				+from re import sub
			
 
				+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
			
 
				 
			
 
				 @r@
			
 
				 identifier f =~ "abort|assert";
			
@@ -88,7 +90,7 @@ expression E1, E2;
 
				 p << r.p;
			
 
				 f << r.f;
			
 
				 @@
			
 
				-coccilib.org.print_todo(p[0], msg % (d[str(f)], f))
			
 
				+coccilib.org.print_todo(p[0], orgmsg % (d[str(f)], f))
			
 
				 
			
 
				 @script:python depends on min && org@
			
 
				 p << min.p;
			
--- a/tools/dev/experimental/use_starpu_pthread_macros.cocci
+++ b/tools/dev/experimental/use_starpu_pthread_macros.cocci
@@ -43,6 +43,8 @@ d = {
 
				 'pthread_spin_unlock'     : '_STARPU_PTHREAD_SPIN_UNLOCK'
			
 
				 }
			
 
				 msg = "Use %s instead of %s."
			
 
				+from re import sub
			
 
				+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
			
 
				 
			
 
				 @r@
			
 
				 identifier f =~ "^pthread_";
			
@@ -66,9 +68,9 @@ p << r.p;
 
				 f << r.f;
			
 
				 @@
			
 
				 if str(f) in d.keys():
			
 
				-	coccilib.org.print_todo(p[0], msg % (d[str(f)], f))
			
 
				+	coccilib.org.print_todo(p[0], orgmsg % (d[str(f)], f))
			
 
				 else:
			
 
				-	coccilib.org.print_todo(p[0], "Shouldn't %s be wrapped in a macro ?" % str(f))
			
 
				+	coccilib.org.print_todo(p[0], "Shouldn't =%s= be wrapped in a macro ?" % str(f))
			
 
				 
			
 
				 
			
 
				 //
			
--- a/tools/dev/internal/check_unrenamed_list_types.sh
+++ b/tools/dev/internal/check_unrenamed_list_types.sh
--- a/tools/dev/internal/rename_internal.sed
+++ b/tools/dev/internal/rename_internal.sed
--- a/tools/dev/internal/rename_internal.sh
+++ b/tools/dev/internal/rename_internal.sh
--- a/tools/dev/mycocci.sh
+++ b/tools/dev/mycocci.sh
@@ -121,7 +121,7 @@ do
 
				 		scripts_dir=$OPTARG;
			
 
				 		;;
			
 
				 	t)
			
 
				-		target=$OPTARG;
			
 
				+		target="$target $OPTARG";
			
 
				 		;;
			
 
				 	\?)
			
 
				 		echo "Invalid option -$OPTARG"
			
--- a/tools/dev/starpu_use_macro.sed
+++ b/tools/dev/starpu_use_macro.sed
@@ -1,15 +0,0 @@
 
				-# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				-#
			
 
				-# Copyright (C) 2012 INRIA
			
 
				-#
			
 
				-# StarPU is free software; you can redistribute it and/or modify
			
 
				-# it under the terms of the GNU Lesser General Public License as published by
			
 
				-# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				-# your option) any later version.
			
 
				-#
			
 
				-# StarPU is distributed in the hope that it will be useful, but
			
 
				-# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				-#
			
 
				-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				-s/^#if STARPU_USE_/#ifdef STARPU_USE_/
			
--- a/tools/dev/starpu_use_macro.sh
+++ b/tools/dev/starpu_use_macro.sh
@@ -15,4 +15,4 @@
 
				 #
			
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 
			
 
				-find . -type f -name "*.c" -not -path "*svn*" -exec sed -i -f $(dirname $0)/starpu_use_macro.sed {} \;
			
 
				+find . -type f -name "*.c" -not -path "*svn*" -exec sed -i 's/^#if STARPU_USE_/#ifdef STARPU_USE_/' {} \;
			
--- a/tools/valgrind/starpu.suppr
+++ b/tools/valgrind/starpu.suppr
@@ -2,7 +2,7 @@
 
				    config.running is not racy from starpu_shutdown
			
 
				    Helgrind:Race
			
 
				    fun:starpu_shutdown
			
 
				-   fun:main
			
 
				+   ...
			
 
				 }
			
 
				 
			
 
				 {
			
@@ -25,3 +25,25 @@
 
				    fun:_starpu_msi_cache_miss
			
 
				    ...
			
 
				 }
			
 
				+
			
 
				+{
			
 
				+   known race, but not problematic in practice, see comment in _starpu_tag_clear
			
 
				+   Helgrind:LockOrder
			
 
				+   ...
			
 
				+   fun:_starpu_tag_free
			
 
				+   fun:_starpu_htbl_clear_tags
			
 
				+   ...
			
 
				+   fun:_starpu_tag_clear
			
 
				+   fun:starpu_shutdown
			
 
				+   ...
			
 
				+}
			
 
				+
			
 
				+
			
 
				+{
			
 
				+   There is actually no race on current_mode, because the mode cannot change unexpectedly, until _starpu_notify_data_dependencies() is called further down. Valgrind cannot know about such a software rwlock.
			
 
				+   Helgrind:Race
			
 
				+   fun:_starpu_release_data_on_node
			
 
				+   fun:_starpu_push_task_output
			
 
				+   ...
			
 
				+}
			
 
				+