Browse Source

another merge

Andra Hugo 13 years ago
parent
commit
058743e72a
53 changed files with 452 additions and 167 deletions
  1. 37 0
      .gitignore
  2. 2 1
      doc/chapters/advanced-examples.texi
  3. 2 0
      examples/basic_examples/block.c
  4. 5 2
      examples/basic_examples/vector_scal_c.c
  5. 5 0
      examples/lu/lu_example.c
  6. 4 0
      examples/mult/xgemm.c
  7. 1 0
      examples/opt/pi/pi.c
  8. 1 13
      examples/opt/pi/pi_redux.c
  9. 8 1
      gcc-plugin/src/starpu.c
  10. 1 1
      gcc-plugin/tests/Makefile.am
  11. 1 0
      gcc-plugin/tests/mocks.h
  12. 1 1
      src/core/dependencies/data_concurrency.c
  13. 5 0
      src/core/dependencies/tags.c
  14. 8 5
      src/core/perfmodel/perfmodel_bus.c
  15. 6 3
      src/core/perfmodel/perfmodel_history.c
  16. 28 12
      src/core/topology.c
  17. 5 1
      src/core/workers.c
  18. 3 1
      src/datawizard/coherency.c
  19. 6 0
      src/datawizard/filters.c
  20. 3 5
      src/datawizard/interfaces/block_interface.c
  21. 3 5
      src/datawizard/interfaces/matrix_interface.c
  22. 3 2
      src/datawizard/interfaces/multiformat_interface.c
  23. 1 0
      src/datawizard/user_interactions.c
  24. 6 3
      src/drivers/cpu/driver_cpu.c
  25. 5 3
      src/drivers/cuda/driver_cuda.c
  26. 6 9
      src/drivers/driver_common/driver_common.c
  27. 4 4
      src/drivers/driver_common/driver_common.h
  28. 5 3
      src/drivers/opencl/driver_opencl.c
  29. 5 1
      src/sched_policies/heft.c
  30. 7 9
      src/sched_policies/parallel_heft.c
  31. 20 11
      src/util/malloc.c
  32. 8 1
      src/util/starpu_insert_task.c
  33. 2 2
      tests/main/subgraph_repeat.c
  34. 4 0
      tests/overlap/overlap.c
  35. 0 37
      tools/dev/check_register.sh
  36. 3 1
      tools/dev/experimental/cuda_check_return_values.cocci
  37. 123 0
      tools/dev/experimental/destroy_task_on_error.cocci
  38. 65 0
      tools/dev/experimental/destroy_task_on_error_test.c
  39. 5 4
      tools/dev/experimental/function_call_termination_condition.cocci
  40. 3 1
      tools/dev/experimental/name_codelets.cocci
  41. 3 1
      tools/dev/experimental/not_unlocked_mutex.cocci
  42. 1 1
      tools/dev/experimental/opencl_check_return_values.cocci
  43. 3 1
      tools/dev/experimental/skip_valgrind.cocci
  44. 3 1
      tools/dev/experimental/unchecked_starpu_function_calls.cocci
  45. 3 1
      tools/dev/experimental/use_starpu_macros.cocci
  46. 4 2
      tools/dev/experimental/use_starpu_pthread_macros.cocci
  47. 0 0
      tools/dev/internal/check_unrenamed_list_types.sh
  48. 0 0
      tools/dev/internal/rename_internal.sed
  49. 0 0
      tools/dev/internal/rename_internal.sh
  50. 1 1
      tools/dev/mycocci.sh
  51. 0 15
      tools/dev/starpu_use_macro.sed
  52. 1 1
      tools/dev/starpu_use_macro.sh
  53. 23 1
      tools/valgrind/starpu.suppr

+ 37 - 0
.gitignore

@@ -248,3 +248,40 @@ starpu.log
 /tools/starpu_perfmodel_plot.1
 /starpu-1.0.pc
 /gcc-plugin/examples/cholesky/cholesky
+/gcc-plugin/tests/*.log
+/test/*.log
+/examples/*.log
+/tests/main/declare_deps_after_submission
+/tests/main/declare_deps_after_submission_synchronous
+/tests/main/declare_deps_in_callback
+/tests/main/deprecated
+/tests/main/deprecated_buffer
+/tests/main/deprecated_func
+/tests/main/empty_task
+/tests/main/empty_task_chain
+/tests/main/empty_task_sync_point
+/tests/main/empty_task_sync_point_tasks
+/tests/main/execute_on_a_specific_worker
+/tests/main/get_current_task
+/tests/main/insert_task
+/tests/main/multiformat_data_release
+/tests/main/multiformat_handle_conversion
+/tests/main/multithreaded
+/tests/main/multithreaded_init
+/tests/main/regenerate
+/tests/main/restart
+/tests/main/starpu_init
+/tests/main/starpu_task_bundle
+/tests/main/starpu_task_wait
+/tests/main/starpu_task_wait_for_all
+/tests/main/starpu_worker_exists
+/tests/main/static_restartable
+/tests/main/static_restartable_tag
+/tests/main/static_restartable_using_initializer
+/tests/main/subgraph_repeat
+/tests/main/subgraph_repeat_regenerate
+/tests/main/tag_wait_api
+/tests/main/task_wait_api
+/tests/main/wait_all_regenerable_tasks
+/tools/starpu_workers_activity
+/tests/datawizard/interfaces/copy_interfaces

+ 2 - 1
doc/chapters/advanced-examples.texi

@@ -597,7 +597,8 @@ parallel CPU implementation of the computation to be achieved. This can also be
 useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
 work collectively on a single task, the completion time of tasks on CPUs becomes
 comparable to the completion time on GPUs, thus relieving from granularity
-discrepancy concerns.
+discrepancy concerns. Hwloc support needs to be enabled to get good performance,
+otherwise StarPU will not know how to better group cores.
 
 Two modes of execution exist to accommodate existing usages.
 

+ 2 - 0
examples/basic_examples/block.c

@@ -60,6 +60,8 @@ int execute_on(uint32_t where, device_func func, float *block, int pnx, int pny,
         if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
                 FPRINTF(stderr, "No worker may execute this task\n");
+		task->destroy = 0;
+                starpu_task_destroy(task);
                 return 1;
 	}
 

+ 5 - 2
examples/basic_examples/vector_scal_c.c

@@ -53,13 +53,14 @@ static struct starpu_codelet cl =
 	.model = &vector_scal_model
 };
 
-void compute_(int *F_NX, float *vector)
+int compute_(int *F_NX, float *vector)
 {
         int NX = *F_NX;
 	int ret;
 
 	/* Initialize StarPU with default configuration */
 	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* Tell StarPU to associate the "vector" vector with the "vector_handle"
@@ -98,7 +99,7 @@ void compute_(int *F_NX, float *vector)
 
 	/* execute the task on any eligible computational resource */
 	ret = starpu_task_submit(task);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	/* StarPU does not need to manipulate the array anymore so we can stop
  	 * monitoring it */
@@ -106,4 +107,6 @@ void compute_(int *F_NX, float *vector)
 
 	/* terminate StarPU, no task can be submitted after */
 	starpu_shutdown();
+
+	return ret;
 }

+ 5 - 0
examples/lu/lu_example.c

@@ -297,6 +297,11 @@ int main(int argc, char **argv)
 
 	parse_args(argc, argv);
 
+#ifdef STARPU_SLOW_MACHINE
+	size /= 4;
+	nblocks /= 4;
+#endif
+
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
 		return 77;

+ 4 - 0
examples/mult/xgemm.c

@@ -275,6 +275,10 @@ int main(int argc, char **argv)
 
 	parse_args(argc, argv);
 
+#ifdef STARPU_SLOW_MACHINE
+	niter /= 10;
+#endif
+
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
 		return 77;

+ 1 - 0
examples/opt/pi/pi.c

@@ -88,6 +88,7 @@ int main(int argc, char **argv)
 	parse_args(argc, argv);
 
 	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* Initialize the random number generator */

+ 1 - 13
examples/opt/pi/pi_redux.c

@@ -189,19 +189,6 @@ static void pi_func_cuda(void *descr[], void *cl_arg __attribute__ ((unused)))
 }
 #endif
 
-/* The amount of work does not depend on the data size at all :) */
-static size_t size_base(struct starpu_task *task, unsigned nimpl)
-{
-	return NSHOT_PER_TASK;
-}
-
-static struct starpu_perfmodel model =
-{
-	.type = STARPU_HISTORY_BASED,
-	.size_base = size_base,
-	.symbol = "monte_carlo_pi_redux"
-};
-
 static struct starpu_codelet pi_cl =
 {
 	.where =
@@ -319,6 +306,7 @@ int main(int argc, char **argv)
 	parse_args(argc, argv);
 
 	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* Launch a Random Number Generator (RNG) on each worker */

+ 8 - 1
gcc-plugin/src/starpu.c

@@ -1735,6 +1735,12 @@ build_codelet_initializer (tree task_decl)
     return init;
   }
 
+  tree codelet_name ()
+  {
+    const char *name = IDENTIFIER_POINTER (DECL_NAME (task_decl));
+    return build_string_literal (strlen (name) + 1, name);
+  }
+
   tree where_init (tree impls)
   {
     tree impl;
@@ -1841,7 +1847,8 @@ build_codelet_initializer (tree task_decl)
   impls = task_implementation_list (task_decl);
 
   inits =
-    chain_trees (field_initializer ("where", where_init (impls)),
+    chain_trees (field_initializer ("name", codelet_name ()),
+		 field_initializer ("where", where_init (impls)),
 		 field_initializer ("nbuffers", pointer_arg_count ()),
 		 field_initializer ("modes", access_mode_array ()),
 		 field_initializer ("cpu_funcs",

+ 1 - 1
gcc-plugin/tests/Makefile.am

@@ -74,4 +74,4 @@ check-hook:
 endif !HAVE_GUILE
 
 showcheck:
-	-cat $(TEST_LOGS) /dev/null
+	-cat $(TEST_LOGS) /dev/null

+ 1 - 0
gcc-plugin/tests/mocks.h

@@ -57,6 +57,7 @@ const struct insert_task_argument *expected_insert_task_arguments;
 int
 starpu_insert_task (struct starpu_codelet *cl, ...)
 {
+  assert (cl->name != NULL && strlen (cl->name) > 0);
   assert (cl->where == (STARPU_CPU | STARPU_OPENCL));
 
   /* TODO: Call `cpu_func' & co. and check whether they do the right

+ 1 - 1
src/core/dependencies/data_concurrency.c

@@ -211,9 +211,9 @@ static unsigned _submit_job_enforce_data_deps(struct _starpu_job *j, unsigned st
 			 * _starpu_compar_handles.  */
 			continue;
 
+                j->task->status = STARPU_TASK_BLOCKED_ON_DATA;
                 if (attempt_to_submit_data_request_from_job(j, buf))
 		{
-                        j->task->status = STARPU_TASK_BLOCKED_ON_DATA;
 			return 1;
                 }
 	}

+ 5 - 0
src/core/dependencies/tags.c

@@ -106,6 +106,7 @@ static void _starpu_tag_free(void *_tag)
 #endif
 
 		_starpu_spin_unlock(&tag->lock);
+		_starpu_spin_destroy(&tag->lock);
 
 		free(tag);
 	}
@@ -128,6 +129,10 @@ void _starpu_tag_clear(void)
 {
 	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
 
+	/* XXX: _starpu_tag_free takes the tag spinlocks while we are keeping
+	 * the global rwlock. This contradicts the lock order of
+	 * starpu_tag_wait_array. Should not be a problem in practice since
+	 * _starpu_tag_clear is called at shutdown only. */
 	_starpu_htbl_clear_tags(&tag_htbl, 0, _starpu_tag_free);
 
 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);

+ 8 - 5
src/core/perfmodel/perfmodel_bus.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -498,8 +498,10 @@ static void benchmark_all_gpu_devices(void)
 	hwloc_topology_load(hwtopology);
 #endif
 
-	/* TODO: use hwloc */
-#ifdef __linux__
+#ifdef STARPU_HAVE_HWLOC
+	hwloc_cpuset_t former_cpuset = hwloc_bitmap_alloc();
+	hwloc_get_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
+#elif __linux__
 	/* Save the current cpu binding */
 	cpu_set_t former_process_affinity;
 	int ret;
@@ -545,8 +547,9 @@ static void benchmark_all_gpu_devices(void)
 	}
 #endif
 
-	/* FIXME: use hwloc */
-#ifdef __linux__
+#ifdef STARPU_HAVE_HWLOC
+	hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
+#elif __linux__
 	/* Restore the former affinity */
 	ret = sched_setaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
 	if (ret)

+ 6 - 3
src/core/perfmodel/perfmodel_history.c

@@ -966,10 +966,11 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 	{
 		uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
 		struct starpu_per_arch_perfmodel *per_arch_model = &model->per_arch[arch][nimpl];
-		struct starpu_htbl32_node *history = per_arch_model->history;
+		struct starpu_htbl32_node *history;
 		struct starpu_history_entry *entry;
 
 		_STARPU_PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
+		history = per_arch_model->history;
 		entry = (struct starpu_history_entry *) _starpu_htbl_search_32(history, key);
 		_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 
@@ -997,11 +998,13 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 
 	per_arch_model = &model->per_arch[arch][nimpl];
 
+	_STARPU_PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
 	history = per_arch_model->history;
-	if (!history)
+	if (!history) {
+		_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 		return NAN;
+	}
 
-	_STARPU_PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
 	entry = (struct starpu_history_entry *) _starpu_htbl_search_32(history, key);
 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 

+ 28 - 12
src/core/topology.c

@@ -290,7 +290,7 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config,
 	_starpu_initialize_workers_bindid(config);
 
 #ifdef STARPU_USE_CUDA
-	int ncuda = -1;
+	int ncuda;
 	ncuda = starpu_get_env_number("STARPU_NCUDA");
 
 	/* STARPU_NCUDA is not set. Did the user specify anything ? */
@@ -303,14 +303,30 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config,
 		/* The user did not disable CUDA. We need to initialize CUDA
  		 * early to count the number of devices */
 		_starpu_init_cuda();
+		int nb_devices = _starpu_get_cuda_device_count();
 
 		if (ncuda == -1)
 		{
 			/* Nothing was specified, so let's choose ! */
-			ncuda = STARPU_MIN(_starpu_get_cuda_device_count(), STARPU_MAXCUDADEVS);
+			ncuda = nb_devices;
+			if (ncuda > STARPU_MAXCUDADEVS)
+			{
+				fprintf(stderr,
+					"# Warning: %d CUDA devices available. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n",
+					nb_devices, STARPU_MAXCUDADEVS);
+				ncuda = STARPU_MAXCUDADEVS;
+			}
 		}
 		else
 		{
+			if (ncuda > nb_devices)
+			{
+				/* The user requires more CUDA devices than are available */
+				fprintf(stderr,
+					"# Warning: %d CUDA devices requested. Only %d available.\n",
+					ncuda, nb_devices);
+				ncuda = nb_devices;
+			}
 			/* Let's make sure this value is OK. */
 			if (ncuda > STARPU_MAXCUDADEVS)
 			{
@@ -319,14 +335,6 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config,
 					ncuda, STARPU_MAXCUDADEVS);
 				ncuda = STARPU_MAXCUDADEVS;
 			}
-
-			if ((unsigned) ncuda > _starpu_get_cuda_device_count())
-			{
-				fprintf(stderr,
-					"# Warning: %d CUDA devices requested. Only %d available.\n",
-					ncuda, _starpu_get_cuda_device_count());
-				ncuda = _starpu_get_cuda_device_count();
-			}
 		}
 	}
 
@@ -369,12 +377,19 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config,
  		 * early to count the number of devices */
 		_starpu_opencl_init();
 		int nb_devices;
-		nb_devices = STARPU_MIN(_starpu_opencl_get_device_count(), STARPU_MAXOPENCLDEVS);
+		nb_devices = _starpu_opencl_get_device_count();
 
 		if (nopencl == -1)
 		{
 			/* Nothing was specified, so let's choose ! */
 			nopencl = nb_devices;
+			if (nopencl > STARPU_MAXOPENCLDEVS)
+			{
+				fprintf(stderr,
+					"# Warning: %d OpenCL devices available. Only %d enabled. Use configure option --enable-maxopencldadev=xxx to update the maximum value of supported OpenCL devices.\n",
+					nb_devices, STARPU_MAXOPENCLDEVS);
+				nopencl = STARPU_MAXOPENCLDEVS;
+			}
 		}
 		else
 		{
@@ -385,8 +400,9 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config,
 				fprintf(stderr,
 					"# Warning: %d OpenCL devices requested. Only %d available.\n",
 					nopencl, nb_devices);
-					topology->nopenclgpus = nb_devices;
+				nopencl = nb_devices;
 			}
+			/* Let's make sure this value is OK. */
 			if (nopencl > STARPU_MAXOPENCLDEVS)
 			{
 				fprintf(stderr,

+ 5 - 1
src/core/workers.c

@@ -169,7 +169,11 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
 	}
 	else
 	{
-		if ((cl->type == STARPU_SPMD) || (cl->type == STARPU_FORKJOIN))
+		if ((cl->type == STARPU_SPMD)
+#ifdef STARPU_HAVE_HWLOC
+				|| (cl->type == STARPU_FORKJOIN)
+#endif
+				)
 		{
 			/* TODO we should add other types of constraints */
 

+ 3 - 1
src/datawizard/coherency.c

@@ -443,8 +443,10 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
 		if (hop != nhops - 1)
 		{
-			if (!reused_requests[hop + 1])
+			if (!reused_requests[hop + 1]) {
 				r->next_req[r->next_req_count++] = requests[hop + 1];
+				STARPU_ASSERT(r->next_req_count <= STARPU_MAXNODES);
+			}
 		}
 		else
 			_starpu_data_request_append_callback(r, callback_func, callback_arg);

+ 6 - 0
src/datawizard/filters.c

@@ -382,6 +382,12 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, uint32_t gatherin
 			still_valid[node]?newstate:STARPU_INVALID;
 	}
 
+	for (child = 0; child < root_handle->nchildren; child++)
+	{
+		struct _starpu_data_state *child_handle = &root_handle->children[child];
+		_starpu_spin_unlock(&child_handle->header_lock);
+	}
+
 	/* there is no child anymore */
 	free(root_handle->children);
 	root_handle->children = NULL;

+ 3 - 5
src/datawizard/interfaces/block_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -630,8 +630,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 		else
 		{
 			/* Are all plans contiguous */
-                        /* XXX non contiguous buffers are not properly supported yet. (TODO) */
-                        STARPU_ASSERT(0);
+                        STARPU_ASSERT_MSG(0, "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
                 }
         }
 	else
@@ -697,8 +696,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
                 else
 		{
 			/* Are all plans contiguous */
-                        /* XXX non contiguous buffers are not properly supported yet. (TODO) */
-                        STARPU_ASSERT(0);
+                        STARPU_ASSERT_MSG(0, "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
                 }
         }
 	else

+ 3 - 5
src/datawizard/interfaces/matrix_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -605,8 +605,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 	struct starpu_matrix_interface *dst_matrix = dst_interface;
         int err,ret;
 
-	/* XXX non contiguous matrices are not supported with OpenCL yet ! (TODO) */
-	STARPU_ASSERT((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx));
+	STARPU_ASSERT_MSG((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx), "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
 
 	err = starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_matrix->ptr, src_node, (cl_mem)dst_matrix->dev_handle, dst_node,
                                                            src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
@@ -625,8 +624,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 	struct starpu_matrix_interface *dst_matrix = dst_interface;
         int err, ret;
 
-	/* XXX non contiguous matrices are not supported with OpenCL yet ! (TODO) */
-	STARPU_ASSERT((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx));
+	STARPU_ASSERT_MSG((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx), "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
 
         err = starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_matrix->dev_handle, src_node, (void*)dst_matrix->ptr, dst_node,
                                                            src_matrix->nx*src_matrix->ny*src_matrix->elemsize,

+ 3 - 2
src/datawizard/interfaces/multiformat_interface.c

@@ -267,7 +267,7 @@ static void free_multiformat_buffer_on_node(void *data_interface, uint32_t node)
 #endif
 #ifdef STARPU_USE_OPENCL
 		case STARPU_OPENCL_RAM:
-			/* TODO */
+			STARPU_ASSERT_MSG(0, "XXX multiformat not supported on OpenCL yet (TODO)");
 			break;
 #endif
 		default:
@@ -711,7 +711,8 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
 	(void) dst_interface;
 	(void) src_node;
 	(void) dst_node;
-/* TODO */
+
+	STARPU_ASSERT_MSG(0, "XXX multiformat copy OpenCL-OpenCL not supported yet (TODO)");
 	return 0;
 }
 #endif

+ 1 - 0
src/datawizard/user_interactions.c

@@ -282,6 +282,7 @@ int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_access_mode mod
 		while (!wrapper.finished)
 			_STARPU_PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
+		_STARPU_PTHREAD_MUTEX_DESTROY(&wrapper.lock);
 	}
 
 	/* At that moment, the caller holds a reference to the piece of data.

+ 6 - 3
src/drivers/cpu/driver_cpu.c

@@ -20,6 +20,7 @@
 
 #include <math.h>
 #include <starpu.h>
+#include <starpu_profiling.h>
 #include <drivers/driver_common/driver_common.h>
 #include <common/utils.h>
 #include <core/debug.h>
@@ -30,6 +31,7 @@
 static int execute_job_on_cpu(struct _starpu_job *j, struct _starpu_worker *cpu_args, int is_parallel_task, int rank, enum starpu_perf_archtype perf_arch)
 {
 	int ret;
+	int profiling = starpu_profiling_status_get();
 	struct timespec codelet_start, codelet_end;
 
 	struct starpu_task *task = j->task;
@@ -51,7 +53,8 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct _starpu_worker *cpu_
 	if (is_parallel_task)
 		_STARPU_PTHREAD_BARRIER_WAIT(&j->before_work_barrier);
 
-	_starpu_driver_start_job(cpu_args, j, &codelet_start, rank);
+	/* Give profiling variable */
+	_starpu_driver_start_job(cpu_args, j, &codelet_start, rank, profiling);
 
 	/* In case this is a Fork-join parallel task, the worker does not
 	 * execute the kernel at all. */
@@ -68,7 +71,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct _starpu_worker *cpu_
 			_starpu_bind_thread_on_cpu(cpu_args->config, cpu_args->bindid);
 	}
 
-	_starpu_driver_end_job(cpu_args, j, perf_arch, &codelet_end, rank);
+	_starpu_driver_end_job(cpu_args, j, perf_arch, &codelet_end, rank, profiling);
 
 	if (is_parallel_task)
 		_STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
@@ -76,7 +79,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct _starpu_worker *cpu_
 	if (rank == 0)
 	{
 		_starpu_driver_update_job_feedback(j, cpu_args,
-				perf_arch, &codelet_start, &codelet_end);
+				perf_arch, &codelet_start, &codelet_end, profiling);
 		_starpu_push_task_output(j, 0);
 	}
 

+ 5 - 3
src/drivers/cuda/driver_cuda.c

@@ -20,6 +20,7 @@
 
 #include <starpu.h>
 #include <starpu_cuda.h>
+#include <starpu_profiling.h>
 #include <common/utils.h>
 #include <common/config.h>
 #include <core/debug.h>
@@ -198,6 +199,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 
 	struct timespec codelet_start, codelet_end;
 
+	int profiling = starpu_profiling_status_get();
 	unsigned calibrate_model = 0;
 
 	STARPU_ASSERT(task);
@@ -223,7 +225,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 			STARPU_CUDA_REPORT_ERROR(cures);
 	}
 
-	_starpu_driver_start_job(args, j, &codelet_start, 0);
+	_starpu_driver_start_job(args, j, &codelet_start, 0, profiling);
 
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	/* We make sure we do manipulate the proper device */
@@ -236,9 +238,9 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 	STARPU_ASSERT(func);
 	func(task->interfaces, task->cl_arg);
 
-	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0);
+	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);
 
-	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end);
+	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end, profiling);
 
 	_starpu_push_task_output(j, mask);
 

+ 6 - 9
src/drivers/driver_common/driver_common.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
@@ -27,12 +27,11 @@
 #include <core/sched_policy.h>
 #include <top/starpu_top_core.h>
 
-void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct timespec *codelet_start, int rank)
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct timespec *codelet_start, int rank, int profiling)
 {
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
 	struct starpu_task_profiling_info *profiling_info;
-	int profiling = starpu_profiling_status_get();
 	int starpu_top=_starpu_top_status_get();
 	int workerid = args->workerid;
 	unsigned calibrate_model = 0;
@@ -65,12 +64,11 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	_STARPU_TRACE_START_CODELET_BODY(j);
 }
 
-void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perf_archtype perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank)
+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perf_archtype perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
 {
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
 	struct starpu_task_profiling_info *profiling_info = task->profiling_info;
-	int profiling = starpu_profiling_status_get();
 	int starpu_top=_starpu_top_status_get();
 	int workerid = args->workerid;
 	unsigned calibrate_model = 0;
@@ -93,7 +91,7 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 }
 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
 					enum starpu_perf_archtype perf_arch,
-					struct timespec *codelet_start, struct timespec *codelet_end)
+					struct timespec *codelet_start, struct timespec *codelet_end, int profiling)
 {
 	struct starpu_task_profiling_info *profiling_info = j->task->profiling_info;
 	struct timespec measured_ts;
@@ -101,13 +99,12 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 	int workerid = worker_args->workerid;
 	struct starpu_codelet *cl = j->task->cl;
 	int calibrate_model = 0;
-	int profiling = starpu_profiling_status_get();
 	int updated = 0;
 
-	if (cl->model && _starpu_get_calibrate_flag())
+	if (cl->model && cl->model->benchmarking)
 		calibrate_model = 1;
 
-	if (profiling_info || calibrate_model)
+	if ((profiling && profiling_info) || calibrate_model)
 	{
 		starpu_timespec_sub(codelet_end, codelet_start, &measured_ts);
 		measured = starpu_timing_timespec_to_us(&measured_ts);

+ 4 - 4
src/drivers/driver_common/driver_common.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -24,12 +24,12 @@
 #include <common/utils.h>
 
 void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j,
-			      struct timespec *codelet_start, int rank);
+			      struct timespec *codelet_start, int rank, int profiling);
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perf_archtype perf_arch,
-			    struct timespec *codelet_end, int rank);
+			    struct timespec *codelet_end, int rank, int profiling);
 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
 					enum starpu_perf_archtype perf_arch,
-					struct timespec *codelet_start, struct timespec *codelet_end);
+					struct timespec *codelet_start, struct timespec *codelet_end, int profiling);
 
 void _starpu_block_worker(int workerid, pthread_cond_t *cond, pthread_mutex_t *mutex);
 

+ 5 - 3
src/drivers/opencl/driver_opencl.c

@@ -20,6 +20,7 @@
 
 #include <math.h>
 #include <starpu.h>
+#include <starpu_profiling.h>
 #include <common/config.h>
 #include <common/utils.h>
 #include <core/debug.h>
@@ -579,6 +580,7 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 	STARPU_ASSERT(j);
 	struct starpu_task *task = j->task;
 
+	int profiling = starpu_profiling_status_get();
 	struct timespec codelet_start, codelet_end;
 
 	STARPU_ASSERT(task);
@@ -594,16 +596,16 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(args, j, &codelet_start, 0);
+	_starpu_driver_start_job(args, j, &codelet_start, 0, profiling);
 
 	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
 	STARPU_ASSERT(func);
 	func(task->interfaces, task->cl_arg);
 
-	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0);
+	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);
 
 	_starpu_driver_update_job_feedback(j, args, args->perf_arch,
-							&codelet_start, &codelet_end);
+					   &codelet_start, &codelet_end, profiling);
 
 	_starpu_push_task_output(j, mask);
 

+ 5 - 1
src/sched_policies/heft.c

@@ -307,11 +307,15 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 		for (nimpl = 0; nimpl <STARPU_MAXIMPLEMENTATIONS; nimpl++) 
 		{
 			/* Sometimes workers didn't take the tasks as early as we expected */
+			pthread_mutex_t *sched_mutex;
+			pthread_cond_t *sched_cond;
+			starpu_worker_get_sched_condition(sched_ctx_id, workerid, &sched_mutex, &sched_cond);
+			_STARPU_PTHREAD_MUTEX_LOCK(&sched_mutex[worker]);
 			exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
 			exp_end[worker_ctx][nimpl] = exp_start[worker] + exp_len[worker];
 			if (exp_end[worker_ctx][nimpl] > max_exp_end)
  				max_exp_end = exp_end[worker_ctx][nimpl];
-			
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_mutex[worker]);
 			if (!starpu_worker_can_execute_task(worker, task, nimpl))
 			{
 				/* no one on that queue may execute this task */

+ 7 - 9
src/sched_policies/parallel_heft.c

@@ -34,8 +34,6 @@
 #define DBL_MAX __DBL_MAX__
 #endif
 
-static pthread_mutex_t big_lock;
-
 static unsigned nworkers, ncombinedworkers;
 //static enum starpu_perf_archtype applicable_perf_archtypes[STARPU_NARCH_VARIATIONS];
 //static unsigned napplicable_perf_archtypes = 0;
@@ -93,18 +91,18 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
 	int ret = 0;
 
-	_STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
-
 	if (is_basic_worker)
 	{
 		task->predicted = exp_end_predicted - worker_exp_end[best_workerid];
 		/* TODO */
 		task->predicted_transfer = 0;
+		_STARPU_PTHREAD_MUTEX_LOCK(sched_ctx->sched_mutex[best_workerid]);
 		worker_exp_len[best_workerid] += task->predicted;
 		worker_exp_end[best_workerid] = exp_end_predicted;
 		worker_exp_start[best_workerid] = exp_end_predicted - worker_exp_len[best_workerid];
 
 		ntasks[best_workerid]++;
+		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_ctx->sched_mutex[best_workerid]);
 
 		ret = starpu_push_local_task(best_workerid, task, prio);
 	}
@@ -135,19 +133,19 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 			/* TODO */
 			alias->predicted_transfer = 0;
 
+			_STARPU_PTHREAD_MUTEX_LOCK(sched_ctx->sched_mutex[local_worker]);
 			worker_exp_len[local_worker] += alias->predicted;
 			worker_exp_end[local_worker] = exp_end_predicted;
 			worker_exp_start[local_worker] = exp_end_predicted - worker_exp_len[local_worker];
 
 			ntasks[local_worker]++;
+			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_ctx->sched_mutex[local_worker]);
 
 			ret |= starpu_push_local_task(local_worker, alias, prio);
 		}
 
 	}
 
-	_STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
-
 	return ret;
 }
 
@@ -245,10 +243,12 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 	{
 		worker = sched_ctx->workerids[worker_ctx];
 		/* Sometimes workers didn't take the tasks as early as we expected */
+		_STARPU_PTHREAD_MUTEX_LOCK(sched_ctx->sched_mutex[worker]);
 		worker_exp_start[worker] = STARPU_MAX(worker_exp_start[worker], starpu_timing_now());
 		worker_exp_end[worker] = worker_exp_start[worker] + worker_exp_len[worker];
 		if (worker_exp_end[worker] > max_exp_end)
 			max_exp_end = worker_exp_end[worker];
+		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_ctx->sched_mutex[worker]);
 	}
 
 	unsigned nimpl;
@@ -325,8 +325,7 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 		} //end for
 	}
 
-	if (unknown)
-	{
+	if (unknown) {
 		forced_best = ntasks_best;
 		forced_best_ctx = ntasks_best_ctx;
 		forced_nimpl = nimpl_best;
@@ -483,7 +482,6 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 		_STARPU_PTHREAD_COND_INIT(sched_ctx->sched_cond[workerid], NULL);
 	}
 
-	_STARPU_PTHREAD_MUTEX_INIT(&big_lock, NULL);
 
 	/* We pre-compute an array of all the perfmodel archs that are applicable */
 	unsigned total_worker_count = nworkers_ctx + ncombinedworkers;

+ 20 - 11
src/util/malloc.c

@@ -24,7 +24,7 @@
 #include <starpu_cuda.h>
 #include <drivers/opencl/driver_opencl.h>
 
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+#if (defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER))// || defined(STARPU_USE_OPENCL)
 struct malloc_pinned_codelet_struct
 {
 	void **ptr;
@@ -41,7 +41,7 @@ struct malloc_pinned_codelet_struct
 //}
 //#endif
 
-#ifdef STARPU_USE_CUDA
+#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER)
 static void malloc_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
 {
 	struct malloc_pinned_codelet_struct *s = arg;
@@ -53,7 +53,7 @@ static void malloc_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED,
 }
 #endif
 
-#if defined(STARPU_USE_CUDA)// || defined(STARPU_USE_OPENCL)
+#if (defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER))// || defined(STARPU_USE_OPENCL)
 static struct starpu_perfmodel malloc_pinned_model =
 {
 	.type = STARPU_HISTORY_BASED,
@@ -81,12 +81,14 @@ int starpu_malloc(void **A, size_t dim)
 	if (_starpu_can_submit_cuda_task())
 	{
 #ifdef STARPU_USE_CUDA
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		cudaError_t cures;
+		cures = cudaHostAlloc(A, dim, cudaHostAllocPortable);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+#else
 		int push_res;
 
-#ifdef STARPU_DEVEL
-#warning TODO: CUDA4 is able to directly allocate from any thread without having to launch a task
-#endif
-
 		struct malloc_pinned_codelet_struct s =
 		{
 			.ptr = A,
@@ -106,6 +108,7 @@ int starpu_malloc(void **A, size_t dim)
 		push_res = starpu_task_submit(task);
 		STARPU_ASSERT(push_res != -ENODEV);
 #endif
+#endif
 	}
 //	else if (_starpu_can_submit_opencl_task())
 //	{
@@ -142,7 +145,7 @@ int starpu_malloc(void **A, size_t dim)
 	return 0;
 }
 
-#ifdef STARPU_USE_CUDA
+#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER)
 static void free_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
 {
 	cudaError_t cures;
@@ -161,7 +164,7 @@ static void free_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, vo
 //}
 //#endif
 
-#if defined(STARPU_USE_CUDA) // || defined(STARPU_USE_OPENCL)
+#if (defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER)) // || defined(STARPU_USE_OPENCL)
 static struct starpu_perfmodel free_pinned_model =
 {
 	.type = STARPU_HISTORY_BASED,
@@ -185,16 +188,21 @@ int starpu_free(void *A)
 		return -EDEADLK;
 
 #ifdef STARPU_USE_CUDA
+	if (_starpu_can_submit_cuda_task())
+	{
+#ifndef HAVE_CUDA_MEMCPY_PEER
 	if (!_starpu_is_initialized())
 	{
+#endif
 		/* This is especially useful when starpu_free is called from
  		 * the GCC-plugin. starpu_shutdown will probably have already
 		 * been called, so we will not be able to submit a task. */
 		cudaError_t err = cudaFreeHost(A);
 		if (STARPU_UNLIKELY(err))
 			STARPU_CUDA_REPORT_ERROR(err);
+#ifndef HAVE_CUDA_MEMCPY_PEER
 	}
-	else if (_starpu_can_submit_cuda_task())
+	else
 	{
 		int push_res;
 
@@ -211,6 +219,7 @@ int starpu_free(void *A)
 		push_res = starpu_task_submit(task);
 		STARPU_ASSERT(push_res != -ENODEV);
 	}
+#endif
 //	else if (_starpu_can_submit_opencl_task())
 //	{
 //#ifdef STARPU_USE_OPENCL
@@ -230,7 +239,7 @@ int starpu_free(void *A)
 //		STARPU_ASSERT(push_res != -ENODEV);
 //#endif
 //	}
-	else
+	} else
 #endif
 	{
 		free(A);

+ 8 - 1
src/util/starpu_insert_task.c

@@ -78,5 +78,12 @@ int starpu_insert_task(struct starpu_codelet *cl, ...)
 
 	va_start(varg_list, cl);
         struct starpu_task *task = starpu_task_create();
-        return _starpu_insert_task_create_and_submit(arg_buffer, cl, &task, varg_list);
+	int ret = _starpu_insert_task_create_and_submit(arg_buffer, cl, &task, varg_list);
+
+	if (ret == -ENODEV)
+	{
+		task->destroy = 0;
+		starpu_task_destroy(task);
+	}
+        return ret;
 }

+ 2 - 2
tests/main/subgraph_repeat.c

@@ -62,19 +62,19 @@ static struct starpu_codelet dummy_codelet =
 
 static void callback_task_D(void *arg __attribute__((unused)))
 {
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	loop_cnt++;
 
 	if (loop_cnt == niter)
 	{
 		/* We are done */
-		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		_STARPU_PTHREAD_COND_SIGNAL(&cond);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	}
 	else
 	{
 		int ret;
-
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 		/* Let's go for another iteration */
 		ret = starpu_task_submit(&taskA); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		ret = starpu_task_submit(&taskB); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

+ 4 - 0
tests/overlap/overlap.c

@@ -25,7 +25,11 @@
 #include <pthread.h>
 #include "../helper.h"
 
+#ifdef STARPU_SLOW_MACHINE
+#define NTASKS	1000
+#else
 #define NTASKS	10000
+#endif
 #define VECTORSIZE	1024
 #define TASKDURATION	24U
 

+ 0 - 37
tools/dev/check_register.sh

@@ -1,37 +0,0 @@
-#!/bin/bash
-
-# StarPU --- Runtime system for heterogeneous multicore architectures.
-#
-# Copyright (C) 2011  Centre National de la Recherche Scientifique
-#
-# StarPU is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation; either version 2.1 of the License, or (at
-# your option) any later version.
-#
-# StarPU is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
-
-stcolor=$(tput sgr0)
-datacolor=$(tput setaf 2)
-filecolor=$(tput setaf 1)
-
-process_file()
-{
-    datas=$(grep "data_register(" $f| awk -F',' '{print $1}' | awk -F'(' '{print $2}' | tr -d '&' | sed 's/\[/\\\[/g' | sed 's/\]/\\\]/g' | sed 's/\*/\\\*/g')
-    for data in $datas ; do
-	x=$(grep "data_unregister($data" $1)
-	if test "$x" == "" ; then
-	    x=$(grep "data_unregister_no_coherency($data" $1)
-	    if test "$x" == "" ; then
-		echo "Error. File <${filecolor}$1${stcolor}>. Handle <${datacolor}$data${stcolor}> is not unregistered"
-	    fi
-	fi
-    done
-}
-
-for f in $(find tests -type f -not -path "*svn*") ; do process_file $f ; done
-for f in $(find examples -type f -not -path "*svn*") ; do process_file $f ; done

+ 3 - 1
tools/dev/experimental/cuda_check_return_values.cocci

@@ -51,7 +51,9 @@ E@p = cuda_func(...);
 
 
 @initialize:python depends on report || org@
+from re import sub
 msg = "Ignoring the return value of %s."
+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
 
 @no_assignment@
 identifier cuda_func =~ "^cuda";
@@ -70,7 +72,7 @@ position no_assignment.p;
 p << no_assignment.p;
 func << no_assignment.cuda_func;
 @@
-coccilib.org.print_todo(p[0], msg % func)
+coccilib.org.print_todo(p[0], orgmsg % func)
 
 @depends on no_assignment && patch@
 identifier no_assignment.cuda_func;

+ 123 - 0
tools/dev/experimental/destroy_task_on_error.cocci

@@ -0,0 +1,123 @@
+/*
+ * StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * When the submission of a task fails, StarPU cannot destroy the task, even if
+ * the destroy flag is set. So we have to destroy it ourselves while handling
+ * the error.
+ *
+ * TODO: match if statements without braces.
+ */
+
+virtual context
+virtual org
+virtual patch
+virtual report
+
+@initialize:python depends on org || report@
+msg = "Warning: in %s(): "
+msg+= "\"%s\" should probably be destroyed in the body of the if statement"
+from re import sub
+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
+
+@r@
+local idexpression t;
+identifier err;
+identifier f;
+position p;
+@@
+f(...)
+{
+<+...
+(
+err = starpu_task_submit(t);
+|
+int err = starpu_task_submit(t);
+)
+if@p(
+(
+err == -ENODEV
+|
+err != 0
+|
+STARPU_UNLIKELY(err == -ENODEV)
+|
+STARPU_UNLIKELY(err != 0)
+)
+ )
+{
+... when != starpu_task_destroy(t);
+    when != exit(...);
+    when != STARPU_ASSERT(...);
+    when != return 77;
+}
+...+>
+}
+
+// Context mode.
+@depends on r && context@
+position r.p;
+@@
+*if@p(...) { ... }
+
+// Org mode.
+@script:python depends on r && org@
+p << r.p;
+t << r.t;
+f << r.f;
+@@
+coccilib.org.print_todo(p[0], orgmsg % (f,t))
+
+// Patch mode.
+// XXX: Instead of "..." we could use a statement list (statement list SS). But
+// it does not seem to work if there is a "return" statement in the body
+// of the if statement.
+// Using "..." makes the patch ugly, but this may be fixed in a future version
+// of spatch.
+@depends on r && patch@
+local idexpression r.t;
+position r.p;
+identifier r.f;
+@@
+f(...)
+{
+<+...
+if@p (...)
+(
+{
+...
++ t->destroy = 0;
++ starpu_task_destroy(t);
+return ...;
+}
+|
+{
+...
++ t->destroy = 0;
++ starpu_task_destroy(t);
+}
+)
+...+>
+}
+
+// Report mode.
+@script:python depends on r && report@
+p << r.p;
+t << r.t;
+f << r.f;
+@@
+coccilib.report.print_report(p[0], msg % (f,t))

+ 65 - 0
tools/dev/experimental/destroy_task_on_error_test.c

@@ -0,0 +1,65 @@
+/*
+ * StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+static void
+good_0(void)
+{
+	struct starpu_task *task;
+	task = starpu_task_create();
+	int ret = starpu_task_submit(task);
+	if (ret == -ENODEV)
+	{
+		fprintf(stderr, "fail\n");
+		starpu_task_destroy(task);
+	}
+}
+
+static void
+bad_0(void)
+{
+	struct starpu_task *task1, *task2;
+
+	task1 = starpu_task_create();
+	int ret = starpu_task_submit(task1);
+	if (ret == -ENODEV)
+	{
+		fprintf(stderr, "Fail\n");
+	}
+
+	task2 = starpu_task_create();
+	ret = starpu_task_submit(task2);
+	if (ret == -ENODEV)
+	{
+		fprintf(stderr, "Fail\n");
+	}
+}
+
+static void
+bad_unlikely(void)
+{
+	struct starpu_task *task;
+
+	task = starpu_task_create();
+
+	int ret = starpu_task_submit(task);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		error();
+		return 1;
+	}
+
+	starpu_task_destroy(task);
+}

+ 5 - 4
tools/dev/experimental/function_call_termination_condition.cocci

@@ -34,6 +34,8 @@ virtual report
 
 @initialize:python depends on report || org@
 msg="Function call in the termination condition of a for loop"
+from re import sub
+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
 
 @r@
 type t;
@@ -62,17 +64,16 @@ expression r.E1;
 @script:python depends on r && org@
 p << r.p;
 @@
-coccilib.org.print_todo(p[0], msg)
+coccilib.org.print_todo(p[0], orgmsg)
 
 @depends on r && patch@
-type r.t;
 expression r.E1, E2, E3;
 identifier r.it;
 position r.p;
 @@
 -for@p(it = E1; it < E3; E2) 
-+t max = E3;
-+for(it = E1; i < max; E2) 
++max = E3;
++for(it = E1; it < max; E2)
 {
 ...
 }

+ 3 - 1
tools/dev/experimental/name_codelets.cocci

@@ -31,6 +31,8 @@ virtual report
 
 @initialize:python depends on org || report@
 msg = "Warning: %s has no attribute name"
+from re import sub
+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
 
 @found@
 identifier cl;
@@ -61,7 +63,7 @@ position found.p;
 cl << found.cl;
 p << found.p;
 @@
-coccilib.org.print_todo(p[0], msg % cl)
+coccilib.org.print_todo(p[0], orgmsg % cl)
 
 // Patch mode.
 @script:python stringify depends on found && !named && patch@

+ 3 - 1
tools/dev/experimental/not_unlocked_mutex.cocci

@@ -21,6 +21,8 @@ virtual report
 
 @initialize:python depends on report || org@
 msg="The mutex \"%s\" is not unlocked when leaving \"%s\""
+from re import sub
+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
 
 @r@
 expression E;
@@ -53,7 +55,7 @@ f << r.func;
 E << r.E;
 @@
 for p in ps:
-	coccilib.org.print_todo(p, msg % (E, f))
+	coccilib.org.print_todo(p, orgmsg % (E, f))
 
 
 @depends on r && patch@

+ 1 - 1
tools/dev/experimental/opencl_check_return_values.cocci

@@ -51,5 +51,5 @@ coccilib.report.print_report(p[0], msg)
 p << ignored_return_value.p;
 func << ignored_return_value.opencl_func;
 @@
-msg = "Ignoring the return value of %s." % func
+msg = "Ignoring the return value of =%s=." % func
 coccilib.org.print_todo(p[0], msg)

+ 3 - 1
tools/dev/experimental/skip_valgrind.cocci

@@ -21,6 +21,8 @@ virtual report
 
 @initialize:python depends on report || org@
 msg="Should you add STARPU_SKIP_IF_VALGRIND; at the beginning of this function ?"
+from re import sub
+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
 
 @find_codelet@
 identifier a, b;
@@ -66,7 +68,7 @@ position find_codelet.p;
 @script:python depends on find_codelet && !is_empty_codelet && !is_already_ok && org@
 p << find_codelet.p;
 @@
-coccilib.org.print_todo(p[0], msg)
+coccilib.org.print_todo(p[0], orgmsg)
 
 @depends on find_codelet && !is_empty_codelet && !is_already_ok && patch@
 identifier find_codelet.a, find_codelet.b;

+ 3 - 1
tools/dev/experimental/unchecked_starpu_function_calls.cocci

@@ -27,6 +27,8 @@ virtual report
 
 @initialize:python depends on report || org@
 msg = "Unchecked call to %s"
+from re import sub
+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
 
 @unchecked_starpu_func_call@
 identifier f;
@@ -59,7 +61,7 @@ f(...)
 f << unchecked_starpu_func_call.starpu_function;
 p << unchecked_starpu_func_call.p;
 @@
-coccilib.org.print_todo(p[0], msg % f)
+coccilib.org.print_todo(p[0], orgmsg % f)
 
 // Patch mode.
 @has_ret depends on unchecked_starpu_func_call@

+ 3 - 1
tools/dev/experimental/use_starpu_macros.cocci

@@ -24,6 +24,8 @@ virtual report
 @initialize:python depends on report || org@
 d = { 'abort':'STARPU_ABORT', 'assert':'STARPU_ASSERT'}
 msg = "Please use %s rather than %s."
+from re import sub
+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
 
 @r@
 identifier f =~ "abort|assert";
@@ -88,7 +90,7 @@ expression E1, E2;
 p << r.p;
 f << r.f;
 @@
-coccilib.org.print_todo(p[0], msg % (d[str(f)], f))
+coccilib.org.print_todo(p[0], orgmsg % (d[str(f)], f))
 
 @script:python depends on min && org@
 p << min.p;

+ 4 - 2
tools/dev/experimental/use_starpu_pthread_macros.cocci

@@ -43,6 +43,8 @@ d = {
 'pthread_spin_unlock'     : '_STARPU_PTHREAD_SPIN_UNLOCK'
 }
 msg = "Use %s instead of %s."
+from re import sub
+orgmsg = sub(r'(%[a-z])', r'=\1=', msg)
 
 @r@
 identifier f =~ "^pthread_";
@@ -66,9 +68,9 @@ p << r.p;
 f << r.f;
 @@
 if str(f) in d.keys():
-	coccilib.org.print_todo(p[0], msg % (d[str(f)], f))
+	coccilib.org.print_todo(p[0], orgmsg % (d[str(f)], f))
 else:
-	coccilib.org.print_todo(p[0], "Shouldn't %s be wrapped in a macro ?" % str(f))
+	coccilib.org.print_todo(p[0], "Shouldn't =%s= be wrapped in a macro ?" % str(f))
 
 
 //

tools/dev/check_unrenamed_list_types.sh → tools/dev/internal/check_unrenamed_list_types.sh


tools/dev/rename_internal.sed → tools/dev/internal/rename_internal.sed


tools/dev/rename_internal.sh → tools/dev/internal/rename_internal.sh


+ 1 - 1
tools/dev/mycocci.sh

@@ -121,7 +121,7 @@ do
 		scripts_dir=$OPTARG;
 		;;
 	t)
-		target=$OPTARG;
+		target="$target $OPTARG";
 		;;
 	\?)
 		echo "Invalid option -$OPTARG"

+ 0 - 15
tools/dev/starpu_use_macro.sed

@@ -1,15 +0,0 @@
-# StarPU --- Runtime system for heterogeneous multicore architectures.
-#
-# Copyright (C) 2012 INRIA
-#
-# StarPU is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation; either version 2.1 of the License, or (at
-# your option) any later version.
-#
-# StarPU is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
-s/^#if STARPU_USE_/#ifdef STARPU_USE_/

+ 1 - 1
tools/dev/starpu_use_macro.sh

@@ -15,4 +15,4 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-find . -type f -name "*.c" -not -path "*svn*" -exec sed -i -f $(dirname $0)/starpu_use_macro.sed {} \;
+find . -type f -name "*.c" -not -path "*svn*" -exec sed -i 's/^#if STARPU_USE_/#ifdef STARPU_USE_/' {} \;

+ 23 - 1
tools/valgrind/starpu.suppr

@@ -2,7 +2,7 @@
    config.running is not racy from starpu_shutdown
    Helgrind:Race
    fun:starpu_shutdown
-   fun:main
+   ...
 }
 
 {
@@ -25,3 +25,25 @@
    fun:_starpu_msi_cache_miss
    ...
 }
+
+{
+   known race, but not problematic in practice, see comment in _starpu_tag_clear
+   Helgrind:LockOrder
+   ...
+   fun:_starpu_tag_free
+   fun:_starpu_htbl_clear_tags
+   ...
+   fun:_starpu_tag_clear
+   fun:starpu_shutdown
+   ...
+}
+
+
+{
+   There is actually no race on current_mode, because the mode can not change unexpectedly, until _starpu_notify_data_dependencies() is called further down. Valgrind can not know about such software rwlock.
+   Helgrind:Race
+   fun:_starpu_release_data_on_node
+   fun:_starpu_push_task_output
+   ...
+}
+