7 anni fa · 7020dfa060
--- a/doc/doxygen/chapters/110_basic_examples.doxy
+++ b/doc/doxygen/chapters/110_basic_examples.doxy
@@ -307,7 +307,7 @@ struct starpu_codelet cl =
 
				     .where = STARPU_CPU,
			
 
				     .cpu_funcs = { cpu_func },
			
 
				     .cpu_funcs_name = { "cpu_func" },
			
 
				-     .nbuffers = 0
			
 
				+    .nbuffers = 0
			
 
				 };
			
 
				 \endcode
			
 
				 
			
@@ -338,13 +338,10 @@ has to be defined:
 
				 
			
 
				 \code{.c}
			
 
				 /* Declare the `vector_scal' task.  */
			
 
				-static void vector_scal (unsigned size, float vector[size],
			
 
				-                         float factor)
			
 
				-  __attribute__ ((task));
			
 
				+static void vector_scal (unsigned size, float vector[size], float factor) __attribute__ ((task));
			
 
				 
			
 
				 /* Define the standard CPU implementation.  */
			
 
				-static void
			
 
				-vector_scal (unsigned size, float vector[size], float factor)
			
 
				+static void vector_scal (unsigned size, float vector[size], float factor)
			
 
				 {
			
 
				   unsigned i;
			
 
				   for (i = 0; i < size; i++)
			
@@ -415,12 +412,10 @@ in our C file like this:
 
				 /* The OpenCL programs, loaded from 'main' (see below). */
			
 
				 static struct starpu_opencl_program cl_programs;
			
 
				 
			
 
				-static void vector_scal_opencl (unsigned size, float vector[size],
			
 
				-                                float factor)
			
 
				+static void vector_scal_opencl (unsigned size, float vector[size], float factor)
			
 
				   __attribute__ ((task_implementation ("opencl", vector_scal)));
			
 
				 
			
 
				-static void
			
 
				-vector_scal_opencl (unsigned size, float vector[size], float factor)
			
 
				+static void vector_scal_opencl (unsigned size, float vector[size], float factor)
			
 
				 {
			
 
				   int id, devid, err;
			
 
				   cl_kernel kernel;
			
@@ -434,22 +429,17 @@ vector_scal_opencl (unsigned size, float vector[size], float factor)
 
				   devid = starpu_worker_get_devid (id);
			
 
				 
			
 
				   /* Prepare to invoke the kernel.  In the future, this will be largely automated.  */
			
 
				-  err = starpu_opencl_load_kernel (&kernel, &queue, &cl_programs,
			
 
				-                                   "vector_mult_opencl", devid);
			
 
				-  if (err != CL_SUCCESS)
			
 
				-    STARPU_OPENCL_REPORT_ERROR (err);
			
 
				+  err = starpu_opencl_load_kernel (&kernel, &queue, &cl_programs, "vector_mult_opencl", devid);
			
 
				+  if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR (err);
			
 
				 
			
 
				   err = clSetKernelArg (kernel, 0, sizeof (size), &size);
			
 
				   err |= clSetKernelArg (kernel, 1, sizeof (val), &val);
			
 
				   err |= clSetKernelArg (kernel, 2, sizeof (factor), &factor);
			
 
				-  if (err)
			
 
				-    STARPU_OPENCL_REPORT_ERROR (err);
			
 
				+  if (err) STARPU_OPENCL_REPORT_ERROR (err);
			
 
				 
			
 
				   size_t global = 1, local = 1;
			
 
				-  err = clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &global,
			
 
				-                                &local, 0, NULL, &event);
			
 
				-  if (err != CL_SUCCESS)
			
 
				-    STARPU_OPENCL_REPORT_ERROR (err);
			
 
				+  err = clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
			
 
				+  if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR (err);
			
 
				 
			
 
				   clFinish (queue);
			
 
				   starpu_opencl_collect_stats (event);
			
@@ -464,8 +454,7 @@ The OpenCL kernel itself must be loaded from <c>main</c>, sometime after
 
				 the pragma <c>initialize</c>:
			
 
				 
			
 
				 \code{.c}
			
 
				-starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl",
			
 
				-                                       &cl_programs, "");
			
 
				+starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl", &cl_programs, "");
			
 
				 \endcode
			
 
				 
			
 
				 And that's it.  The task <c>vector_scal</c> now has an additional
			
@@ -482,8 +471,7 @@ with <c>nvcc</c>.  Thus, the C file only needs to contain an external
 
				 declaration for the task implementation:
			
 
				 
			
 
				 \code{.c}
			
 
				-extern void vector_scal_cuda (unsigned size, float vector[size],
			
 
				-                              float factor)
			
 
				+extern void vector_scal_cuda (unsigned size, float vector[size], float factor)
			
 
				   __attribute__ ((task_implementation ("cuda", vector_scal)));
			
 
				 \endcode
			
 
				 
			
@@ -532,8 +520,7 @@ The following lines show how to declare an array of <c>NX</c> elements of type
 
				 float vector[NX];
			
 
				 
			
 
				 starpu_data_handle_t vector_handle;
			
 
				-starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX,
			
 
				-                            sizeof(vector[0]));
			
 
				+starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
			
 
				 \endcode
			
 
				 
			
 
				 The first argument, called the <b>data handle</b>, is an opaque pointer which
			
--- a/doc/doxygen/chapters/210_check_list_performance.doxy
+++ b/doc/doxygen/chapters/210_check_list_performance.doxy
@@ -408,9 +408,7 @@ void feed(void)
 
				         starpu_data_handle_t handle;
			
 
				 	starpu_vector_data_register(&handle, -1, 0, measure->size, sizeof(float));
			
 
				 	task.handles[0] = handle;
			
 
				-	starpu_perfmodel_update_history(&perf_model, &task,
			
 
				-	                                STARPU_CUDA_DEFAULT + measure->cudadev, 0,
			
 
				-	                                measure->implementation, measure->time);
			
 
				+	starpu_perfmodel_update_history(&perf_model, &task, STARPU_CUDA_DEFAULT + measure->cudadev, 0, measure->implementation, measure->time);
			
 
				 	starpu_task_clean(&task);
			
 
				 	starpu_data_unregister(handle);
			
 
				     }
			
--- a/doc/doxygen/chapters/301_tasks.doxy
+++ b/doc/doxygen/chapters/301_tasks.doxy
@@ -98,9 +98,9 @@ for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
 
				 	handles[i] = handle;
			
 
				 }
			
 
				 starpu_task_insert(&dummy_big_cl,
			
 
				-        	 STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
			
 
				-		 STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
			
 
				-		 0);
			
 
				+         	  STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
			
 
				+		  STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
			
 
				+		  0);
			
 
				 \endcode
			
 
				 
			
 
				 The whole code for this complex data interface is available in the
			
@@ -355,9 +355,7 @@ starpu_task_insert(&which_index, STARPU_W, i_handle, 0);
 
				 
			
 
				 /* And submit the corresponding task */
			
 
				 STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
			
 
				-                       starpu_task_insert(&work,
			
 
				-		                          STARPU_RW, A_handle[i],
			
 
				-					  0));
			
 
				+                       starpu_task_insert(&work, STARPU_RW, A_handle[i], 0));
			
 
				 \endcode
			
 
				 
			
 
				 The macro ::STARPU_DATA_ACQUIRE_CB submits an asynchronous request for
			
--- a/doc/doxygen/chapters/310_data_management.doxy
+++ b/doc/doxygen/chapters/310_data_management.doxy
@@ -261,8 +261,7 @@ int vector[NX];
 
				 starpu_data_handle_t handle;
			
 
				 
			
 
				 /* Declare data to StarPU */
			
 
				-starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector,
			
 
				-                            NX, sizeof(vector[0]));
			
 
				+starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
			
 
				 
			
 
				 /* Partition the vector in PARTS sub-vectors */
			
 
				 struct starpu_data_filter f =
			
@@ -428,8 +427,8 @@ struct starpu_codelet cl_switch =
 
				 };
			
 
				 
			
 
				 ret = starpu_task_insert(&cl_switch, STARPU_RW, handle,
			
 
				-			STARPU_W, vert_handle[0], 
			
 
				-			STARPU_W, vert_handle[1], 
			
 
				+			STARPU_W, vert_handle[0],
			
 
				+			STARPU_W, vert_handle[1],
			
 
				 			0);
			
 
				 \endcode
			
 
				 
			
@@ -510,8 +509,7 @@ and attaches them as reduction methods for its handle <c>dtq</c>:
 
				 
			
 
				 \code{.c}
			
 
				 starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
			
 
				-starpu_data_set_reduction_methods(dtq_handle,
			
 
				-        &accumulate_variable_cl, &bzero_variable_cl);
			
 
				+starpu_data_set_reduction_methods(dtq_handle, &accumulate_variable_cl, &bzero_variable_cl);
			
 
				 \endcode
			
 
				 
			
 
				 and <c>dtq_handle</c> can now be used in mode ::STARPU_REDUX for the
			
@@ -554,8 +552,7 @@ with a new reduction:
 
				 for (i = 0; i < 100; i++)
			
 
				 {
			
 
				     starpu_mpi_task_insert(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
			
 
				-    starpu_mpi_task_insert(MPI_COMM_WORLD, &work, STARPU_RW, A,
			
 
				-               STARPU_R, B, STARPU_REDUX, res, 0);
			
 
				+    starpu_mpi_task_insert(MPI_COMM_WORLD, &work, STARPU_RW, A, STARPU_R, B, STARPU_REDUX, res, 0);
			
 
				     starpu_mpi_redux_data(MPI_COMM_WORLD, res);
			
 
				     starpu_mpi_task_insert(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
			
 
				 }
			
@@ -573,22 +570,10 @@ the ::STARPU_COMMUTE data access flag. Accesses without this flag will however
 
				 properly be serialized against accesses with this flag. For instance:
			
 
				 
			
 
				 \code{.c}
			
 
				-    starpu_task_insert(&cl1,
			
 
				-        STARPU_R, h,
			
 
				-        STARPU_RW, handle,
			
 
				-        0);
			
 
				-    starpu_task_insert(&cl2,
			
 
				-        STARPU_R, handle1,
			
 
				-        STARPU_RW|STARPU_COMMUTE, handle,
			
 
				-        0);
			
 
				-    starpu_task_insert(&cl2,
			
 
				-        STARPU_R, handle2,
			
 
				-        STARPU_RW|STARPU_COMMUTE, handle,
			
 
				-        0);
			
 
				-    starpu_task_insert(&cl3,
			
 
				-        STARPU_R, g,
			
 
				-        STARPU_RW, handle,
			
 
				-        0);
			
 
				+    starpu_task_insert(&cl1, STARPU_R, h, STARPU_RW, handle, 0);
			
 
				+    starpu_task_insert(&cl2, STARPU_R, handle1, STARPU_RW|STARPU_COMMUTE, handle, 0);
			
 
				+    starpu_task_insert(&cl2, STARPU_R, handle2, STARPU_RW|STARPU_COMMUTE, handle, 0);
			
 
				+    starpu_task_insert(&cl3, STARPU_R, g, STARPU_RW, handle, 0);
			
 
				 \endcode
			
 
				 
			
 
				 The two tasks running <c>cl2</c> will be able to commute: depending on whether the
			
@@ -680,8 +665,7 @@ memory for it, and StarPU will allocate it on demand at task execution.
 
				 \code{.c}
			
 
				 starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
			
 
				 for (i = 0; i < N; i++)
			
 
				-    starpu_task_insert(&compute, STARPU_R, input[i],
			
 
				-                       STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
			
 
				+    starpu_task_insert(&compute, STARPU_R, input[i], STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
			
 
				 \endcode
			
 
				 
			
 
				 StarPU will make sure that the buffer is allocated before executing the task,
			
--- a/doc/doxygen/chapters/330_scheduling_contexts.doxy
+++ b/doc/doxygen/chapters/330_scheduling_contexts.doxy
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2011-2013,2016                           Inria
			
 
				- * Copyright (C) 2010-2017                                CNRS
			
 
				+ * Copyright (C) 2010-2018                                CNRS
			
 
				  * Copyright (C) 2009-2011,2014                           Université de Bordeaux
			
 
				  * Copyright (C) 2016                                     Uppsala University
			
 
				  *
			
@@ -113,7 +113,7 @@ The contexts can also be used to group set of SMs of an NVIDIA GPU in order to i
 
				 the parallel kernels and allow them to co-execute on a specified partition of the GPU.
			
 
				 
			
 
				 Each context will be mapped to a stream and the user can indicate the number of SMs.
			
 
				-The context can be added to a larger context already grouping CPU cores. 
			
 
				+The context can be added to a larger context already grouping CPU cores.
			
 
				 This larger context can use a scheduling policy that assigns tasks to both CPUs and contexts (partitions of the GPU)
			
 
				 based on performance models adjusted to the number of SMs.
			
 
				 
			
@@ -132,7 +132,7 @@ int workers[ncpus+nstreams];
 
				 workers[ncpus+0] = stream_workerids[0];
			
 
				 workers[ncpus+1] = stream_workerids[1];
			
 
				 
			
 
				-big_sched_ctx = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0); 
			
 
				+big_sched_ctx = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0);
			
 
				 
			
 
				 starpu_task_submit_to_ctx(task, big_sched_ctx);
			
 
				 
			
@@ -160,25 +160,25 @@ starpu_sched_ctx_remove_workers(workerids, 3, sched_ctx1);
 
				 \endcode
			
 
				 
			
 
				 \section SubmittingTasksToAContext Submitting Tasks To A Context
			
 
				-The application may submit tasks to several contexts either 
			
 
				+The application may submit tasks to several contexts either
			
 
				 simultaneously or sequentially. If several threads of submission
			
 
				 are used the function starpu_sched_ctx_set_context() may be called just
			
 
				-before starpu_task_submit(). Thus StarPU considers that 
			
 
				+before starpu_task_submit(). Thus StarPU considers that
			
 
				 the current thread will submit tasks to the corresponding context.
			
 
				- 
			
 
				+
			
 
				 When the application may not assign a thread of submission to each
			
 
				 context, the id of the context must be indicated by using the
			
 
				-function starpu_task_submit_to_ctx() or the field \ref STARPU_SCHED_CTX 
			
 
				+function starpu_task_submit_to_ctx() or the field \ref STARPU_SCHED_CTX
			
 
				 for starpu_task_insert().
			
 
				 
			
 
				 \section DeletingAContext Deleting A Context
			
 
				 
			
 
				 When a context is no longer needed it must be deleted. The application
			
 
				 can indicate which context should keep the resources of a deleted one.
			
 
				-All the tasks of the context should be executed before doing this. 
			
 
				-Thus, the programmer may use either a barrier and then delete the context 
			
 
				+All the tasks of the context should be executed before doing this.
			
 
				+Thus, the programmer may use either a barrier and then delete the context
			
 
				 directly, or just indicate
			
 
				-that other tasks will not be submitted later on to the context (such that when 
			
 
				+that other tasks will not be submitted later on to the context (such that when
			
 
				 the last task is executed its workers will be moved to the inheritor)
			
 
				 and delete the context at the end of the execution (when a barrier will
			
 
				 be used eventually).
			
@@ -212,8 +212,8 @@ A context may have no resources at the begining or at a certain
 
				 moment of the execution. Task can still be submitted to these contexts
			
 
				 and they will be executed as soon as the contexts will have resources. A list
			
 
				 of tasks pending to be executed is kept and when workers are added to
			
 
				-the contexts these tasks start being submitted. However, if resources 
			
 
				-are never allocated to the context the program will not terminate. 
			
 
				+the contexts these tasks start being submitted. However, if resources
			
 
				+are never allocated to the context the program will not terminate.
			
 
				 If these tasks have low
			
 
				 priority the programmer can forbid the application to submit them
			
 
				 by calling the function starpu_sched_ctx_stop_task_submission().
			
--- a/doc/doxygen/chapters/350_modularized_scheduler.doxy
+++ b/doc/doxygen/chapters/350_modularized_scheduler.doxy
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2013                                     Inria
			
 
				- * Copyright (C) 2014,2016-2017                           CNRS
			
 
				+ * Copyright (C) 2014,2016-2018                           CNRS
			
 
				  * Copyright (C) 2014,2017                                Université de Bordeaux
			
 
				  * Copyright (C) 2013                                     Simon Archipoff
			
 
				  *
			
@@ -204,23 +204,19 @@ static void initialize_eager_prefetching_center_policy(unsigned sched_ctx_id)
 
				     (sched_ctx_id, STARPU_WORKER_LIST);
			
 
				 
			
 
				   /* Create the Scheduling Tree */
			
 
				-  struct starpu_sched_tree * t =
			
 
				-    starpu_sched_tree_create(sched_ctx_id);
			
 
				+  struct starpu_sched_tree * t = starpu_sched_tree_create(sched_ctx_id);
			
 
				 
			
 
				   /* The Root Component is a Flow-control Fifo Component */
			
 
				    t->root = starpu_sched_component_fifo_create(NULL);
			
 
				 
			
 
				   /* The Resource-mapping Component of the strategy is an Eager Component
			
 
				    */
			
 
				-  struct starpu_sched_component * eager_component =
			
 
				-    starpu_sched_component_eager_create(NULL);
			
 
				+  struct starpu_sched_component *eager_component = starpu_sched_component_eager_create(NULL);
			
 
				 
			
 
				   /* Create links between Components : the Eager Component is the child
			
 
				    * of the Root Component */
			
 
				-  t->root->add_child
			
 
				-    (t->root, eager_component);
			
 
				-  eager_component->add_father
			
 
				-    (eager_component, t->root);
			
 
				+  t->root->add_child(t->root, eager_component);
			
 
				+  eager_component->add_father(eager_component, t->root);
			
 
				 
			
 
				   /* A task threshold is set for the Flow-control Components which will
			
 
				    * be connected to Worker Components. By doing so, this Modularized
			
@@ -233,44 +229,32 @@ static void initialize_eager_prefetching_center_policy(unsigned sched_ctx_id)
 
				   };
			
 
				 
			
 
				   unsigned i;
			
 
				-  for(i = 0;
			
 
				-    i < starpu_worker_get_count() +
			
 
				-    starpu_combined_worker_get_count();
			
 
				-    i++)
			
 
				+  for(i = 0; i < starpu_worker_get_count() + starpu_combined_worker_get_count(); i++)
			
 
				   {
			
 
				     /* Each Worker Component has a Flow-control Fifo Component as
			
 
				      * father */
			
 
				-    struct starpu_sched_component * worker_component =
			
 
				-	  starpu_sched_component_worker_new(i);
			
 
				-    struct starpu_sched_component * fifo_component =
			
 
				-	  starpu_sched_component_fifo_create(&fifo_data);
			
 
				-    fifo_component->add_child
			
 
				-      (fifo_component, worker_component);
			
 
				-    worker_component->add_father
			
 
				-      (worker_component, fifo_component);
			
 
				+    struct starpu_sched_component * worker_component = starpu_sched_component_worker_new(i);
			
 
				+    struct starpu_sched_component * fifo_component = starpu_sched_component_fifo_create(&fifo_data);
			
 
				+    fifo_component->add_child(fifo_component, worker_component);
			
 
				+    worker_component->add_father(worker_component, fifo_component);
			
 
				 
			
 
				     /* Each Flow-control Fifo Component associated to a Worker
			
 
				      * Component is linked to the Eager Component as one of its
			
 
				      * children */
			
 
				-    eager_component->add_child
			
 
				-      (eager_component, fifo_component);
			
 
				-    fifo_component->add_father
			
 
				-      (fifo_component, eager_component);
			
 
				+    eager_component->add_child(eager_component, fifo_component);
			
 
				+    fifo_component->add_father(fifo_component, eager_component);
			
 
				   }
			
 
				 
			
 
				   starpu_sched_tree_update_workers(t);
			
 
				-  starpu_sched_ctx_set_policy_data
			
 
				-    (sched_ctx_id, (void*)t);
			
 
				+  starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)t);
			
 
				 }
			
 
				 
			
 
				 /* Properly destroy the Scheduling Tree and all its Components */
			
 
				 static void deinitialize_eager_prefetching_center_policy(unsigned sched_ctx_id)
			
 
				 {
			
 
				-  struct starpu_sched_tree * tree =
			
 
				-  	(struct starpu_sched_tree*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
			
 
				+  struct starpu_sched_tree * tree = (struct starpu_sched_tree*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
			
 
				   starpu_sched_tree_destroy(tree);
			
 
				-  starpu_sched_ctx_delete_worker_collection
			
 
				-    (sched_ctx_id);
			
 
				+  starpu_sched_ctx_delete_worker_collection(sched_ctx_id);
			
 
				 }
			
 
				 
			
 
				 /* Initializing the starpu_sched_policy struct associated to the Modularized
			
--- a/doc/doxygen/chapters/370_online_performance_tools.doxy
+++ b/doc/doxygen/chapters/370_online_performance_tools.doxy
@@ -1,8 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2017                                CNRS
			
 
				+ * Copyright (C) 2010-2018                                CNRS
			
 
				  * Copyright (C) 2011-2012,2016                           Inria
			
 
				- * Copyright (C) 2009-2011,2014,2016, 2018                      Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2011,2014,2016, 2018                Université de Bordeaux
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -313,12 +313,9 @@ for (worker = 0; worker < starpu_worker_get_count(); worker++)
 
				         starpu_worker_get_name(worker, workername, 128);
			
 
				         fprintf(stderr, "Worker %s:\n", workername);
			
 
				         fprintf(stderr, "\ttotal time: %.2lf ms\n", total_time*1e-3);
			
 
				-        fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n",
			
 
				-                executing_time*1e-3, executing_ratio);
			
 
				-        fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n",
			
 
				-                sleeping_time*1e-3, sleeping_ratio);
			
 
				-        fprintf(stderr, "\toverhead time: %.2lf ms (%.2f %%)\n",
			
 
				-                overhead_time*1e-3, overhead_ratio);
			
 
				+        fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n", executing_time*1e-3, executing_ratio);
			
 
				+        fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n", sleeping_time*1e-3, sleeping_ratio);
			
 
				+        fprintf(stderr, "\toverhead time: %.2lf ms (%.2f %%)\n", overhead_time*1e-3, overhead_ratio);
			
 
				 }
			
 
				 \endcode
			
 
				 
			
--- a/doc/doxygen/chapters/390_faq.doxy
+++ b/doc/doxygen/chapters/390_faq.doxy
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2017                                CNRS
			
 
				+ * Copyright (C) 2010-2018                                CNRS
			
 
				  * Copyright (C) 2009-2011,2014,2016-2017                 Université de Bordeaux
			
 
				  * Copyright (C) 2011-2012                                Inria
			
 
				  *
			
@@ -159,8 +159,7 @@ for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
 
				 cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
			
 
				 
			
 
				 /* And register it to StarPU */
			
 
				-starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
			
 
				-                            output, num_bytes / sizeof(float4), sizeof(float4));
			
 
				+starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid), output, num_bytes / sizeof(float4), sizeof(float4));
			
 
				 
			
 
				 /* The handle can now be used as usual */
			
 
				 starpu_task_insert(&cl, STARPU_RW, handle, 0);
			
--- a/doc/doxygen/chapters/401_out_of_core.doxy
+++ b/doc/doxygen/chapters/401_out_of_core.doxy
@@ -1,8 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2013-2014,2016-2017                      CNRS
			
 
				+ * Copyright (C) 2013-2014,2016-2018                      CNRS
			
 
				  * Copyright (C) 2013                                     Inria
			
 
				- * Copyright (C) 2013-2014,2017-2018                           Université de Bordeaux
			
 
				+ * Copyright (C) 2013-2014,2017-2018                      Université de Bordeaux
			
 
				  * Copyright (C) 2013                                     Corentin Salingue
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -51,7 +51,7 @@ automatically read and write data as appropriate.
 
				 To use a disk memory node, you have to register it with this function:
			
 
				 
			
 
				 \code{.c}
			
 
				-	int new_dd = starpu_disk_register(&starpu_disk_unistd_ops, (void *) "/tmp/", 1024*1024*200);
			
 
				+int new_dd = starpu_disk_register(&starpu_disk_unistd_ops, (void *) "/tmp/", 1024*1024*200);
			
 
				 \endcode
			
 
				 
			
 
				 Here, we use the unistd library to realize the read/write operations, i.e.
			
--- a/doc/doxygen/chapters/410_mpi_support.doxy
+++ b/doc/doxygen/chapters/410_mpi_support.doxy
@@ -38,31 +38,31 @@ initializes a token on node 0, and the token is passed from node to node,
 
				 incremented by one on each step. The code is not using StarPU yet.
			
 
				 
			
 
				 \code{.c}
			
 
				-    for (loop = 0; loop < nloops; loop++)
			
 
				-    {
			
 
				-        int tag = loop*size + rank;
			
 
				+for (loop = 0; loop < nloops; loop++)
			
 
				+{
			
 
				+    int tag = loop*size + rank;
			
 
				 
			
 
				-        if (loop == 0 && rank == 0)
			
 
				-        {
			
 
				-            token = 0;
			
 
				-            fprintf(stdout, "Start with token value %d\n", token);
			
 
				-        }
			
 
				-        else
			
 
				-        {
			
 
				-            MPI_Recv(&token, 1, MPI_INT, (rank+size-1)%size, tag, MPI_COMM_WORLD);
			
 
				-        }
			
 
				+    if (loop == 0 && rank == 0)
			
 
				+    {
			
 
				+        token = 0;
			
 
				+        fprintf(stdout, "Start with token value %d\n", token);
			
 
				+    }
			
 
				+    else
			
 
				+    {
			
 
				+        MPI_Recv(&token, 1, MPI_INT, (rank+size-1)%size, tag, MPI_COMM_WORLD);
			
 
				+    }
			
 
				 
			
 
				-        token++;
			
 
				+    token++;
			
 
				 
			
 
				-        if (loop == last_loop && rank == last_rank)
			
 
				-        {
			
 
				-            fprintf(stdout, "Finished: token value %d\n", token);
			
 
				-        }
			
 
				-        else
			
 
				-        {
			
 
				-            MPI_Send(&token, 1, MPI_INT, (rank+1)%size, tag+1, MPI_COMM_WORLD);
			
 
				-        }
			
 
				+    if (loop == last_loop && rank == last_rank)
			
 
				+    {
			
 
				+        fprintf(stdout, "Finished: token value %d\n", token);
			
 
				     }
			
 
				+    else
			
 
				+    {
			
 
				+        MPI_Send(&token, 1, MPI_INT, (rank+1)%size, tag+1, MPI_COMM_WORLD);
			
 
				+    }
			
 
				+}
			
 
				 \endcode
			
 
				 
			
 
				 \section NotUsingMPISupport About not using the MPI support
			
@@ -73,39 +73,39 @@ execution to StarPU.  This is possible by just using starpu_data_acquire(), for
 
				 instance:
			
 
				 
			
 
				 \code{.c}
			
 
				-    for (loop = 0; loop < nloops; loop++)
			
 
				-    {
			
 
				-        int tag = loop*size + rank;
			
 
				+for (loop = 0; loop < nloops; loop++)
			
 
				+{
			
 
				+    int tag = loop*size + rank;
			
 
				 
			
 
				-	/* Acquire the data to be able to write to it */
			
 
				-	starpu_data_acquire(token_handle, STARPU_W);
			
 
				-        if (loop == 0 && rank == 0)
			
 
				-        {
			
 
				-            token = 0;
			
 
				-            fprintf(stdout, "Start with token value %d\n", token);
			
 
				-        }
			
 
				-        else
			
 
				-        {
			
 
				-            MPI_Recv(&token, 1, MPI_INT, (rank+size-1)%size, tag, MPI_COMM_WORLD);
			
 
				-        }
			
 
				+    /* Acquire the data to be able to write to it */
			
 
				+    starpu_data_acquire(token_handle, STARPU_W);
			
 
				+    if (loop == 0 && rank == 0)
			
 
				+    {
			
 
				+        token = 0;
			
 
				+        fprintf(stdout, "Start with token value %d\n", token);
			
 
				+    }
			
 
				+    else
			
 
				+    {
			
 
				+        MPI_Recv(&token, 1, MPI_INT, (rank+size-1)%size, tag, MPI_COMM_WORLD);
			
 
				+    }
			
 
				 	starpu_data_release(token_handle);
			
 
				 
			
 
				-        /* Task delegation to StarPU to increment the token. The execution might
			
 
				-         * be performed on a CPU, a GPU, etc. */
			
 
				-        increment_token();
			
 
				+    /* Task delegation to StarPU to increment the token. The execution might
			
 
				+     * be performed on a CPU, a GPU, etc. */
			
 
				+    increment_token();
			
 
				 
			
 
				-	/* Acquire the update data to be able to read from it */
			
 
				-	starpu_data_acquire(token_handle, STARPU_R);
			
 
				-        if (loop == last_loop && rank == last_rank)
			
 
				-        {
			
 
				-            fprintf(stdout, "Finished: token value %d\n", token);
			
 
				-        }
			
 
				-        else
			
 
				-        {
			
 
				-            MPI_Send(&token, 1, MPI_INT, (rank+1)%size, tag+1, MPI_COMM_WORLD);
			
 
				-        }
			
 
				-	starpu_data_release(token_handle);
			
 
				+    /* Acquire the update data to be able to read from it */
			
 
				+    starpu_data_acquire(token_handle, STARPU_R);
			
 
				+    if (loop == last_loop && rank == last_rank)
			
 
				+    {
			
 
				+        fprintf(stdout, "Finished: token value %d\n", token);
			
 
				     }
			
 
				+    else
			
 
				+    {
			
 
				+        MPI_Send(&token, 1, MPI_INT, (rank+1)%size, tag+1, MPI_COMM_WORLD);
			
 
				+    }
			
 
				+	starpu_data_release(token_handle);
			
 
				+}
			
 
				 \endcode
			
 
				 
			
 
				 In that case, <c>libstarpumpi</c> is not needed. One can also use <c>MPI_Isend()</c> and
			
@@ -167,8 +167,7 @@ int main(int argc, char **argv)
 
				         }
			
 
				         else
			
 
				         {
			
 
				-            starpu_mpi_irecv_detached(token_handle, (rank+size-1)%size, tag,
			
 
				-                    MPI_COMM_WORLD, NULL, NULL);
			
 
				+            starpu_mpi_irecv_detached(token_handle, (rank+size-1)%size, tag, MPI_COMM_WORLD, NULL, NULL);
			
 
				         }
			
 
				 
			
 
				         increment_token();
			
@@ -181,8 +180,7 @@ int main(int argc, char **argv)
 
				         }
			
 
				         else
			
 
				         {
			
 
				-            starpu_mpi_isend_detached(token_handle, (rank+1)%size, tag+1,
			
 
				-                    MPI_COMM_WORLD, NULL, NULL);
			
 
				+            starpu_mpi_isend_detached(token_handle, (rank+1)%size, tag+1, MPI_COMM_WORLD, NULL, NULL);
			
 
				         }
			
 
				     }
			
 
				 
			
@@ -316,14 +314,12 @@ static int complex_pack_data(starpu_data_handle_t handle, unsigned node, void **
 
				 {
			
 
				   STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				 
			
 
				-  struct starpu_complex_interface *complex_interface =
			
 
				-    (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, node);
			
 
				+  struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, node);
			
 
				 
			
 
				   *count = complex_get_size(handle);
			
 
				   starpu_malloc_flags(ptr, *count, 0);
			
 
				   memcpy(*ptr, complex_interface->real, complex_interface->nx*sizeof(double));
			
 
				-  memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary,
			
 
				-         complex_interface->nx*sizeof(double));
			
 
				+  memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary, complex_interface->nx*sizeof(double));
			
 
				 
			
 
				   return 0;
			
 
				 }
			
@@ -332,12 +328,10 @@ static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void
 
				 {
			
 
				   STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				 
			
 
				-  struct starpu_complex_interface *complex_interface =
			
 
				-    (struct starpu_complex_interface *)	starpu_data_get_interface_on_node(handle, node);
			
 
				+  struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, node);
			
 
				 
			
 
				   memcpy(complex_interface->real, ptr, complex_interface->nx*sizeof(double));
			
 
				-  memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double),
			
 
				-         complex_interface->nx*sizeof(double));
			
 
				+  memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));
			
 
				 
			
 
				   return 0;
			
 
				 }
			
@@ -369,8 +363,7 @@ void starpu_complex_interface_datatype_allocate(starpu_data_handle_t handle, MPI
 
				 	MPI_Aint displacements[2];
			
 
				 	MPI_Datatype types[2] = {MPI_DOUBLE, MPI_DOUBLE};
			
 
				 
			
 
				-	struct starpu_complex_interface *complex_interface =
			
 
				-          (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
			
 
				+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
			
 
				 
			
 
				 	MPI_Address(complex_interface, displacements);
			
 
				 	MPI_Address(&complex_interface->imaginary, displacements+1);
			
@@ -461,13 +454,11 @@ data which will be needed by the tasks that we will execute.
 
				             int mpi_rank = my_distrib(x, y, size);
			
 
				             if (mpi_rank == my_rank)
			
 
				                 /* Owning data */
			
 
				-                starpu_variable_data_register(&data_handles[x][y], STARPU_MAIN_RAM,
			
 
				-                                              (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
			
 
				+                starpu_variable_data_register(&data_handles[x][y], STARPU_MAIN_RAM, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
			
 
				             else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
			
 
				                   || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
			
 
				                 /* I don't own this index, but will need it for my computations */
			
 
				-                starpu_variable_data_register(&data_handles[x][y], -1,
			
 
				-                                              (uintptr_t)NULL, sizeof(unsigned));
			
 
				+                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
			
 
				             else
			
 
				                 /* I know it's useless to allocate anything for this */
			
 
				                 data_handles[x][y] = NULL;
			
@@ -600,7 +591,8 @@ can just pass NULL to starpu_mpi_task_insert():
 
				 
			
 
				 \code{.c}
			
 
				 starpu_data_handle_t data0 = NULL;
			
 
				-if (rank == 0) {
			
 
				+if (rank == 0)
			
 
				+{
			
 
				 	starpu_variable_data_register(&data0, STARPU_MAIN_RAM, (uintptr_t) &val0, sizeof(val0));
			
 
				 	starpu_mpi_data_register(data0, 0, rank);
			
 
				 }
			
@@ -615,12 +607,15 @@ data1 to node 0, which owns data and thus will need the value of data1 to execut
 
				 
			
 
				 \code{.c}
			
 
				 starpu_data_handle_t data0 = NULL, data1, data;
			
 
				-if (rank == 0) {
			
 
				+if (rank == 0)
			
 
				+{
			
 
				 	starpu_variable_data_register(&data0, STARPU_MAIN_RAM, (uintptr_t) &val0, sizeof(val0));
			
 
				 	starpu_mpi_data_register(data0, -1, rank);
			
 
				 	starpu_variable_data_register(&data1, -1, 0, sizeof(val1));
			
 
				 	starpu_variable_data_register(&data, STARPU_MAIN_RAM, (uintptr_t) &val, sizeof(val));
			
 
				-} else if (rank == 1) {
			
 
				+}
			
 
				+else if (rank == 1)
			
 
				+{
			
 
				 	starpu_variable_data_register(&data1, STARPU_MAIN_RAM, (uintptr_t) &val1, sizeof(val1));
			
 
				 	starpu_variable_data_register(&data, -1, 0, sizeof(val));
			
 
				 }
			
@@ -641,10 +636,13 @@ starpu_variable_data_register(&pernode, -1, 0, sizeof(val));
 
				 starpu_mpi_data_register(pernode, -1, STARPU_MPI_PER_NODE);
			
 
				 
			
 
				 /* Normal data: one on node0, one on node1 */
			
 
				-if (rank == 0) {
			
 
				+if (rank == 0)
			
 
				+{
			
 
				 	starpu_variable_data_register(&data0, STARPU_MAIN_RAM, (uintptr_t) &val0, sizeof(val0));
			
 
				 	starpu_variable_data_register(&data1, -1, 0, sizeof(val1));
			
 
				-} else if (rank == 1) {
			
 
				+}
			
 
				+else if (rank == 1)
			
 
				+{
			
 
				 	starpu_variable_data_register(&data0, -1, 0, sizeof(val1));
			
 
				 	starpu_variable_data_register(&data1, STARPU_MAIN_RAM, (uintptr_t) &val1, sizeof(val1));
			
 
				 }
			
@@ -744,8 +742,7 @@ migrate the data, and register the new location.
 
				                   || my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
			
 
				                   || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size)))
			
 
				                 /* Register newly-needed data */
			
 
				-                starpu_variable_data_register(&data_handles[x][y], -1,
			
 
				-                                              (uintptr_t)NULL, sizeof(unsigned));
			
 
				+                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
			
 
				             if (data_handles[x][y])
			
 
				 	    {
			
 
				                 /* Migrate the data */
			
@@ -808,14 +805,12 @@ for(x = 0; x < nblocks ;  x++)
 
				     int mpi_rank = my_distrib(x, nodes);
			
 
				     if (rank == root)
			
 
				     {
			
 
				-        starpu_vector_data_register(&data_handles[x], STARPU_MAIN_RAM, (uintptr_t)vector[x],
			
 
				-                                    blocks_size, sizeof(float));
			
 
				+        starpu_vector_data_register(&data_handles[x], STARPU_MAIN_RAM, (uintptr_t)vector[x], blocks_size, sizeof(float));
			
 
				     }
			
 
				     else if ((mpi_rank == rank) || ((rank == mpi_rank+1 || rank == mpi_rank-1)))
			
 
				     {
			
 
				         /* I own this index, or I will need it for my computations */
			
 
				-        starpu_vector_data_register(&data_handles[x], -1, (uintptr_t)NULL,
			
 
				-                                   block_size, sizeof(float));
			
 
				+        starpu_vector_data_register(&data_handles[x], -1, (uintptr_t)NULL, block_size, sizeof(float));
			
 
				     }
			
 
				     else
			
 
				     {
			
--- a/doc/doxygen/chapters/440_c_extensions.doxy
+++ b/doc/doxygen/chapters/440_c_extensions.doxy
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2017                                CNRS
			
 
				+ * Copyright (C) 2010-2018                                CNRS
			
 
				  * Copyright (C) 2009-2011,2014-2015                      Université de Bordeaux
			
 
				  * Copyright (C) 2011-2012                                Inria
			
 
				  *
			
@@ -126,20 +126,15 @@ Here is an example:
 
				 \code{.c}
			
 
				 #define __output  __attribute__ ((output))
			
 
				 
			
 
				-static void matmul (const float *A, const float *B,
			
 
				-                    __output float *C,
			
 
				-                    unsigned nx, unsigned ny, unsigned nz)
			
 
				+static void matmul (const float *A, const float *B, __output float *C, unsigned nx, unsigned ny, unsigned nz)
			
 
				   __attribute__ ((task));
			
 
				 
			
 
				-static void matmul_cpu (const float *A, const float *B,
			
 
				-                        __output float *C,
			
 
				-                        unsigned nx, unsigned ny, unsigned nz)
			
 
				+static void matmul_cpu (const float *A, const float *B, __output float *C, unsigned nx, unsigned ny, unsigned nz)
			
 
				   __attribute__ ((task_implementation ("cpu", matmul)));
			
 
				 
			
 
				 
			
 
				 static void
			
 
				-matmul_cpu (const float *A, const float *B, __output float *C,
			
 
				-            unsigned nx, unsigned ny, unsigned nz)
			
 
				+matmul_cpu (const float *A, const float *B, __output float *C, unsigned nx, unsigned ny, unsigned nz)
			
 
				 {
			
 
				   unsigned i, j, k;
			
 
				 
			
@@ -165,16 +160,12 @@ implementation.  Thus, the above snippet can be simplified like this:
 
				 \code{.c}
			
 
				 #define __output  __attribute__ ((output))
			
 
				 
			
 
				-static void matmul (const float *A, const float *B,
			
 
				-                    __output float *C,
			
 
				-                    unsigned nx, unsigned ny, unsigned nz)
			
 
				+static void matmul (const float *A, const float *B, __output float *C, unsigned nx, unsigned ny, unsigned nz)
			
 
				   __attribute__ ((task));
			
 
				 
			
 
				 /* Implicit definition of the CPU implementation of the
			
 
				    `matmul' task.  */
			
 
				-static void
			
 
				-matmul (const float *A, const float *B, __output float *C,
			
 
				-        unsigned nx, unsigned ny, unsigned nz)
			
 
				+static void matmul (const float *A, const float *B, __output float *C, unsigned nx, unsigned ny, unsigned nz)
			
 
				 {
			
 
				   unsigned i, j, k;
			
 
				 
			
@@ -194,12 +185,10 @@ the code is valid sequential code when StarPU's GCC plug-in is not used
 
				 CUDA and OpenCL implementations can be declared in a similar way:
			
 
				 
			
 
				 \code{.c}
			
 
				-static void matmul_cuda (const float *A, const float *B, float *C,
			
 
				-                         unsigned nx, unsigned ny, unsigned nz)
			
 
				+static void matmul_cuda (const float *A, const float *B, float *C, unsigned nx, unsigned ny, unsigned nz)
			
 
				   __attribute__ ((task_implementation ("cuda", matmul)));
			
 
				 
			
 
				-static void matmul_opencl (const float *A, const float *B, float *C,
			
 
				-                           unsigned nx, unsigned ny, unsigned nz)
			
 
				+static void matmul_opencl (const float *A, const float *B, float *C, unsigned nx, unsigned ny, unsigned nz)
			
 
				   __attribute__ ((task_implementation ("opencl", matmul)));
			
 
				 \endcode
			
 
				 
			
@@ -209,13 +198,9 @@ written in CUDA or OpenCL (for similar code, \ref CUDAKernel, and
 
				 OpenCL under the hood, such as CUBLAS functions:
			
 
				 
			
 
				 \code{.c}
			
 
				-static void
			
 
				-matmul_cuda (const float *A, const float *B, float *C,
			
 
				-             unsigned nx, unsigned ny, unsigned nz)
			
 
				+static void matmul_cuda (const float *A, const float *B, float *C, unsigned nx, unsigned ny, unsigned nz)
			
 
				 {
			
 
				-  cublasSgemm ('n', 'n', nx, ny, nz,
			
 
				-               1.0f, A, 0, B, 0,
			
 
				-               0.0f, C, 0);
			
 
				+  cublasSgemm ('n', 'n', nx, ny, nz, 1.0f, A, 0, B, 0, 0.0f, C, 0);
			
 
				   cudaStreamSynchronize (starpu_cuda_get_local_stream ());
			
 
				 }
			
 
				 \endcode
			
--- a/doc/doxygen/chapters/490_clustering_a_machine.doxy
+++ b/doc/doxygen/chapters/490_clustering_a_machine.doxy
@@ -1,8 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2015-2017                                CNRS
			
 
				+ * Copyright (C) 2015-2018                                CNRS
			
 
				  * Copyright (C) 2015-2016                                Inria
			
 
				- * Copyright (C) 2015, 2018                                     Université de Bordeaux
			
 
				+ * Copyright (C) 2015, 2018                               Université de Bordeaux
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -64,7 +64,7 @@ struct starpu_cluster_machine *clusters;
 
				 clusters = starpu_cluster_machine(HWLOC_OBJ_SOCKET, 0);
			
 
				 starpu_cluster_print(clusters);
			
 
				 
			
 
				-//... submit some tasks with OpenMP computations 
			
 
				+//... submit some tasks with OpenMP computations
			
 
				 
			
 
				 starpu_uncluster_machine(clusters);
			
 
				 //... we are back in the default starpu state
			
@@ -111,6 +111,7 @@ and create the aforementioned OpenMP threads constrained in the cluster's
 
				 resources set:
			
 
				 \code{.c}
			
 
				 void starpu_openmp_prologue(void * sched_ctx_id)
			
 
				+{
			
 
				   int sched_ctx = *(int*)sched_ctx_id;
			
 
				   int *cpuids = NULL;
			
 
				   int ncpuids = 0;
			
--- a/doc/doxygen/chapters/api/opencl_extensions.doxy
+++ b/doc/doxygen/chapters/api/opencl_extensions.doxy
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2017                                CNRS
			
 
				+ * Copyright (C) 2010-2018                                CNRS
			
 
				  * Copyright (C) 2009-2011,2014,2016                      Université de Bordeaux
			
 
				  * Copyright (C) 2011-2012                                Inria
			
 
				  *
			
@@ -82,10 +82,7 @@ Here an example:
 
				 int n;
			
 
				 cl_int err;
			
 
				 cl_kernel kernel;
			
 
				-n = starpu_opencl_set_kernel_args(&err, 2, &kernel,
			
 
				-                                  sizeof(foo), &foo,
			
 
				-                                  sizeof(bar), &bar,
			
 
				-                                  0);
			
 
				+n = starpu_opencl_set_kernel_args(&err, 2, &kernel, sizeof(foo), &foo, sizeof(bar), &bar, 0);
			
 
				 if (n != 2)
			
 
				    fprintf(stderr, "Error : %d\n", err);
			
 
				 \endcode
			
--- a/doc/doxygen/chapters/code/complex.c
+++ b/doc/doxygen/chapters/code/complex.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2013,2015,2017                      CNRS
			
 
				+ * Copyright (C) 2010-2013,2015,2017,2018                 CNRS
			
 
				  * Copyright (C) 2010-2014                                Université de Bordeaux
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -16,10 +16,7 @@
 
				  */
			
 
				 
			
 
				 //! [To be included. You should update doxygen if you see this text.]
			
 
				-#define STARPU_COMPLEX_GET_REAL(interface)	\
			
 
				-        (((struct starpu_complex_interface *)(interface))->real)
			
 
				-#define STARPU_COMPLEX_GET_IMAGINARY(interface)	\
			
 
				-        (((struct starpu_complex_interface *)(interface))->imaginary)
			
 
				-#define STARPU_COMPLEX_GET_NX(interface)	\
			
 
				-        (((struct starpu_complex_interface *)(interface))->nx)
			
 
				+#define STARPU_COMPLEX_GET_REAL(interface)	(((struct starpu_complex_interface *)(interface))->real)
			
 
				+#define STARPU_COMPLEX_GET_IMAGINARY(interface)	(((struct starpu_complex_interface *)(interface))->imaginary)
			
 
				+#define STARPU_COMPLEX_GET_NX(interface)	(((struct starpu_complex_interface *)(interface))->nx)
			
 
				 //! [To be included. You should update doxygen if you see this text.]
			
--- a/doc/doxygen/chapters/code/disk_compute.c
+++ b/doc/doxygen/chapters/code/disk_compute.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2013,2017                                CNRS
			
 
				+ * Copyright (C) 2013,2017,2018                           CNRS
			
 
				  * Copyright (C) 2013                                     Inria
			
 
				  * Copyright (C) 2014                                     Université de Bordeaux
			
 
				  * Copyright (C) 2013                                     Corentin Salingue
			
@@ -79,9 +79,6 @@ int main(int argc, char **argv)
 
				 		C[j] = 0;
			
 
				 	}
			
 
				 
			
 
				-
			
 
				-
			
 
				-
			
 
				 	/* you create a file to store the vector ON the disk */
			
 
				 	FILE * f = fopen(path_file_start, "wb+");
			
 
				 	if (f == NULL)
			
--- a/doc/doxygen/chapters/code/matmul_pragma2.c
+++ b/doc/doxygen/chapters/code/matmul_pragma2.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2013,2015,2017                      CNRS
			
 
				+ * Copyright (C) 2010-2013,2015,2017,2018                 CNRS
			
 
				  * Copyright (C) 2010-2014                                Université de Bordeaux
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -24,6 +24,5 @@
 
				 # define __task
			
 
				 #endif
			
 
				 
			
 
				-static void matmul (const float *A, const float *B, float *C,
			
 
				-                    unsigned nx, unsigned ny, unsigned nz) __task;
			
 
				+static void matmul (const float *A, const float *B, float *C, unsigned nx, unsigned ny, unsigned nz) __task;
			
 
				 //! [To be included. You should update doxygen if you see this text.]
			
--- a/doc/doxygen/chapters/code/scal_pragma.cu
+++ b/doc/doxygen/chapters/code/scal_pragma.cu
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2013,2015,2017                      CNRS
			
 
				+ * Copyright (C) 2010-2013,2015,2017,2018                 CNRS
			
 
				  * Copyright (C) 2010-2014,2016                           Université de Bordeaux
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -21,8 +21,7 @@
 
				 #include <starpu.h>
			
 
				 #include <stdlib.h>
			
 
				 
			
 
				-static __global__ void
			
 
				-vector_mult_cuda (unsigned n, float *val, float factor)
			
 
				+static __global__ void vector_mult_cuda (unsigned n, float *val, float factor)
			
 
				 {
			
 
				   unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				 
			
@@ -36,8 +35,7 @@ extern "C" void vector_scal_cuda (size_t size, float vector[], float factor)
 
				   unsigned threads_per_block = 64;
			
 
				   unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
			
 
				 
			
 
				-  vector_mult_cuda <<< nblocks, threads_per_block, 0,
			
 
				-    starpu_cuda_get_local_stream () >>> (size, vector, factor);
			
 
				+  vector_mult_cuda <<< nblocks, threads_per_block, 0, starpu_cuda_get_local_stream () >>> (size, vector, factor);
			
 
				 
			
 
				   cudaStreamSynchronize (starpu_cuda_get_local_stream ());
			
 
				 }
			
--- a/doc/doxygen/chapters/code/vector_scal_c.c
+++ b/doc/doxygen/chapters/code/vector_scal_c.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2013,2015,2017                      CNRS
			
 
				+ * Copyright (C) 2010-2013,2015,2017,2018                 CNRS
			
 
				  * Copyright (C) 2013                                     Inria
			
 
				  * Copyright (C) 2010-2014                                Université de Bordeaux
			
 
				  *
			
@@ -70,8 +70,7 @@ int main(int argc, char **argv)
 
				     starpu_init(NULL);
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-        starpu_opencl_load_opencl_from_file(
			
 
				-               "examples/basic_examples/vector_scal_opencl_kernel.cl", &programs, NULL);
			
 
				+    starpu_opencl_load_opencl_from_file("examples/basic_examples/vector_scal_opencl_kernel.cl", &programs, NULL);
			
 
				 #endif
			
 
				 
			
 
				     /* Tell StarPU to associate the "vector" vector with the "vector_handle"
			
@@ -88,8 +87,7 @@ int main(int argc, char **argv)
 
				      *  - the fifth argument is the size of each element.
			
 
				      */
			
 
				     starpu_data_handle_t vector_handle;
			
 
				-    starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector,
			
 
				-                                NX, sizeof(vector[0]));
			
 
				+    starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
			
 
				 
			
 
				     float factor = 3.14;
			
 
				 
			
--- a/doc/doxygen/chapters/code/vector_scal_cuda.cu
+++ b/doc/doxygen/chapters/code/vector_scal_cuda.cu
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2013,2015,2017                      CNRS
			
 
				+ * Copyright (C) 2010-2013,2015,2017,2018                      CNRS
			
 
				  * Copyright (C) 2010,2014                                Université de Bordeaux
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -17,8 +17,7 @@
 
				 //! [To be included. You should update doxygen if you see this text.]
			
 
				 #include <starpu.h>
			
 
				 
			
 
				-static __global__ void vector_mult_cuda(unsigned n, float *val,
			
 
				-                                        float factor)
			
 
				+static __global__ void vector_mult_cuda(unsigned n, float *val, float factor)
			
 
				 {
			
 
				         unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
			
 
				         if (i < n)
			
@@ -36,8 +35,7 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 
				         unsigned threads_per_block = 64;
			
 
				         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				 
			
 
				-        vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>
			
 
				-	                (n, val, *factor);
			
 
				+        vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(n, val, *factor);
			
 
				 
			
 
				         cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/doc/doxygen/chapters/code/vector_scal_opencl.c
+++ b/doc/doxygen/chapters/code/vector_scal_opencl.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010,2012-2013,2015,2017                 CNRS
			
 
				+ * Copyright (C) 2010,2012-2013,2015,2017,2018            CNRS
			
 
				  * Copyright (C) 2011,2014                                Université de Bordeaux
			
 
				  * Copyright (C) 2010                                     Inria
			
 
				  *
			
@@ -38,8 +38,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 
				 	 id = starpu_worker_get_id();
			
 
				 	 devid = starpu_worker_get_devid(id);
			
 
				 
			
 
				-	 err = starpu_opencl_load_kernel(&kernel, &queue,
			
 
				-					 &programs,
			
 
				+	 err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
			
 
				 					 "vector_mult_opencl", /* Name of the codelet */
			
 
				 					 devid);
			
 
				 	 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
@@ -57,13 +56,11 @@ void scal_opencl_func(void *buffers[], void *_args)
 
				         cl_device_id device;
			
 
				 
			
 
				         starpu_opencl_get_device(devid, &device);
			
 
				-        err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
			
 
				-                                        sizeof(local), &local, &s);
			
 
				+        err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
			
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				         if (local > global) local=global;
			
 
				 
			
 
				-        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0,
			
 
				-                                     NULL, &event);
			
 
				+        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
			
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				     }