
doc: move each chapter to a distinct file

Nathalie Furmento 13 years ago
parent
commit
5fd90a789a

+ 223 - 0
doc/advanced-api.texi

@@ -0,0 +1,223 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@node StarPU Advanced API
+@chapter StarPU Advanced API
+
+@menu
+* Defining a new data interface::  
+* Multiformat Data Interface::  
+* Defining a new scheduling policy::  
+@end menu
+
+@node Defining a new data interface
+@section Defining a new data interface
+
+@menu
+* Data Interface API::  Data Interface API
+* An example of data interface::        An example of data interface
+@end menu
+
+@node Data Interface API
+@subsection Data Interface API
+
+@deftp {Data Type} {struct starpu_data_interface_ops}
+@anchor{struct starpu_data_interface_ops}
+Defines the per-interface methods. TODO: describe all the different fields.
+@end deftp
+
+@deftp {Data Type} {struct starpu_data_copy_methods}
+Per-interface data transfer methods. TODO: describe all the different fields.
+@end deftp
+
+@node An example of data interface
+@subsection An example of data interface
+
+TODO
+See @code{src/datawizard/interfaces/vector_interface.c} for now.
+
+@node Multiformat Data Interface
+@section Multiformat Data Interface
+
+@deftp {Data Type} {struct starpu_multiformat_data_interface_ops}
+The different fields of the multiformat data interface operations are:
+@table @asis
+@item @code{cpu_elemsize}
+the size of each element on CPUs
+@item @code{opencl_elemsize}
+the size of each element on OpenCL devices
+@item @code{cuda_elemsize}
+the size of each element on CUDA devices
+@item @code{cpu_to_opencl_cl}
+pointer to a codelet which converts from CPU to OpenCL
+@item @code{opencl_to_cpu_cl}
+pointer to a codelet which converts from OpenCL to CPU
+@item @code{cpu_to_cuda_cl}
+pointer to a codelet which converts from CPU to CUDA
+@item @code{cuda_to_cpu_cl}
+pointer to a codelet which converts from CUDA to CPU
+@end table
+@end deftp
+
+@deftypefun void starpu_multiformat_data_register (starpu_data_handle *@var{handle}, uint32_t @var{home_node}, void *@var{ptr}, uint32_t @var{nobjects}, struct starpu_multiformat_data_interface_ops *@var{format_ops});
+Register a piece of data that can be represented in different ways, depending upon
+the processing unit that manipulates it. It allows the programmer, for instance, to
+use an array of structures when working on a CPU, and a structure of arrays when
+working on a GPU.
+
+@var{nobjects} is the number of elements in the data. @var{format_ops} describes
+the format.
+
+@example
+#define NX 1024
+struct point array_of_structs[NX];
+starpu_data_handle handle;
+
+/*
+ * The conversion of a piece of data is itself a task, though it is created,
+ * submitted and destroyed by StarPU internals and not by the user. Therefore,
+ * we have to define two codelets.
+ * Note that for now the conversion from the CPU format to the GPU format has to
+ * be executed on the GPU, and the conversion from the GPU to the CPU has to be
+ * executed on the CPU.
+ */
+#ifdef STARPU_USE_OPENCL
+void cpu_to_opencl_opencl_func(void *buffers[], void *args);
+starpu_codelet cpu_to_opencl_cl = @{
+	.where = STARPU_OPENCL,
+	.opencl_func = cpu_to_opencl_opencl_func,
+	.nbuffers = 1
+@};
+
+void opencl_to_cpu_func(void *buffers[], void *args);
+starpu_codelet opencl_to_cpu_cl = @{
+	.where = STARPU_CPU,
+	.cpu_func = opencl_to_cpu_func,
+	.nbuffers = 1
+@};
+#endif
+
+struct starpu_multiformat_data_interface_ops format_ops = @{
+#ifdef STARPU_USE_OPENCL
+	.opencl_elemsize = 2 * sizeof(float),
+	.cpu_to_opencl_cl = &cpu_to_opencl_cl,
+	.opencl_to_cpu_cl = &opencl_to_cpu_cl,
+#endif
+	.cpu_elemsize = 2 * sizeof(float),
+	...
+@};
+starpu_multiformat_data_register(&handle, 0, &array_of_structs, NX, &format_ops);
+@end example
+@end deftypefun
+
+@node Defining a new scheduling policy
+@section Defining a new scheduling policy
+
+TODO
+
+A full example showing how to define a new scheduling policy is available in
+the StarPU sources in the directory @code{examples/scheduler/}.
+
+@menu
+* Scheduling Policy API:: Scheduling Policy API
+* Source code::                 
+@end menu
+
+@node Scheduling Policy API
+@subsection Scheduling Policy API
+
+@deftp {Data Type} {struct starpu_sched_policy}
+This structure contains all the methods that implement a scheduling policy. An
+application may specify which scheduling strategy to use in the @code{sched_policy}
+field of the @code{starpu_conf} structure passed to the @code{starpu_init}
+function. The different fields are:
+@table @asis
+@item @code{init_sched}
+Initialize the scheduling policy.
+@item @code{deinit_sched}
+Cleanup the scheduling policy.
+@item @code{push_task}
+Insert a task into the scheduler.
+@item @code{push_prio_task}
+Insert a priority task into the scheduler.
+@item @code{push_prio_notify}
+Notify the scheduler that a task was pushed on a worker. This method is
+called when a task that was explicitly assigned to a worker is scheduled. This
+method therefore makes it possible to keep the state of the scheduler coherent even
+when StarPU bypasses the scheduling strategy.
+@item @code{pop_task}
+Get a task from the scheduler. The mutex associated to the worker is already
+taken when this method is called. If this method is defined as @code{NULL}, the
+worker will only execute tasks from its local queue. In this case, the
+@code{push_task} method should use the @code{starpu_push_local_task} method to
+assign tasks to the different workers.
+@item @code{pop_every_task}
+Remove all available tasks from the scheduler (tasks are chained by means
+of the @code{prev} and @code{next} fields of the @code{starpu_task} structure). The
+mutex associated with the worker is already taken when this method is called.
+@item @code{post_exec_hook} (optional)
+This method is called every time a task has been executed.
+@item @code{policy_name}
+Name of the policy (optional).
+@item @code{policy_description}
+Description of the policy (optional).
+@end table
+@end deftp
+
+@deftypefun void starpu_worker_set_sched_condition (int @var{workerid}, pthread_cond_t *@var{sched_cond}, pthread_mutex_t *@var{sched_mutex})
+This function specifies the condition variable associated with a worker.
+When there is no available task for a worker, StarPU blocks this worker on a
+condition variable. This function specifies which condition variable (and the
+associated mutex) should be used to block (and to wake up) a worker. Note that
+multiple workers may use the same condition variable. For instance, in the case
+of a scheduling strategy with a single task queue, the same condition variable
+would be used to block and wake up all workers.
+The initialization method of a scheduling strategy (@code{init_sched}) must
+call this function once per worker.
+@end deftypefun
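+
+For instance, a strategy with a single central task queue could use one global
+condition variable for all workers. The following is a minimal sketch of the
+corresponding @code{init_sched} body; the @code{sched_cond} and
+@code{sched_mutex} variables are hypothetical names:
+
+@cartouche
+@smallexample
+static pthread_cond_t sched_cond = PTHREAD_COND_INITIALIZER;
+static pthread_mutex_t sched_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* In the init_sched method: associate every worker with the same
+ * condition variable and mutex. */
+unsigned workerid;
+for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
+    starpu_worker_set_sched_condition(workerid, &sched_cond, &sched_mutex);
+@end smallexample
+@end cartouche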
+
+@deftypefun void starpu_sched_set_min_priority (int @var{min_prio})
+Defines the minimum priority level supported by the scheduling policy. The
+default minimum priority level is the same as the default priority level which
+is 0 by convention.  The application may access that value by calling the
+@code{starpu_sched_get_min_priority} function. This function should only be
+called from the initialization method of the scheduling policy, and should not
+be used directly from the application.
+@end deftypefun
+
+@deftypefun void starpu_sched_set_max_priority (int @var{max_prio})
+Defines the maximum priority level supported by the scheduling policy. The
+default maximum priority level is 1.  The application may access that value by
+calling the @code{starpu_sched_get_max_priority} function. This function should
+only be called from the initialization method of the scheduling policy, and
+should not be used directly from the application.
+@end deftypefun
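+
+As an illustration, a policy supporting priority levels from -5 to 5 could
+call, from its @code{init_sched} method (a sketch):
+
+@cartouche
+@smallexample
+/* Declare the range of priority levels supported by this policy. */
+starpu_sched_set_min_priority(-5);
+starpu_sched_set_max_priority(5);
+@end smallexample
+@end cartouche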
+
+@deftypefun int starpu_push_local_task (int @var{workerid}, {struct starpu_task} *@var{task}, int @var{back})
+The scheduling policy may put tasks directly into a worker's local queue so
+that it does not always have to maintain its own queues when the local queues
+are sufficient. If @var{back} is not zero, the task is put at the back of the queue
+where the worker will pop tasks first. Setting @var{back} to 0 therefore ensures
+a FIFO ordering.
+@end deftypefun
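+
+For instance, a simple @code{push_task} method could dispatch tasks over the
+local queues of the workers in a round-robin fashion. The following is a
+sketch; the @code{next_worker} counter is a hypothetical name:
+
+@cartouche
+@smallexample
+static unsigned next_worker;
+
+static int push_task_roundrobin(struct starpu_task *task)
+@{
+    int workerid = next_worker++ % starpu_worker_get_count();
+    /* back = 0: append at the end of the local queue, i.e. FIFO order */
+    return starpu_push_local_task(workerid, task, 0);
+@}
+@end smallexample
+@end cartouche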
+
+@node Source code
+@subsection Source code
+
+@cartouche
+@smallexample
+static struct starpu_sched_policy dummy_sched_policy = @{
+    .init_sched = init_dummy_sched,
+    .deinit_sched = deinit_dummy_sched,
+    .push_task = push_task_dummy,
+    .push_prio_task = NULL,
+    .pop_task = pop_task_dummy,
+    .post_exec_hook = NULL,
+    .pop_every_task = NULL,
+    .policy_name = "dummy",
+    .policy_description = "dummy scheduling strategy"
+@};
+@end smallexample
+@end cartouche
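+
+Such a policy can then be selected by the application through the
+@code{sched_policy} field of the @code{starpu_conf} structure, for instance
+(a sketch, assuming the @code{starpu_conf_init} helper to fill the structure
+with default values):
+
+@cartouche
+@smallexample
+struct starpu_conf conf;
+starpu_conf_init(&conf);
+conf.sched_policy = &dummy_sched_policy;
+starpu_init(&conf);
+@end smallexample
+@end cartouche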
+

+ 471 - 0
doc/advanced-examples.texi

@@ -0,0 +1,471 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@node Advanced Examples
+@chapter Advanced Examples
+
+@menu
+* Using multiple implementations of a codelet::
+* Task and Worker Profiling::   
+* Partitioning Data::           Partitioning Data
+* Performance model example::   
+* Theoretical lower bound on execution time::  
+* Insert Task Utility::          
+* Debugging::                   When things go wrong.
+* More examples::               More examples shipped with StarPU
+@end menu
+
+@node Using multiple implementations of a codelet
+@section Using multiple implementations of a codelet
+One may want to write multiple implementations of a codelet for a single type of
+device and let StarPU choose which one to run. As an example, we will show how
+to use SSE to scale a vector. The codelet can be written as follows:
+
+@cartouche
+@smallexample
+#include <xmmintrin.h>
+
+void scal_sse_func(void *buffers[], void *cl_arg)
+@{
+	float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
+	unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
+	unsigned int n_iterations = n/4;
+	if (n % 4 != 0)
+		n_iterations++;
+
+	__m128 *VECTOR = (__m128*) vector;
+	__m128 factor __attribute__((aligned(16)));
+	factor = _mm_set1_ps(*(float *) cl_arg);
+
+	unsigned int i;	
+	for (i = 0; i < n_iterations; i++)
+		VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
+@}
+@end smallexample
+@end cartouche
+
+The @code{cpu_func} field of the @code{starpu_codelet} structure has to be set
+to the special value @code{STARPU_MULTIPLE_CPU_IMPLEMENTATIONS}. Note that
+@code{STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS} and
+@code{STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS} are also available.
+
+@cartouche
+@smallexample
+starpu_codelet cl = @{
+	.where = STARPU_CPU,
+	.cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS,
+	.cpu_funcs = @{ scal_cpu_func, scal_sse_func @},
+	.nbuffers = 1
+@};
+@end smallexample
+@end cartouche
+
+The scheduler will measure the performance of all the implementations it was
+given, and pick the one that seems to be the fastest.
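+
+For reference, a plain C implementation of @code{scal_cpu_func}, used
+alongside the SSE version in the codelet above, can be the same as in the
+vector scaling example (@pxref{Scaling a Vector}):
+
+@cartouche
+@smallexample
+void scal_cpu_func(void *buffers[], void *cl_arg)
+@{
+    unsigned i;
+    float *factor = cl_arg;
+    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
+
+    for (i = 0; i < n; i++)
+        val[i] *= *factor;
+@}
+@end smallexample
+@end cartouche
+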
+@node Task and Worker Profiling
+@section Task and Worker Profiling
+
+A full example showing how to use the profiling API is available in
+the StarPU sources in the directory @code{examples/profiling/}.
+
+@cartouche
+@smallexample
+struct starpu_task *task = starpu_task_create();
+task->cl = &cl;
+task->synchronous = 1;
+/* We will destroy the task structure by hand so that we can
+ * query the profiling info before the task is destroyed. */
+task->destroy = 0;
+
+/* Submit and wait for completion (since synchronous was set to 1) */
+starpu_task_submit(task);
+
+/* The task is finished, get profiling information */
+struct starpu_task_profiling_info *info = task->profiling_info;
+
+/* How much time did it take before the task started ? */
+double delay = starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);
+
+/* How long was the task execution ? */
+double length = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
+
+/* We don't need the task structure anymore */
+starpu_task_destroy(task);
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+/* Display the occupancy of all workers during the test */
+int worker;
+for (worker = 0; worker < starpu_worker_get_count(); worker++)
+@{
+        struct starpu_worker_profiling_info worker_info;
+        int ret = starpu_worker_get_profiling_info(worker, &worker_info);
+        STARPU_ASSERT(!ret);
+
+        double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
+        double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
+        double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);
+
+        float executing_ratio = 100.0*executing_time/total_time;
+        float sleeping_ratio = 100.0*sleeping_time/total_time;
+
+        char workername[128];
+        starpu_worker_get_name(worker, workername, 128);
+        fprintf(stderr, "Worker %s:\n", workername);
+        fprintf(stderr, "\ttotal time : %.2lf ms\n", total_time*1e-3);
+        fprintf(stderr, "\texec time  : %.2lf ms (%.2f %%)\n", executing_time*1e-3,
+                executing_ratio);
+        fprintf(stderr, "\tblocked time  : %.2lf ms (%.2f %%)\n", sleeping_time*1e-3,
+                sleeping_ratio);
+@}
+@end smallexample
+@end cartouche
+
+@node Partitioning Data
+@section Partitioning Data
+
+An existing piece of data can be partitioned into sub-parts to be used by different tasks, for instance:
+
+@cartouche
+@smallexample
+int vector[NX];
+starpu_data_handle handle;
+
+/* Declare data to StarPU */
+starpu_vector_data_register(&handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
+
+/* Partition the vector in PARTS sub-vectors */
+starpu_filter f =
+@{
+    .filter_func = starpu_block_filter_func_vector,
+    .nchildren = PARTS
+@};
+starpu_data_partition(handle, &f);
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+/* Submit a task on each sub-vector */
+int i;
+for (i=0; i<starpu_data_get_nb_children(handle); i++) @{
+    /* Get subdata number i (there is only 1 dimension) */
+    starpu_data_handle sub_handle = starpu_data_get_sub_data(handle, 1, i);
+    struct starpu_task *task = starpu_task_create();
+
+    task->buffers[0].handle = sub_handle;
+    task->buffers[0].mode = STARPU_RW;
+    task->cl = &cl;
+    task->synchronous = 1;
+    task->cl_arg = &factor;
+    task->cl_arg_size = sizeof(factor);
+
+    starpu_task_submit(task);
+@}
+@end smallexample
+@end cartouche
+
+Partitioning can be applied several times, see
+@code{examples/basic_examples/mult.c} and @code{examples/filters/}.
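+
+Once all tasks working on the sub-vectors have completed, the pieces can be
+gathered back into the original vector, for instance (a sketch; node @code{0}
+is the main memory node where the vector was registered):
+
+@cartouche
+@smallexample
+/* Collect the sub-vectors back into the whole vector in main memory */
+starpu_data_unpartition(handle, 0);
+starpu_data_unregister(handle);
+@end smallexample
+@end cartouche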
+
+@node Performance model example
+@section Performance model example
+
+To achieve good scheduling, StarPU scheduling policies need to be able to
+estimate in advance the duration of a task. This is done by giving to codelets
+a performance model, by defining a @code{starpu_perfmodel} structure and
+providing its address in the @code{model} field of the @code{starpu_codelet}
+structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel}
+are mandatory, to give a name to the model, and the type of the model, since
+there are several kinds of performance models.
+
+@itemize
+@item
+Measured at runtime (@code{STARPU_HISTORY_BASED} model type). This assumes that for a
+given set of data input/output sizes, the performance will always be about the
+same. This is very true for regular kernels on GPUs for instance (<0.1% error),
+and just a bit less true on CPUs (~=1% error). This also assumes that there are
+few different sets of data input/output sizes. StarPU will then keep record of
+the average time of previous executions on the various processing units, and use
+it as an estimation. The history is recorded per task size, using a hash of the input
+and output sizes as an index.
+StarPU also saves the measurements in @code{~/.starpu/sampling/codelets}
+for further executions. They can be observed by using the
+@code{starpu_perfmodel_display} command, or plotted by using
+the @code{starpu_perfmodel_plot} tool.  The models are indexed by machine name. To
+share the models between machines (e.g. for a homogeneous cluster), use
+@code{export STARPU_HOSTNAME=some_global_name}. Measurements are only done when using a task scheduler which makes use of it, such as @code{heft} or @code{dmda}.
+
+If e.g. the code is recompiled with other compilation options, or several
+variants of the code are used, the symbol string should be changed to reflect
+that, in order to recalibrate a new model from zero. The symbol string can even
+be constructed dynamically at execution time, as long as this is done before
+submitting any task using it.
+
+The following is a small code example.
+
+@cartouche
+@smallexample
+static struct starpu_perfmodel mult_perf_model = @{
+    .type = STARPU_HISTORY_BASED,
+    .symbol = "mult_perf_model"
+@};
+
+starpu_codelet cl = @{
+    .where = STARPU_CPU,
+    .cpu_func = cpu_mult,
+    .nbuffers = 3,
+    /* for the scheduling policy to be able to use performance models */
+    .model = &mult_perf_model
+@};
+@end smallexample
+@end cartouche
+
+@item
+Measured at runtime and refined by regression (@code{STARPU_REGRESSION_*_BASED}
+model type). This still assumes performance regularity, but can work
+with various data input sizes, by applying regression over observed
+execution times. @code{STARPU_REGRESSION_BASED} uses an @code{a*n^b} regression
+form, while @code{STARPU_NL_REGRESSION_BASED} uses @code{a*n^b+c} (more precise than
+@code{STARPU_REGRESSION_BASED}, but costs a lot more to compute). For instance,
+@code{tests/perfmodels/regression_based.c} uses a regression-based performance
+model for the @code{memset} operation.
+
+@item
+Provided as an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_model} field),
+see for instance
+@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}.
+
+@item
+Provided explicitly by the application (@code{STARPU_PER_ARCH} model type): the
+@code{.per_arch[i].cost_model} fields have to be filled with pointers to
+functions which return the expected duration of the task in microseconds, one
+per architecture.
+
+@end itemize
+
+How to use schedulers which can benefit from such performance models is explained
+in @ref{Task scheduling policy}.
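+
+For instance, such a scheduler can be selected at runtime through the
+@code{STARPU_SCHED} environment variable, and the recorded model can then be
+inspected (a sketch; the @code{vector_scal} binary and the
+@code{mult_perf_model} symbol are just illustrative):
+
+@smallexample
+% STARPU_SCHED=dmda ./vector_scal
+% starpu_perfmodel_display -s mult_perf_model
+@end smallexample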
+
+The same can be done for task power consumption estimation, by setting the
+@code{power_model} field the same way as the @code{model} field. Note: for
+now, the application has to give the power consumption performance model
+a name which is different from that of the execution time performance model.
+
+The application can request time estimations from the StarPU performance
+models by filling a task structure as usual without actually submitting
+it. The data handles can be created by calling @code{starpu_data_register}
+functions with a @code{NULL} pointer (and need to be unregistered as usual)
+and the desired data sizes. The @code{starpu_task_expected_length} and
+@code{starpu_task_expected_power} functions can then be called to get an
+estimation of the task duration on a given architecture. @code{starpu_task_destroy}
+needs to be called to destroy the dummy task afterwards. See
+@code{tests/perfmodels/regression_based.c} for an example.
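+
+The following is a minimal sketch of this technique; @code{NX} and @code{cl}
+stand for the application's data size and codelet, the handle is registered
+with a @code{NULL} pointer since only the sizes matter, and the actual calls
+to @code{starpu_task_expected_length} and @code{starpu_task_expected_power}
+are elided (see the API reference for their exact prototypes):
+
+@cartouche
+@smallexample
+starpu_data_handle handle;
+struct starpu_task *task;
+
+/* Register a dummy vector: no actual memory, only the size is relevant */
+starpu_vector_data_register(&handle, 0, (uintptr_t)NULL, NX, sizeof(float));
+
+/* Fill a task structure as usual, without submitting it */
+task = starpu_task_create();
+task->cl = &cl;
+task->buffers[0].handle = handle;
+task->buffers[0].mode = STARPU_RW;
+
+/* ... query starpu_task_expected_length / starpu_task_expected_power ... */
+
+starpu_task_destroy(task);
+starpu_data_unregister(handle);
+@end smallexample
+@end cartouche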
+
+@node Theoretical lower bound on execution time
+@section Theoretical lower bound on execution time
+
+For kernels with history-based performance models, StarPU can very easily provide a theoretical lower
+bound for the execution time of a whole set of tasks. See for
+instance @code{examples/lu/lu_example.c}: before submitting tasks,
+call @code{starpu_bound_start}, and after complete execution, call
+@code{starpu_bound_stop}. @code{starpu_bound_print_lp} or
+@code{starpu_bound_print_mps} can then be used to output a Linear Programming
+problem corresponding to the schedule of your tasks. Run it through
+@code{lp_solve} or any other linear programming solver, and that will give you a
+lower bound for the total execution time of your tasks. If StarPU was compiled
+with the glpk library installed, @code{starpu_bound_compute} can be used to
+solve it immediately and get the optimized minimum, in ms. Its @code{integer}
+parameter allows deciding whether integer resolution should be computed
+and returned too.
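+
+A typical use, with the @code{deps} and @code{prio} parameters (described
+below) both set to 0, might look as follows (a sketch):
+
+@cartouche
+@smallexample
+double min_length, min_length_int;
+
+starpu_bound_start(0, 0);
+/* ... submit all the tasks ... */
+starpu_task_wait_for_all();
+starpu_bound_stop();
+
+/* Solve the problem directly (requires glpk), without integer resolution */
+starpu_bound_compute(&min_length, &min_length_int, 0);
+fprintf(stderr, "theoretical lower bound: %f ms\n", min_length);
+@end smallexample
+@end cartouche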
+
+The @code{deps} parameter tells StarPU whether to take tasks and implicit data
+dependencies into account. It must be understood that the linear programming
+problem size is quadratic with the number of tasks and thus the time to solve it
+can be very long; it could be minutes for just a few dozen tasks. You should
+probably use @code{lp_solve -timeout 1 test.pl -wmps test.mps} to convert the
+problem to MPS format and then use a better solver, @code{glpsol} might be
+better than @code{lp_solve} for instance (the @code{--pcost} option may be
+useful), but sometimes doesn't manage to converge. @code{cbc} might look
+slower, but it is parallel. Be sure to try at least all the @code{-B} options
+of @code{lp_solve}. For instance, we often just use
+@code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi}, and
+the @code{-gr} option can also be quite useful.
+
+Setting @code{deps} to 0 will only take into account the actual computations
+on processing units. It however still properly takes into account the varying
+performances of kernels and processing units, which is much more accurate than
+just comparing StarPU performances with the fastest of the kernels being used.
+
+The @code{prio} parameter tells StarPU whether to simulate taking into account
+the priorities as the StarPU scheduler would, i.e. schedule prioritized
+tasks before less prioritized tasks, to check to which extent this results
+in a less optimal solution. This increases the computation time even more.
+
+Note that, for simplicity, all this does not take data
+transfers into account; they are assumed to be completely overlapped.
+
+@node Insert Task Utility
+@section Insert Task Utility
+
+StarPU provides the wrapper function @code{starpu_insert_task} to ease
+the creation and submission of tasks.
+
+@deftypefun int starpu_insert_task (starpu_codelet *@var{cl}, ...)
+Create and submit a task corresponding to @var{cl} with the following
+arguments.  The argument list must be zero-terminated.
+
+The arguments following the codelet can be of the following types:
+
+@itemize
+@item
+@code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH}, @code{STARPU_REDUX}: an access mode followed by a data handle;
+@item
+@code{STARPU_VALUE} followed  by a pointer to a constant value and
+the size of the constant;
+@item
+@code{STARPU_CALLBACK} followed by a pointer to a callback function;
+@item
+@code{STARPU_CALLBACK_ARG} followed by a pointer to be given as an
+argument to the callback function;
+@item
+@code{STARPU_CALLBACK_WITH_ARG} followed by two pointers: one to a callback
+function, and the other to be given as an argument to the callback
+function; this is equivalent to using both @code{STARPU_CALLBACK} and
+@code{STARPU_CALLBACK_ARG};
+@item
+@code{STARPU_PRIORITY} followed by an integer defining a priority level.
+@end itemize
+
+Parameters to be passed to the codelet implementation are defined
+through the type @code{STARPU_VALUE}. The function
+@code{starpu_unpack_cl_args} must be called within the codelet
+implementation to retrieve them.
+@end deftypefun
+
+Here is the implementation of the codelet:
+
+@smallexample
+void func_cpu(void *descr[], void *_args)
+@{
+        int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
+        int ifactor;
+        float ffactor;
+
+        starpu_unpack_cl_args(_args, &ifactor, &ffactor);
+        *x0 = *x0 * ifactor;
+        *x1 = *x1 * ffactor;
+@}
+
+starpu_codelet mycodelet = @{
+        .where = STARPU_CPU,
+        .cpu_func = func_cpu,
+        .nbuffers = 2
+@};
+@end smallexample
+
+And the call to the @code{starpu_insert_task} wrapper:
+
+@smallexample
+starpu_insert_task(&mycodelet,
+                   STARPU_VALUE, &ifactor, sizeof(ifactor),
+                   STARPU_VALUE, &ffactor, sizeof(ffactor),
+                   STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
+                   0);
+@end smallexample
+
+The call to @code{starpu_insert_task} is equivalent to the following
+code:
+
+@smallexample
+struct starpu_task *task = starpu_task_create();
+task->cl = &mycodelet;
+task->buffers[0].handle = data_handles[0];
+task->buffers[0].mode = STARPU_RW;
+task->buffers[1].handle = data_handles[1];
+task->buffers[1].mode = STARPU_RW;
+char *arg_buffer;
+size_t arg_buffer_size;
+starpu_pack_cl_args(&arg_buffer, &arg_buffer_size,
+		    STARPU_VALUE, &ifactor, sizeof(ifactor),
+		    STARPU_VALUE, &ffactor, sizeof(ffactor),
+		    0);
+task->cl_arg = arg_buffer;
+task->cl_arg_size = arg_buffer_size;
+int ret = starpu_task_submit(task);
+@end smallexample
+
+If some part of the task insertion depends on the value of some computation,
+the @code{STARPU_DATA_ACQUIRE_CB} macro can be very convenient. For
+instance, assuming that the index variable @code{i} was registered as handle
+@code{i_handle}:
+
+@smallexample
+/* Compute which portion we will work on, e.g. pivot */
+starpu_insert_task(&which_index, STARPU_W, i_handle, 0);
+
+/* And submit the corresponding task */
+STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R, starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
+@end smallexample
+
+The @code{STARPU_DATA_ACQUIRE_CB} macro submits an asynchronous request for
+acquiring data @code{i} for the main application, and will execute the code
+given as third parameter when it is acquired. In other words, as soon as the
+value of @code{i} computed by the @code{which_index} codelet can be read, the
+portion of code passed as third parameter of @code{STARPU_DATA_ACQUIRE_CB} will
+be executed, and is allowed to read from @code{i} to use it e.g. as an
+index. Note that this macro is only available when compiling StarPU with
+the compiler @code{gcc}.
+
+@node Debugging
+@section Debugging
+
+StarPU provides several tools to help debug applications. Execution traces
+can be generated and displayed graphically, see @ref{Generating traces}. Some
+gdb helpers are also provided to show the whole StarPU state:
+
+@smallexample
+(gdb) source tools/gdbinit
+(gdb) help starpu
+@end smallexample
+
+@node More examples
+@section More examples
+
+More examples are available in the StarPU sources in the @code{examples/}
+directory. Simple examples include:
+
+@table @asis
+@item @code{incrementer/}:
+	Trivial incrementation test.
+@item @code{basic_examples/}:
+        Simple documented Hello world (as shown in @ref{Hello World}), vector/scalar product (as shown
+        in @ref{Vector Scaling on an Hybrid CPU/GPU Machine}), matrix
+        product examples (as shown in @ref{Performance model example}), an example using the blocked matrix data
+        interface, an example using the variable data interface, and an example
+        using different formats on CPUs and GPUs.
+@item @code{matvecmult/}:
+	OpenCL example from NVidia, adapted to StarPU.
+@item @code{axpy/}:
+	AXPY CUBLAS operation adapted to StarPU.
+@item @code{fortran/}:
+	Example of Fortran bindings.
+@end table
+
+More advanced examples include:
+
+@table @asis
+@item @code{filters/}:
+	Examples using filters, as shown in @ref{Partitioning Data}.
+@item @code{lu/}:
+	LU matrix factorization, see for instance @code{xlu_implicit.c}
+@item @code{cholesky/}:
+	Cholesky matrix factorization, see for instance @code{cholesky_implicit.c}.
+@end table

File diff suppressed because it is too large
+ 1353 - 0
doc/basic-api.texi


+ 632 - 0
doc/basic-examples.texi

@@ -0,0 +1,632 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@node Basic Examples
+@chapter Basic Examples
+
+@menu
+* Compiling and linking options::  
+* Hello World::                 Submitting Tasks
+* Scaling a Vector::            Manipulating Data
+* Vector Scaling on an Hybrid CPU/GPU Machine::  Handling Heterogeneous Architectures
+@end menu
+
+@node Compiling and linking options
+@section Compiling and linking options
+
+Let's suppose StarPU has been installed in the directory
+@code{$STARPU_DIR}. As explained in @ref{Setting flags for compiling and linking applications},
+the variable @code{PKG_CONFIG_PATH} needs to be set. It is also
+necessary to set the variable @code{LD_LIBRARY_PATH} to locate dynamic
+libraries at runtime.
+
+@example
+% export PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
+% export LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
+@end example
+
+The Makefile could for instance contain the following lines to define which
+options must be given to the compiler and to the linker:
+
+@cartouche
+@example
+CFLAGS          +=      $$(pkg-config --cflags libstarpu)
+LDFLAGS         +=      $$(pkg-config --libs libstarpu)
+@end example
+@end cartouche
+
+@node Hello World
+@section Hello World
+
+@menu
+* Required Headers::            
+* Defining a Codelet::          
+* Submitting a Task::           
+* Execution of Hello World::    
+@end menu
+
+In this section, we show how to implement a simple program that submits a task to StarPU.
+
+@node Required Headers
+@subsection Required Headers
+
+The @code{starpu.h} header should be included in any code using StarPU.
+
+@cartouche
+@smallexample
+#include <starpu.h>
+@end smallexample
+@end cartouche
+
+
+@node Defining a Codelet
+@subsection Defining a Codelet
+
+@cartouche
+@smallexample
+struct params @{
+    int i;
+    float f;
+@};
+void cpu_func(void *buffers[], void *cl_arg)
+@{
+    struct params *params = cl_arg;
+
+    printf("Hello world (params = @{%i, %f@} )\n", params->i, params->f);
+@}
+
+starpu_codelet cl =
+@{
+    .where = STARPU_CPU,
+    .cpu_func = cpu_func,
+    .nbuffers = 0
+@};
+@end smallexample
+@end cartouche
+
+A codelet is a structure that represents a computational kernel. Such a codelet
+may contain an implementation of the same kernel on different architectures
+(e.g. CUDA, Cell's SPU, x86, ...).
+
+The @code{nbuffers} field specifies the number of data buffers that are
+manipulated by the codelet: here the codelet does not access or modify any data
+that is controlled by our data management library. Note that the argument
+passed to the codelet (the @code{cl_arg} field of the @code{starpu_task}
+structure) does not count as a buffer since it is not managed by our data
+management library, but just contains trivial parameters.
+
+@c TODO need a crossref to the proper description of "where" see bla for more ...
+We create a codelet which may only be executed on the CPUs. The @code{where}
+field is a bitmask that defines where the codelet may be executed. Here, the
+@code{STARPU_CPU} value means that only CPUs can execute this codelet
+(@pxref{Codelets and Tasks} for more details on this field).
+When a CPU core executes a codelet, it calls the @code{cpu_func} function,
+which @emph{must} have the following prototype:
+
+@code{void (*cpu_func)(void *buffers[], void *cl_arg);}
+
+In this example, we can ignore the first argument of this function which gives a
+description of the input and output buffers (e.g. the size and the location of
+the matrices) since there is none.
+The second argument is a pointer to a buffer passed as an
+argument to the codelet by the means of the @code{cl_arg} field of the
+@code{starpu_task} structure.
+
+@c TODO rewrite so that it is a little clearer ?
+Be aware that this may be a pointer to a
+@emph{copy} of the actual buffer, and not the pointer given by the programmer:
+if the codelet modifies this buffer, there is no guarantee that the initial
+buffer will be modified as well: this for instance implies that the buffer
+cannot be used as a synchronization medium. If synchronization is needed, data
+has to be registered to StarPU, see @ref{Scaling a Vector}.
+
+@node Submitting a Task
+@subsection Submitting a Task
+
+@cartouche
+@smallexample
+void callback_func(void *callback_arg)
+@{
+    printf("Callback function (arg %x)\n", callback_arg);
+@}
+
+int main(int argc, char **argv)
+@{
+    /* @b{initialize StarPU} */
+    starpu_init(NULL);
+
+    struct starpu_task *task = starpu_task_create();
+
+    task->cl = &cl; /* @b{Pointer to the codelet defined above} */
+
+    struct params params = @{ 1, 2.0f @};
+    task->cl_arg = &params;
+    task->cl_arg_size = sizeof(params);
+
+    task->callback_func = callback_func;
+    task->callback_arg = (void*) 0x42;
+
+    /* @b{starpu_task_submit will be a blocking call} */
+    task->synchronous = 1;
+
+    /* @b{submit the task to StarPU} */
+    starpu_task_submit(task);
+
+    /* @b{terminate StarPU} */
+    starpu_shutdown();
+
+    return 0;
+@}
+@end smallexample
+@end cartouche
+
+Before submitting any tasks to StarPU, @code{starpu_init} must be called. The
+@code{NULL} argument specifies that we use default configuration. Tasks cannot
+be submitted after the termination of StarPU by a call to
+@code{starpu_shutdown}.
+
+In the example above, a task structure is allocated by a call to
+@code{starpu_task_create}. This function only allocates and fills the
+corresponding structure with the default settings (@pxref{Codelets and
+Tasks, starpu_task_create}), but it does not submit the task to StarPU.
+
+@c not really clear ;)
+The @code{cl} field is a pointer to the codelet which the task will
+execute: in other words, the codelet structure describes which computational
+kernel should be offloaded on the different architectures, and the task
+structure is a wrapper containing a codelet and the piece of data on which the
+codelet should operate.
+
+The optional @code{cl_arg} field is a pointer to a buffer (of size
+@code{cl_arg_size}) with some parameters for the kernel
+described by the codelet. For instance, if a codelet implements a computational
+kernel that multiplies its input vector by a constant, the constant could be
+specified by the means of this buffer, instead of registering it as a StarPU
+data. It must however be noted that StarPU avoids making copies whenever possible
+and rather passes the pointer as such, so the buffer which is pointed at must be
+kept allocated until the task terminates, and if several tasks are submitted
+with various parameters, each of them must be given a pointer to its own
+buffer.
+
+Once a task has been executed, an optional callback function is called.
+While the computational kernel could be offloaded on various architectures, the
+callback function is always executed on a CPU. The @code{callback_arg}
+pointer is passed as an argument of the callback. The prototype of a callback
+function must be:
+
+@code{void (*callback_function)(void *);}
+
+If the @code{synchronous} field is non-zero, task submission will be
+synchronous: the @code{starpu_task_submit} function will not return until the
+task has been executed. Note that the @code{starpu_shutdown} method does not
+guarantee that asynchronous tasks have been executed before it returns;
+@code{starpu_task_wait_for_all} can be used to that effect, or data can be
+unregistered (@code{starpu_data_unregister(vector_handle);}), which will
+implicitly wait for all the tasks scheduled to work on it, unless explicitly
+disabled thanks to @code{starpu_data_set_default_sequential_consistency_flag} or
+@code{starpu_data_set_sequential_consistency_flag}.
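+
+For instance, several asynchronous tasks can be submitted in a row and then
+waited for at once (a sketch):
+
+@cartouche
+@smallexample
+int i;
+for (i = 0; i < 10; i++) @{
+    struct starpu_task *task = starpu_task_create();
+    task->cl = &cl;
+    task->synchronous = 0; /* submission returns immediately */
+    starpu_task_submit(task);
+@}
+
+/* Block until all submitted tasks have completed */
+starpu_task_wait_for_all();
+@end smallexample
+@end cartouche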
+
+@node Execution of Hello World
+@subsection Execution of Hello World
+
+@smallexample
+% make hello_world
+cc $(pkg-config --cflags libstarpu)  $(pkg-config --libs libstarpu) hello_world.c -o hello_world
+% ./hello_world
+Hello world (params = @{1, 2.000000@} )
+Callback function (arg 42)
+@end smallexample
+
+@node Scaling a Vector
+@section Manipulating Data: Scaling a Vector
+
+The previous example has shown how to submit tasks. In this section,
+we show how StarPU tasks can manipulate data. The full source code for
+this example is given in @ref{Full source code for the 'Scaling a Vector' example}.
+
+@menu
+* Source code of Vector Scaling::  
+* Execution of Vector Scaling::  
+@end menu
+
+@node Source code of Vector Scaling
+@subsection Source code of Vector Scaling
+
+Programmers can describe the data layout of their application so that StarPU is
+responsible for enforcing data coherency and availability across the machine.
+Instead of handling complex (and non-portable) mechanisms to perform data
+movements, programmers only declare which piece of data is accessed and/or
+modified by a task, and StarPU makes sure that when a computational kernel
+starts somewhere (e.g. on a GPU), its data are available locally.
+
+Before submitting those tasks, the programmer first needs to declare the
+different pieces of data to StarPU using the @code{starpu_*_data_register}
+functions. To ease the development of applications for StarPU, it is possible
+to describe multiple types of data layout. A type of data layout is called an
+@b{interface}. There are different predefined interfaces available in StarPU:
+here we will consider the @b{vector interface}.
+
+The following lines show how to declare an array of @code{NX} elements of type
+@code{float} using the vector interface:
+
+@cartouche
+@smallexample
+float vector[NX];
+
+starpu_data_handle vector_handle;
+starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
+                            sizeof(vector[0]));
+@end smallexample
+@end cartouche
+
+The first argument, called the @b{data handle}, is an opaque pointer which
+designates the array in StarPU. This is also the structure which is used to
+describe which data is used by a task. The second argument is the node number
+where the data originally resides. Here it is 0 since the @code{vector} array is in
+the main memory. Then comes the pointer @code{vector} where the data can be found in main memory,
+the number of elements in the vector and the size of each element.
+The following shows how to construct a StarPU task that will manipulate the
+vector and a constant factor.
+
+@cartouche
+@smallexample
+float factor = 3.14;
+struct starpu_task *task = starpu_task_create();
+
+task->cl = &cl;                          /* @b{Pointer to the codelet defined below} */
+task->buffers[0].handle = vector_handle; /* @b{First parameter of the codelet} */
+task->buffers[0].mode = STARPU_RW;
+task->cl_arg = &factor;
+task->cl_arg_size = sizeof(factor);
+task->synchronous = 1;
+
+starpu_task_submit(task);
+@end smallexample
+@end cartouche
+
+Since the factor is a mere constant float value parameter,
+it does not need a preliminary registration, and
+can just be passed through the @code{cl_arg} pointer like in the previous
+example.  The vector parameter is described by its handle.
+There are two fields in each element of the @code{buffers} array.
+@code{handle} is the handle of the data, and @code{mode} specifies how the
+kernel will access the data (@code{STARPU_R} for read-only, @code{STARPU_W} for
+write-only and @code{STARPU_RW} for read and write access).
+
+The definition of the codelet can be written as follows:
+
+@cartouche
+@smallexample
+void scal_cpu_func(void *buffers[], void *cl_arg)
+@{
+    unsigned i;
+    float *factor = cl_arg;
+
+    /* length of the vector */
+    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+    /* CPU copy of the vector pointer */
+    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
+
+    for (i = 0; i < n; i++)
+        val[i] *= *factor;
+@}
+
+starpu_codelet cl = @{
+    .where = STARPU_CPU,
+    .cpu_func = scal_cpu_func,
+    .nbuffers = 1
+@};
+@end smallexample
+@end cartouche
+
+The first argument is an array that gives
+a description of all the buffers passed in the @code{task->buffers} array. The
+size of this array is given by the @code{nbuffers} field of the codelet
+structure. For the sake of genericity, this array contains pointers to the
+different interfaces describing each buffer.  In the case of the @b{vector
+interface}, the location of the vector (resp. its length) is accessible in the
+@code{ptr} (resp. @code{nx}) field of this array. Since the vector is accessed in a
+read-write fashion, any modification will automatically affect future accesses
+to this vector made by other tasks.
+
+The second argument of the @code{scal_cpu_func} function contains a pointer to the
+parameters of the codelet (given in @code{task->cl_arg}), so that we read the
+constant factor from this pointer.
+
+@node Execution of Vector Scaling
+@subsection Execution of Vector Scaling
+
+@smallexample
+% make vector_scal
+cc $(pkg-config --cflags libstarpu)  $(pkg-config --libs libstarpu)  vector_scal.c   -o vector_scal
+% ./vector_scal
+0.000000 3.000000 6.000000 9.000000 12.000000
+@end smallexample
+
+@node Vector Scaling on an Hybrid CPU/GPU Machine
+@section Vector Scaling on an Hybrid CPU/GPU Machine
+
+Contrary to the previous examples, the task submitted in this example may not
+only be executed by the CPUs, but also by a CUDA device.
+
+@menu
+* Definition of the CUDA Kernel::  
+* Definition of the OpenCL Kernel::  
+* Definition of the Main Code::  
+* Execution of Hybrid Vector Scaling::  
+@end menu
+
+@node Definition of the CUDA Kernel
+@subsection Definition of the CUDA Kernel
+
+The CUDA implementation can be written as follows. It needs to be compiled with
+a CUDA compiler such as nvcc, the NVIDIA CUDA compiler driver. It must be noted
+that the vector pointer returned by @code{STARPU_VECTOR_GET_PTR} is here a pointer in GPU
+memory, so that it can be passed as such to the @code{vector_mult_cuda} kernel
+call.
+
+@cartouche
+@smallexample
+#include <starpu.h>
+#include <starpu_cuda.h>
+
+static __global__ void vector_mult_cuda(float *val, unsigned n,
+                                        float factor)
+@{
+    unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+    if (i < n)
+        val[i] *= factor;
+@}
+
+extern "C" void scal_cuda_func(void *buffers[], void *_args)
+@{
+    float *factor = (float *)_args;
+
+    /* length of the vector */
+    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+    /* CUDA copy of the vector pointer */
+    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
+    unsigned threads_per_block = 64;
+    unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
+
+@i{    vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);}
+
+@i{    cudaStreamSynchronize(starpu_cuda_get_local_stream());}
+@}
+@end smallexample
+@end cartouche
+
+@node Definition of the OpenCL Kernel
+@subsection Definition of the OpenCL Kernel
+
+The OpenCL implementation can be written as follows. StarPU provides
+tools to compile an OpenCL kernel stored in a file.
+
+@cartouche
+@smallexample
+__kernel void vector_mult_opencl(__global float* val, int nx, float factor)
+@{
+        const int i = get_global_id(0);
+        if (i < nx) @{
+                val[i] *= factor;
+        @}
+@}
+@end smallexample
+@end cartouche
+
+Similarly to CUDA, the pointer returned by @code{STARPU_VECTOR_GET_PTR} is here
+a device pointer, so that it is passed as such to the OpenCL kernel.
+
+@cartouche
+@smallexample
+#include <starpu.h>
+@i{#include <starpu_opencl.h>}
+
+@i{extern struct starpu_opencl_program programs;}
+
+void scal_opencl_func(void *buffers[], void *_args)
+@{
+    float *factor = _args;
+@i{    int id, devid, err;}
+@i{    cl_kernel kernel;}
+@i{    cl_command_queue queue;}
+@i{    cl_event event;}
+
+    /* length of the vector */
+    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+    /* OpenCL copy of the vector pointer */
+    cl_mem val = (cl_mem) STARPU_VECTOR_GET_PTR(buffers[0]);
+
+@i{    id = starpu_worker_get_id();}
+@i{    devid = starpu_worker_get_devid(id);}
+
+@i{    err = starpu_opencl_load_kernel(&kernel, &queue, &programs,}
+@i{                    "vector_mult_opencl", devid);   /* @b{Name of the codelet defined above} */}
+@i{    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
+
+@i{    err = clSetKernelArg(kernel, 0, sizeof(val), &val);}
+@i{    err |= clSetKernelArg(kernel, 1, sizeof(n), &n);}
+@i{    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);}
+@i{    if (err) STARPU_OPENCL_REPORT_ERROR(err);}
+
+@i{    @{}
+@i{        size_t global=1;}
+@i{        size_t local=1;}
+@i{        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);}
+@i{        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
+@i{    @}}
+
+@i{    clFinish(queue);}
+@i{    starpu_opencl_collect_stats(event);}
+@i{    clReleaseEvent(event);}
+
+@i{    starpu_opencl_release_kernel(kernel);}
+@}
+@end smallexample
+@end cartouche
+
+
+@node Definition of the Main Code
+@subsection Definition of the Main Code
+
+The CPU implementation is the same as in the previous section.
+
+Here is the source of the main application. You can notice the value of the
+field @code{where} for the codelet. We specify
+@code{STARPU_CPU|STARPU_CUDA|STARPU_OPENCL} to indicate to StarPU that the codelet
+can be executed either on a CPU or on a CUDA or an OpenCL device.
+
+@cartouche
+@smallexample
+#include <starpu.h>
+
+#define NX 2048
+
+extern void scal_cuda_func(void *buffers[], void *_args);
+extern void scal_cpu_func(void *buffers[], void *_args);
+extern void scal_opencl_func(void *buffers[], void *_args);
+
+/* @b{Definition of the codelet} */
+static starpu_codelet cl = @{
+    .where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL, /* @b{It can be executed on a CPU,} */
+                                     /* @b{on a CUDA device, or on an OpenCL device} */
+    .cuda_func = scal_cuda_func,
+    .cpu_func = scal_cpu_func,
+    .opencl_func = scal_opencl_func,
+    .nbuffers = 1
+@};
+
+#ifdef STARPU_USE_OPENCL
+/* @b{The compiled version of the OpenCL program} */
+struct starpu_opencl_program programs;
+#endif
+
+int main(int argc, char **argv)
+@{
+    float *vector;
+    int i, ret;
+    float factor=3.0;
+    struct starpu_task *task;
+    starpu_data_handle vector_handle;
+
+    starpu_init(NULL);                            /* @b{Initialising StarPU} */
+
+#ifdef STARPU_USE_OPENCL
+    starpu_opencl_load_opencl_from_file(
+            "examples/basic_examples/vector_scal_opencl_codelet.cl",
+            &programs, NULL);
+#endif
+
+    vector = malloc(NX*sizeof(vector[0]));
+    assert(vector);
+    for(i=0 ; i<NX ; i++) vector[i] = i;
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+    /* @b{Registering data within StarPU} */
+    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
+                                NX, sizeof(vector[0]));
+
+    /* @b{Definition of the task} */
+    task = starpu_task_create();
+    task->cl = &cl;
+    task->buffers[0].handle = vector_handle;
+    task->buffers[0].mode = STARPU_RW;
+    task->cl_arg = &factor;
+    task->cl_arg_size = sizeof(factor);
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+    /* @b{Submitting the task} */
+    ret = starpu_task_submit(task);
+    if (ret == -ENODEV) @{
+            fprintf(stderr, "No worker may execute this task\n");
+            return 1;
+    @}
+
+@c TODO: Mmm, should rather be an unregistration with an implicit dependency, no?
+    /* @b{Waiting for its termination} */
+    starpu_task_wait_for_all();
+
+    /* @b{Update the vector in RAM} */
+    starpu_data_acquire(vector_handle, STARPU_R);
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+    /* @b{Access the data} */
+    for(i=0 ; i<NX; i++) @{
+      fprintf(stderr, "%f ", vector[i]);
+    @}
+    fprintf(stderr, "\n");
+
+    /* @b{Release the RAM view of the data before unregistering it and shutting down StarPU} */
+    starpu_data_release(vector_handle);
+    starpu_data_unregister(vector_handle);
+    starpu_shutdown();
+
+    return 0;
+@}
+@end smallexample
+@end cartouche
+
+@node Execution of Hybrid Vector Scaling
+@subsection Execution of Hybrid Vector Scaling
+
+The Makefile given at the beginning of the section must be extended to
+give the rules to compile the CUDA source code. Note that the source
+file of the OpenCL kernel does not need to be compiled now, it will
+be compiled at run-time when calling the function
+@code{starpu_opencl_load_opencl_from_file()} (@pxref{starpu_opencl_load_opencl_from_file}).
+
+@cartouche
+@smallexample
+CFLAGS	+=	$(shell pkg-config --cflags libstarpu)
+LDFLAGS	+=	$(shell pkg-config --libs libstarpu)
+CC	=	gcc
+
+vector_scal: vector_scal.o vector_scal_cpu.o vector_scal_cuda.o vector_scal_opencl.o
+
+%.o: %.cu
+       nvcc $(CFLAGS) $< -c -o $@
+
+clean:
+       rm -f vector_scal *.o
+@end smallexample
+@end cartouche
+
+@smallexample
+% make
+@end smallexample
+
+and to execute it, with the default configuration:
+
+@smallexample
+% ./vector_scal
+0.000000 3.000000 6.000000 9.000000 12.000000
+@end smallexample
+
+or for example, by disabling CPU devices:
+
+@smallexample
+% STARPU_NCPUS=0 ./vector_scal
+0.000000 3.000000 6.000000 9.000000 12.000000
+@end smallexample
+
+or by disabling CUDA devices (which may allow the use of OpenCL instead,
+see @ref{Enabling OpenCL}):
+
+@smallexample
+% STARPU_NCUDA=0 ./vector_scal
+0.000000 3.000000 6.000000 9.000000 12.000000
+@end smallexample

+ 615 - 0
doc/configuration.texi

@@ -0,0 +1,615 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@node Configuring StarPU
+@chapter Configuring StarPU
+
+@menu
+* Compilation configuration::   
+* Execution configuration through environment variables::  
+@end menu
+
+@node Compilation configuration
+@section Compilation configuration
+
+The following arguments can be given to the @code{configure} script.
+
+@menu
+* Common configuration::        
+* Configuring workers::         
+* Advanced configuration::      
+@end menu
+
+@node Common configuration
+@subsection Common configuration
+
+
+@menu
+* --enable-debug::              
+* --enable-fast::               
+* --enable-verbose::            
+* --enable-coverage::           
+@end menu
+
+@node --enable-debug
+@subsubsection @code{--enable-debug}
+@table @asis
+@item @emph{Description}:
+Enable debugging messages.
+@end table
+
+@node --enable-fast
+@subsubsection @code{--enable-fast}
+@table @asis
+@item @emph{Description}:
+Do not enforce assertions; this saves a lot of the time otherwise spent computing them.
+@end table
+
+@node --enable-verbose
+@subsubsection @code{--enable-verbose}
+@table @asis
+@item @emph{Description}:
+Augment the verbosity of the debugging messages. This can be disabled
+at runtime by setting the environment variable @code{STARPU_SILENT} to
+any value.
+
+@smallexample
+% STARPU_SILENT=1 ./vector_scal
+@end smallexample
+@end table
+
+@node --enable-coverage
+@subsubsection @code{--enable-coverage}
+@table @asis
+@item @emph{Description}:
+Enable flags for the @code{gcov} coverage tool.
+@end table
+
+@node Configuring workers
+@subsection Configuring workers
+
+@menu
+* --enable-maxcpus::         
+* --disable-cpu::               
+* --enable-maxcudadev::         
+* --disable-cuda::              
+* --with-cuda-dir::             
+* --with-cuda-include-dir::             
+* --with-cuda-lib-dir::             
+* --disable-cuda-memcpy-peer::
+* --enable-maxopencldev::       
+* --disable-opencl::            
+* --with-opencl-dir::           
+* --with-opencl-include-dir::           
+* --with-opencl-lib-dir::           
+* --enable-gordon::             
+* --with-gordon-dir::           
+* --enable-maximplementations::
+@end menu
+
+@node --enable-maxcpus
+@subsubsection @code{--enable-maxcpus=<number>}
+@table @asis
+@item @emph{Description}:
+Defines the maximum number of CPU cores that StarPU will support, then
+available as the @code{STARPU_MAXCPUS} macro.
+@end table
+
+@node --disable-cpu
+@subsubsection @code{--disable-cpu}
+@table @asis
+@item @emph{Description}:
+Disable the use of CPUs of the machine. Only GPUs etc. will be used.
+@end table
+
+@node --enable-maxcudadev
+@subsubsection @code{--enable-maxcudadev=<number>}
+@table @asis
+@item @emph{Description}:
+Defines the maximum number of CUDA devices that StarPU will support, then
+available as the @code{STARPU_MAXCUDADEVS} macro.
+@end table
+
+@node --disable-cuda
+@subsubsection @code{--disable-cuda}
+@table @asis
+@item @emph{Description}:
+Disable the use of CUDA, even if a valid CUDA installation was detected.
+@end table
+
+@node --with-cuda-dir
+@subsubsection @code{--with-cuda-dir=<path>}
+@table @asis
+@item @emph{Description}:
+Specify the directory where CUDA is installed. This directory should notably contain
+@code{include/cuda.h}.
+@end table
+
+@node --with-cuda-include-dir
+@subsubsection @code{--with-cuda-include-dir=<path>}
+@table @asis
+@item @emph{Description}:
+Specify the directory where CUDA headers are installed. This directory should
+notably contain @code{cuda.h}. This defaults to @code{/include} appended to the
+value given to @code{--with-cuda-dir}.
+@end table
+
+@node --with-cuda-lib-dir
+@subsubsection @code{--with-cuda-lib-dir=<path>}
+@table @asis
+@item @emph{Description}:
+Specify the directory where the CUDA library is installed. This directory should
+notably contain the CUDA shared libraries (e.g. libcuda.so). This defaults to
+@code{/lib} appended to the value given to @code{--with-cuda-dir}.
+
+@end table
+
+@node --disable-cuda-memcpy-peer
+@subsubsection @code{--disable-cuda-memcpy-peer}
+@table @asis
+@item @emph{Description}:
+Explicitly disable peer transfers when using CUDA 4.0.
+@end table
+
+@node --enable-maxopencldev
+@subsubsection @code{--enable-maxopencldev=<number>}
+@table @asis
+@item @emph{Description}:
+Defines the maximum number of OpenCL devices that StarPU will support, then
+available as the @code{STARPU_MAXOPENCLDEVS} macro.
+@end table
+
+@node --disable-opencl
+@subsubsection @code{--disable-opencl}
+@table @asis
+@item @emph{Description}:
+Disable the use of OpenCL, even if the SDK is detected.
+@end table
+
+@node --with-opencl-dir
+@subsubsection @code{--with-opencl-dir=<path>}
+@table @asis
+@item @emph{Description}:
+Specify the location of the OpenCL SDK. This directory should notably contain
+@code{include/CL/cl.h} (or @code{include/OpenCL/cl.h} on Mac OS).
+@end table
+
+@node --with-opencl-include-dir
+@subsubsection @code{--with-opencl-include-dir=<path>}
+@table @asis
+@item @emph{Description}:
+Specify the location of OpenCL headers. This directory should notably contain
+@code{CL/cl.h} (or @code{OpenCL/cl.h} on Mac OS). This defaults to
+@code{/include} appended to the value given to @code{--with-opencl-dir}.
+
+@end table
+
+@node --with-opencl-lib-dir
+@subsubsection @code{--with-opencl-lib-dir=<path>}
+@table @asis
+@item @emph{Description}:
+Specify the location of the OpenCL library. This directory should notably
+contain the OpenCL shared libraries (e.g. libOpenCL.so). This defaults to
+@code{/lib} appended to the value given to @code{--with-opencl-dir}.
+@end table
+
+@node --enable-gordon
+@subsubsection @code{--enable-gordon}
+@table @asis
+@item @emph{Description}:
+Enable the use of the Gordon runtime for Cell SPUs.
+@c TODO: rather default to enabled when detected
+@end table
+
+@node --with-gordon-dir
+@subsubsection @code{--with-gordon-dir=<path>}
+@table @asis
+@item @emph{Description}:
+Specify the location of the Gordon SDK.
+@end table
+
+@node --enable-maximplementations
+@subsubsection @code{--enable-maximplementations=<number>}
+@table @asis
+@item @emph{Description}:
+Defines the maximum number of implementations that can be defined for a single kind of
+device. It is then available as the @code{STARPU_MAXIMPLEMENTATIONS} macro.
+@end table
+
+@node Advanced configuration
+@subsection Advanced configuration
+
+@menu
+* --enable-perf-debug::         
+* --enable-model-debug::        
+* --enable-stats::              
+* --enable-maxbuffers::         
+* --enable-allocation-cache::   
+* --enable-opengl-render::      
+* --enable-blas-lib::           
+* --with-magma::                
+* --with-fxt::                  
+* --with-perf-model-dir::       
+* --with-mpicc::                
+* --with-goto-dir::             
+* --with-atlas-dir::            
+* --with-mkl-cflags::
+* --with-mkl-ldflags::
+@end menu
+
+@node --enable-perf-debug
+@subsubsection @code{--enable-perf-debug}
+@table @asis
+@item @emph{Description}:
+Enable performance debugging through gprof.
+@end table
+
+@node --enable-model-debug
+@subsubsection @code{--enable-model-debug}
+@table @asis
+@item @emph{Description}:
+Enable performance model debugging.
+@end table
+
+@node --enable-stats
+@subsubsection @code{--enable-stats}
+@table @asis
+@item @emph{Description}:
+Enable statistics.
+@end table
+
+@node --enable-maxbuffers
+@subsubsection @code{--enable-maxbuffers=<nbuffers>}
+@table @asis
+@item @emph{Description}:
+Define the maximum number of buffers that tasks will be able to take
+as parameters, then available as the @code{STARPU_NMAXBUFS} macro.
+@end table
+
+@node --enable-allocation-cache
+@subsubsection @code{--enable-allocation-cache}
+@table @asis
+@item @emph{Description}:
+Enable the use of a data allocation cache to avoid the cost of memory
+allocation with CUDA. Still experimental.
+@end table
+
+@node --enable-opengl-render
+@subsubsection @code{--enable-opengl-render}
+@table @asis
+@item @emph{Description}:
+Enable the use of OpenGL for the rendering of some examples.
+@c TODO: rather default to enabled when detected
+@end table
+
+@node --enable-blas-lib
+@subsubsection @code{--enable-blas-lib=<name>}
+@table @asis
+@item @emph{Description}:
+Specify the BLAS library to be used by some of the examples. The
+library has to be 'atlas' or 'goto'.
+@end table
+
+@node --with-magma
+@subsubsection @code{--with-magma=<path>}
+@table @asis
+@item @emph{Description}:
+Specify where MAGMA is installed. This directory should notably contain
+@code{include/magmablas.h}.
+@end table
+
+@node --with-fxt
+@subsubsection @code{--with-fxt=<path>}
+@table @asis
+@item @emph{Description}:
+Specify the location of FxT (for generating traces and rendering them
+using ViTE). This directory should notably contain
+@code{include/fxt/fxt.h}.
+@c TODO add ref to other section
+@end table
+
+@node --with-perf-model-dir
+@subsubsection @code{--with-perf-model-dir=<dir>}
+@table @asis
+@item @emph{Description}:
+Specify where performance models should be stored (instead of defaulting to the
+current user's home).
+@end table
+
+@node --with-mpicc
+@subsubsection @code{--with-mpicc=<path to mpicc>}
+@table @asis
+@item @emph{Description}:
+Specify the location of the @code{mpicc} compiler to be used for starpumpi.
+@end table
+
+@node --with-goto-dir
+@subsubsection @code{--with-goto-dir=<dir>}
+@table @asis
+@item @emph{Description}:
+Specify the location of GotoBLAS.
+@end table
+
+@node --with-atlas-dir
+@subsubsection @code{--with-atlas-dir=<dir>}
+@table @asis
+@item @emph{Description}:
+Specify the location of ATLAS. This directory should notably contain
+@code{include/cblas.h}.
+@end table
+
+@node --with-mkl-cflags
+@subsubsection @code{--with-mkl-cflags=<cflags>}
+@table @asis
+@item @emph{Description}:
+Specify the compilation flags for the MKL Library.
+@end table
+
+@node --with-mkl-ldflags
+@subsubsection @code{--with-mkl-ldflags=<ldflags>}
+@table @asis
+@item @emph{Description}:
+Specify the linking flags for the MKL Library. Note that the
+@url{http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/}
+website provides a script to determine the linking flags.
+@end table
+
+@node Execution configuration through environment variables
+@section Execution configuration through environment variables
+
+@menu
+* Workers::                     Configuring workers
+* Scheduling::                  Configuring the Scheduling engine
+* Misc::                        Miscellaneous and debug
+@end menu
+
+Note: the values given in the @code{starpu_conf} structure passed when
+calling @code{starpu_init} will override the values of the environment
+variables.
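+
+For instance, a minimal sketch (assuming here that @code{starpu_conf_init} is
+available in your StarPU version to fill the structure with default values,
+and that the @code{ncpus} and @code{sched_policy_name} fields exist as named):
+
+@example
+struct starpu_conf conf;
+starpu_conf_init(&conf);          /* start from the default configuration */
+conf.ncpus = 2;                   /* takes precedence over STARPU_NCPUS */
+conf.sched_policy_name = "dmda";  /* takes precedence over STARPU_SCHED */
+starpu_init(&conf);
+@end example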
+
+@node Workers
+@subsection Configuring workers
+
+@menu
+* STARPU_NCPUS::                Number of CPU workers
+* STARPU_NCUDA::                Number of CUDA workers
+* STARPU_NOPENCL::              Number of OpenCL workers
+* STARPU_NGORDON::              Number of SPU workers (Cell)
+* STARPU_WORKERS_CPUID::        Bind workers to specific CPUs
+* STARPU_WORKERS_CUDAID::       Select specific CUDA devices
+* STARPU_WORKERS_OPENCLID::     Select specific OpenCL devices
+@end menu
+
+@node STARPU_NCPUS
+@subsubsection @code{STARPU_NCPUS} -- Number of CPU workers
+@table @asis
+
+@item @emph{Description}:
+Specify the number of CPU workers (thus not including workers dedicated to
+control accelerators). Note that by default, StarPU will not allocate more CPU
+workers than there are physical CPUs, and that some CPUs are used to control
+the accelerators.
+
+@end table
+
+@node STARPU_NCUDA
+@subsubsection @code{STARPU_NCUDA} -- Number of CUDA workers
+@table @asis
+
+@item @emph{Description}:
+Specify the number of CUDA devices that StarPU can use. If
+@code{STARPU_NCUDA} is lower than the number of physical devices, it is
+possible to select which CUDA devices should be used by the means of the
+@code{STARPU_WORKERS_CUDAID} environment variable. By default, StarPU will
+create as many CUDA workers as there are CUDA devices.
+
+@end table
+
+@node STARPU_NOPENCL
+@subsubsection @code{STARPU_NOPENCL} -- Number of OpenCL workers
+@table @asis
+
+@item @emph{Description}:
+OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
+@end table
+
+@node STARPU_NGORDON
+@subsubsection @code{STARPU_NGORDON} -- Number of SPU workers (Cell)
+@table @asis
+
+@item @emph{Description}:
+Specify the number of SPUs that StarPU can use.
+@end table
+
+
+@node STARPU_WORKERS_CPUID
+@subsubsection @code{STARPU_WORKERS_CPUID} -- Bind workers to specific CPUs
+@table @asis
+
+@item @emph{Description}:
+Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
+specifies on which logical CPU the different workers should be
+bound. For instance, if @code{STARPU_WORKERS_CPUID = "0 1 4 5"}, the first
+worker will be bound to logical CPU #0, the second CPU worker will be bound to
+logical CPU #1 and so on.  Note that the logical ordering of the CPUs is either
+determined by the OS, or provided by the @code{hwloc} library in case it is
+available.
+
+Note that the first workers correspond to the CUDA workers, then come the
+OpenCL and the SPU, and finally the CPU workers. For example if
+we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPUS=2}
+and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
+by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
+the logical CPUs #1 and #3 will be used by the CPU workers.
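+
+Such a binding can for instance be given directly on the command line when
+launching the application:
+
+@example
+% STARPU_NCUDA=1 STARPU_NOPENCL=1 STARPU_NCPUS=2 \
+  STARPU_WORKERS_CPUID="0 2 1 3" ./application
+@end example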
+
+If the number of workers is larger than the array given in
+@code{STARPU_WORKERS_CPUID}, the workers are bound to the logical CPUs in a
+round-robin fashion: if @code{STARPU_WORKERS_CPUID = "0 1"}, the first and the
+third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
+
+This variable is ignored if the @code{use_explicit_workers_bindid} flag of the
+@code{starpu_conf} structure passed to @code{starpu_init} is set.
+
+@end table
+
+@node STARPU_WORKERS_CUDAID
+@subsubsection @code{STARPU_WORKERS_CUDAID} -- Select specific CUDA devices
+@table @asis
+
+@item @emph{Description}:
+Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
+possible to select which CUDA devices should be used by StarPU. On a machine
+equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
+@code{STARPU_NCUDA=2} specifies that 2 CUDA workers should be created, and that
+they should use CUDA devices #1 and #3 (the logical ordering of the devices is
+the one reported by CUDA).
+
+This variable is ignored if the @code{use_explicit_workers_cuda_gpuid} flag of
+the @code{starpu_conf} structure passed to @code{starpu_init} is set.
+@end table
+
+@node STARPU_WORKERS_OPENCLID
+@subsubsection @code{STARPU_WORKERS_OPENCLID} -- Select specific OpenCL devices
+@table @asis
+
+@item @emph{Description}:
+OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
+
+This variable is ignored if the @code{use_explicit_workers_opencl_gpuid} flag of
+the @code{starpu_conf} structure passed to @code{starpu_init} is set.
+@end table
+
+@node Scheduling
+@subsection Configuring the Scheduling engine
+
+@menu
+* STARPU_SCHED::                Scheduling policy
+* STARPU_CALIBRATE::            Calibrate performance models
+* STARPU_PREFETCH::             Use data prefetch
+* STARPU_SCHED_ALPHA::          Computation factor
+* STARPU_SCHED_BETA::           Communication factor
+@end menu
+
+@node STARPU_SCHED
+@subsubsection @code{STARPU_SCHED} -- Scheduling policy
+@table @asis
+
+@item @emph{Description}:
+
+This chooses between the different scheduling policies proposed by StarPU:
+random, work stealing, greedy, with performance models, etc.
+
+Use @code{STARPU_SCHED=help} to get the list of available schedulers.
+
+@end table
+
+@node STARPU_CALIBRATE
+@subsubsection @code{STARPU_CALIBRATE} -- Calibrate performance models
+@table @asis
+
+@item @emph{Description}:
+If this variable is set to 1, the performance models are calibrated during
+the execution. If it is set to 2, the previous values are dropped to restart
+calibration from scratch. Setting this variable to 0 disables calibration;
+this is the default behaviour.
+
+Note: this currently only applies to @code{dm}, @code{dmda} and @code{heft} scheduling policies.
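+
+For instance, a typical calibration run with a performance-model-based
+scheduler looks like:
+
+@example
+% STARPU_CALIBRATE=1 STARPU_SCHED=dmda ./application
+@end example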
+
+@end table
+
+@node STARPU_PREFETCH
+@subsubsection @code{STARPU_PREFETCH} -- Use data prefetch
+@table @asis
+
+@item @emph{Description}:
+This variable indicates whether data prefetching should be enabled (0 means
+that it is disabled). If prefetching is enabled, when a task is scheduled to be
+executed e.g. on a GPU, StarPU will request an asynchronous transfer in
+advance, so that data is already present on the GPU when the task starts. As a
+result, computation and data transfers are overlapped.
+Note that prefetching is enabled by default in StarPU.
+
+@end table
+
+@node STARPU_SCHED_ALPHA
+@subsubsection @code{STARPU_SCHED_ALPHA} -- Computation factor
+@table @asis
+
+@item @emph{Description}:
+To estimate the cost of a task StarPU takes into account the estimated
+computation time (obtained thanks to performance models). The alpha factor is
+the coefficient to be applied to it before adding it to the communication part.
+
+@end table
+
+@node STARPU_SCHED_BETA
+@subsubsection @code{STARPU_SCHED_BETA} -- Communication factor
+@table @asis
+
+@item @emph{Description}:
+To estimate the cost of a task StarPU takes into account the estimated
+data transfer time (obtained thanks to performance models). The beta factor is
+the coefficient to be applied to it before adding it to the computation part.
+
+@end table
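+
+Put together, schedulers based on performance models (such as @code{dm} and
+@code{dmda}) thus essentially minimize, for each task and candidate worker, a
+cost of the form:
+
+@example
+cost = alpha * estimated_computation_time
+     + beta  * estimated_data_transfer_time
+@end example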
+
+@node Misc
+@subsection Miscellaneous and debug
+
+@menu
+* STARPU_SILENT::               Disable verbose mode
+* STARPU_LOGFILENAME::          Select debug file name
+* STARPU_FXT_PREFIX::           FxT trace location
+* STARPU_LIMIT_GPU_MEM::        Restrict memory size on the GPUs
+* STARPU_GENERATE_TRACE::       Generate a Paje trace when StarPU is shut down
+@end menu
+
+@node STARPU_SILENT
+@subsubsection @code{STARPU_SILENT} -- Disable verbose mode
+@table @asis
+
+@item @emph{Description}:
+This variable allows one to disable verbose mode at runtime when StarPU
+has been configured with the option @code{--enable-verbose}.
+@end table
+
+@node STARPU_LOGFILENAME
+@subsubsection @code{STARPU_LOGFILENAME} -- Select debug file name
+@table @asis
+
+@item @emph{Description}:
+This variable specifies the file in which the debugging output should be saved.
+@end table
+
+@node STARPU_FXT_PREFIX
+@subsubsection @code{STARPU_FXT_PREFIX} -- FxT trace location
+@table @asis
+
+@item @emph{Description}
+This variable specifies the directory in which to save the trace generated
+when FxT is enabled. The value needs to have a trailing '/' character.
+@end table
+
+@node STARPU_LIMIT_GPU_MEM
+@subsubsection @code{STARPU_LIMIT_GPU_MEM} -- Restrict memory size on the GPUs
+@table @asis
+
+@item @emph{Description}
+This variable specifies the maximum number of megabytes that should be
+available to the application on each GPU. If this value is smaller than the
+actual size of a GPU's memory, StarPU pre-allocates a buffer to occupy the
+rest of the memory on the device. This variable is intended to be used for
+experimental purposes as it emulates devices that have a limited amount of
+memory.
+@end table
+
+@node STARPU_GENERATE_TRACE
+@subsubsection @code{STARPU_GENERATE_TRACE} -- Generate a Paje trace when StarPU is shut down
+@table @asis
+
+@item @emph{Description}
+When set to 1, this variable indicates that StarPU should automatically
+generate a Paje trace when @code{starpu_shutdown} is called.
+@end table

+ 181 - 0
doc/installing.texi

@@ -0,0 +1,181 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@node Installing StarPU
+@chapter Installing StarPU
+
+@menu
+* Downloading StarPU::          
+* Configuration of StarPU::     
+* Building and Installing StarPU::  
+@end menu
+
+StarPU can be built and installed by the standard means of the GNU
+autotools. The following chapter briefly recalls how these tools
+can be used to install StarPU.
+
+@node Downloading StarPU
+@section Downloading StarPU
+
+@menu
+* Getting Sources::             
+* Optional dependencies::       
+@end menu
+
+@node Getting Sources
+@subsection Getting Sources
+
+The simplest way to get StarPU sources is to download the latest official
+release tarball from @indicateurl{https://gforge.inria.fr/frs/?group_id=1570},
+or the latest nightly snapshot from
+@indicateurl{http://starpu.gforge.inria.fr/testing/}. The following documents
+how to get the very latest version from the Subversion repository itself; this
+should only be needed if you require the very latest changes (i.e. less than a
+day old).
+
+The source code is managed by a Subversion server hosted by the
+InriaGforge. To get the source code, you need:
+
+@itemize
+@item
+Install the Subversion client if it is
+not already available on your system. The software can be obtained from
+@indicateurl{http://subversion.tigris.org}. If you are running
+on Windows, you will probably prefer to use TortoiseSVN from
+@indicateurl{http://tortoisesvn.tigris.org/}.
+
+@item
+You can check out the project's SVN repository through anonymous
+access. This will provide you with read access to the
+repository.
+
+If you need to have write access on the StarPU project, you can also choose to
+become a member of the project @code{starpu}.  For this, you first need to get
+an account on the gForge server. You can then send a request to join the project
+(@indicateurl{https://gforge.inria.fr/project/request.php?group_id=1570}).
+
+@item
+More information on how to get a gForge account, to become a member of
+a project, or on any other related task can be obtained from the
+InriaGforge at @indicateurl{https://gforge.inria.fr/}. The most important
+thing is to upload your public SSH key on the gForge server (see the
+FAQ at @indicateurl{http://siteadmin.gforge.inria.fr/FAQ.html#Q6} for
+instructions).
+@end itemize
+
+You can now check out the latest version from the Subversion server:
+@itemize
+@item
+using the anonymous access via svn:
+@example
+% svn checkout svn://scm.gforge.inria.fr/svn/starpu/trunk
+@end example
+@item
+using the anonymous access via https:
+@example
+% svn checkout --username anonsvn https://scm.gforge.inria.fr/svn/starpu/trunk
+@end example
+The password is @code{anonsvn}.
+@item
+using your gForge account
+@example
+% svn checkout svn+ssh://<login>@@scm.gforge.inria.fr/svn/starpu/trunk
+@end example
+@end itemize
+
+The following step requires the availability of @code{autoconf} and
+@code{automake} to generate the @code{./configure} script. This is
+done by calling @code{./autogen.sh}. The required version for
+@code{autoconf} is 2.60 or higher. You will also need @code{makeinfo}.
+
+@example
+% ./autogen.sh
+@end example
+
+If the autotools are not available on your machine or not recent
+enough, you can choose to download the latest nightly tarball, which
+is provided with a @code{configure} script.
+
+@example
+% wget http://starpu.gforge.inria.fr/testing/starpu-nightly-latest.tar.gz
+@end example
+
+@node Optional dependencies
+@subsection Optional dependencies
+
+The topology discovery library, @code{hwloc}, is not mandatory to use StarPU
+but is strongly recommended. It makes it possible to increase performance, and
+to perform some topology-aware scheduling.
+
+@code{hwloc} is available in major distributions and for most OSes and can be
+downloaded from @indicateurl{http://www.open-mpi.org/software/hwloc}.
+
+@node Configuration of StarPU
+@section Configuration of StarPU
+
+@menu
+* Generating Makefiles and configuration scripts::  
+* Running the configuration::   
+@end menu
+
+@node Generating Makefiles and configuration scripts
+@subsection Generating Makefiles and configuration scripts
+
+This step is not necessary when using the tarball releases of StarPU.  If you
+are using the source code from the svn repository, you first need to generate
+the configure scripts and the Makefiles.
+
+@example
+% ./autogen.sh
+@end example
+
+@node Running the configuration
+@subsection Running the configuration
+
+@example
+% ./configure
+@end example
+
+Details about options that are useful to give to @code{./configure} are given in
+@ref{Compilation configuration}.
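+
+For instance, to prepare an installation in a custom prefix with CUDA support
+(the paths below are only examples):
+
+@example
+% ./configure --prefix=$HOME/local --with-cuda-dir=/usr/local/cuda
+@end example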
+
+@node Building and Installing StarPU
+@section Building and Installing StarPU
+
+@menu
+* Building::                    
+* Sanity Checks::               
+* Installing::                  
+@end menu
+
+@node Building
+@subsection Building
+
+@example
+% make
+@end example
+
+@node Sanity Checks
+@subsection Sanity Checks
+
+In order to make sure that StarPU is working properly on the system, it is also
+possible to run a test suite.
+
+@example
+% make check
+@end example
+
+@node Installing
+@subsection Installing
+
+In order to install StarPU at the location that was specified during
+configuration:
+
+@example
+% make install
+@end example
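+
+If a custom @code{--prefix} was used, tools relying on @code{pkg-config} need
+to be told where the installed @code{.pc} files are, e.g. assuming the
+@code{$HOME/local} prefix used above:
+
+@example
+% export PKG_CONFIG_PATH=$HOME/local/lib/pkgconfig:$PKG_CONFIG_PATH
+@end example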

+ 168 - 0
doc/introduction.texi

@@ -0,0 +1,168 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@node Introduction
+@chapter Introduction to StarPU
+
+@menu
+* Motivation::                  Why StarPU ?
+* StarPU in a Nutshell::        The Fundamentals of StarPU
+@end menu
+
+@node Motivation
+@section Motivation
+
+@c complex machines with heterogeneous cores/devices
+The use of specialized hardware such as accelerators or coprocessors offers an
+interesting approach to overcome the physical limits encountered by processor
+architects. As a result, many machines are now equipped with one or several
+accelerators (e.g. a GPU), in addition to the usual processor(s). While a lot of
+efforts have been devoted to offloading computation onto such accelerators, very
+little attention has been paid to portability concerns on the one hand, and to
+the possibility of having heterogeneous accelerators and processors interact on
+the other hand.
+
+StarPU is a runtime system that offers support for heterogeneous multicore
+architectures. It not only offers a unified view of the computational resources
+(i.e. CPUs and accelerators at the same time), but also takes care of
+efficiently mapping and executing tasks onto a heterogeneous machine while
+transparently handling low-level issues such as data transfers in a portable
+fashion.
+
+@c this leads to a complicated distributed memory design
+@c which is not (easily) manageable by hand
+
+@c added value/benefits of StarPU
+@c   - portability
+@c   - scheduling, perf. portability
+
+@node StarPU in a Nutshell
+@section StarPU in a Nutshell
+
+@menu
+* Codelet and Tasks::           
+* StarPU Data Management Library::  
+* Glossary::
+* Research Papers::
+@end menu
+
+From a programming point of view, StarPU is not a new language but a library
+that executes tasks explicitly submitted by the application.  The data that a
+task manipulates are automatically transferred onto the accelerator so that the
+programmer does not have to take care of complex data movements.  StarPU also
+takes particular care of scheduling those tasks efficiently and allows
+scheduling experts to implement custom scheduling policies in a portable
+fashion.
+
+@c explain the notion of codelet and task (i.e. g(A, B)
+@node Codelet and Tasks
+@subsection Codelet and Tasks
+
+One of StarPU's primary data structures is the @b{codelet}. A codelet describes a
+computational kernel that can possibly be implemented on multiple architectures
+such as a CPU, a CUDA device or a Cell's SPU.
+
+@c TODO insert illustration f : f_spu, f_cpu, ...
+
+Another important data structure is the @b{task}. Executing a StarPU task
+consists in applying a codelet on a data set, on one of the architectures on
+which the codelet is implemented. A task thus describes the codelet that it
+uses, but also which data are accessed, and how they are
+accessed during the computation (read and/or write).
+StarPU tasks are asynchronous: submitting a task to StarPU is a non-blocking
+operation. The task structure can also specify a @b{callback} function that is
+called once StarPU has properly executed the task. It also contains optional
+fields that the application may use to give hints to the scheduler (such as
+priority levels).
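+
+As an illustration, here is a sketch of a codelet with a single CPU
+implementation operating on one data buffer (@code{scal_cpu_func} being a
+hypothetical kernel function provided by the application):
+
+@cartouche
+@smallexample
+void scal_cpu_func(void *buffers[], void *cl_arg);
+
+starpu_codelet scal_cl = @{
+    .where = STARPU_CPU,        /* only a CPU implementation is provided */
+    .cpu_func = scal_cpu_func,  /* the CPU implementation itself */
+    .nbuffers = 1               /* the codelet accesses one piece of data */
+@};
+@end smallexample
+@end cartouche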
+
+By default, task dependencies are inferred from data dependencies (sequential
+coherence) by StarPU. The application can however disable sequential coherency
+for some data, in which case dependencies can be expressed by hand.
+A task may be identified by a unique 64-bit number chosen by the application
+which we refer to as a @b{tag}.
+Task dependencies can be enforced by hand either by the means of callback functions, by
+submitting other tasks, or by expressing dependencies
+between tags (which can thus correspond to tasks that have not been submitted
+yet).
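+
+Continuing the sketch above, submitting a task that applies this codelet to a
+hypothetical previously-registered data handle @code{vector_handle} could look
+as follows:
+
+@cartouche
+@smallexample
+struct starpu_task *task = starpu_task_create();
+
+task->cl = &scal_cl;                      /* codelet to apply */
+task->buffers[0].handle = vector_handle;  /* data to work on */
+task->buffers[0].mode = STARPU_RW;        /* read and written */
+
+starpu_task_submit(task);                 /* non-blocking submission */
+@end smallexample
+@end cartouche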
+
+@c TODO insert illustration f(Ar, Brw, Cr) + ..
+
+@c DSM
+@node StarPU Data Management Library
+@subsection StarPU Data Management Library
+
+Because StarPU schedules tasks at runtime, data transfers have to be
+done automatically and ``just-in-time'' between processing units,
+relieving the application programmer from explicit data transfers.
+Moreover, to avoid unnecessary transfers, StarPU keeps data
+where it was last needed, even if it was modified there, and it
+allows multiple copies of the same data to reside at the same time on
+several processing units as long as it is not modified.
+
+@node Glossary
+@subsection Glossary
+
+A @b{codelet} records pointers to various implementations of the same
+theoretical function.
+
+A @b{memory node} can be either the main RAM or GPU-embedded memory.
+
+A @b{bus} is a link between memory nodes.
+
+A @b{data handle} keeps track of replicates of the same data (@b{registered} by the
+application) over various memory nodes. The data management library keeps
+them coherent.
+
+The @b{home} memory node of a data handle is the memory node from which the data
+was registered (usually the main memory node).
+
+A @b{task} represents a scheduled execution of a codelet on some data handles.
+
+A @b{tag} is a rendez-vous point. Tasks typically have their own tag, and can
+depend on other tags. The value is chosen by the application.
+
+A @b{worker} executes tasks. There is typically one per CPU computation core and
+one per accelerator (for which a whole CPU core is dedicated).
+
+A @b{driver} drives a given kind of worker. There are currently CPU, CUDA,
+OpenCL and Gordon drivers. They usually start several workers to actually drive
+them.
+
+A @b{performance model} is a (dynamic or static) model of the performance of a
+given codelet. Codelets can have an execution time performance model as well
+as a power consumption performance model.
+
+A data @b{interface} describes the layout of the data: for a vector, a pointer
+to the start, the number of elements and the size of each element; for a matrix,
+a pointer to the start, the number of elements per row, the offset between rows,
+and the size of each element; etc. To access their data, codelet functions are
+given interfaces for the local memory node replicates of the data handles of the
+scheduled task.
+
+@b{Partitioning} data means dividing the data of a given data handle (called
+@b{father}) into a series of @b{children} data handles which designate various
+portions of the former.
+
+A @b{filter} is the function which computes children data handles from a father
+data handle, and thus describes how the partitioning should be done (horizontal,
+vertical, etc.).
+
+@b{Acquiring} a data handle can be done from the main application, to safely
+access the data of a data handle from its home node, without having to
+unregister it.
+
+
+@node Research Papers
+@subsection Research Papers
+
+Research papers about StarPU can be found at
+
+@indicateurl{http://runtime.bordeaux.inria.fr/Publis/Keyword/STARPU.html}
+
+Notably, a good overview is given in the research report
+
+@indicateurl{http://hal.archives-ouvertes.fr/inria-00467677}

+ 411 - 0
doc/mpi-support.texi

@@ -0,0 +1,411 @@
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@node StarPU MPI support
+@chapter StarPU MPI support
+
+The integration of MPI transfers within task parallelism is done in a
+very natural way by the means of asynchronous interactions between the
+application and StarPU.  This is implemented in a separate libstarpumpi library
+which basically provides "StarPU" equivalents of @code{MPI_*} functions, where
+@code{void *} buffers are replaced with @code{starpu_data_handle}s, and all
+GPU-RAM-NIC transfers are handled efficiently by StarPU-MPI.  The user has to
+use the usual @code{mpirun} command of the MPI implementation to start StarPU on
+the different MPI nodes.
+
+An MPI Insert Task function provides an even more seamless transition to a
+distributed application, by automatically issuing all required data transfers
+according to the task graph and an application-provided distribution.
+
+@menu
+* The API::                     
+* Simple Example::              
+* MPI Insert Task Utility::         
+* MPI Collective Operations::         
+@end menu
+
+@node The API
+@section The API
+
+@subsection Compilation
+
+The flags required to compile or link against the MPI layer are then
+accessible with the following commands:
+
+@example
+% pkg-config --cflags libstarpumpi  # options for the compiler
+% pkg-config --libs libstarpumpi    # options for the linker
+@end example
+
+@subsection Initialisation
+
+@deftypefun int starpu_mpi_initialize (void)
+Initializes the starpumpi library. This must be called between calling
+@code{starpu_init} and other @code{starpu_mpi} functions. This
+function does not call @code{MPI_Init}; @code{MPI_Init} has to be called
+beforehand by the application.
+@end deftypefun
+
+@deftypefun int starpu_mpi_initialize_extended (int *@var{rank}, int *@var{world_size})
+Initializes the starpumpi library. This must be called between calling
+@code{starpu_init} and other @code{starpu_mpi} functions.
+This function calls @code{MPI_Init}, and therefore should be preferred
+to the previous one for MPI implementations which are not thread-safe.
+Returns the current MPI node rank and world size.
+@end deftypefun
+
+@deftypefun int starpu_mpi_shutdown (void)
+Cleans up the starpumpi library. This must be called after the last use of
+@code{starpu_mpi} functions and before @code{starpu_shutdown}.
+@code{MPI_Finalize} will be called if StarPU-MPI has been initialized
+by calling @code{starpu_mpi_initialize_extended}.
+@end deftypefun
+
+@subsection Communication
+
+@deftypefun int starpu_mpi_send (starpu_data_handle @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm})
+Performs a blocking send of @var{data_handle} to the node @var{dest} with the
+message tag @var{mpi_tag} within the communicator @var{comm}. This is the
+StarPU-MPI equivalent of @code{MPI_Send}.
+@end deftypefun
+
+@deftypefun int starpu_mpi_recv (starpu_data_handle @var{data_handle}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm}, MPI_Status *@var{status})
+Performs a blocking receive into @var{data_handle} from the node @var{source}
+with the message tag @var{mpi_tag} within the communicator @var{comm}. This is
+the StarPU-MPI equivalent of @code{MPI_Recv}.
+@end deftypefun
+
+@deftypefun int starpu_mpi_isend (starpu_data_handle @var{data_handle}, starpu_mpi_req *@var{req}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm})
+Posts a non-blocking send of @var{data_handle} to the node @var{dest} and
+returns the corresponding request in @var{req}. This is the StarPU-MPI
+equivalent of @code{MPI_Isend}.
+@end deftypefun
+
+@deftypefun int starpu_mpi_irecv (starpu_data_handle @var{data_handle}, starpu_mpi_req *@var{req}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm})
+Posts a non-blocking receive into @var{data_handle} from the node @var{source}
+and returns the corresponding request in @var{req}. This is the StarPU-MPI
+equivalent of @code{MPI_Irecv}.
+@end deftypefun
+
+@deftypefun int starpu_mpi_isend_detached (starpu_data_handle @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm}, void (*@var{callback})(void *), void *@var{arg})
+Posts a send of @var{data_handle} to the node @var{dest} whose completion is
+handled internally by StarPU-MPI: there is no request to wait on. Once the
+transfer is completed, the @var{callback} function is called with the argument
+@var{arg}.
+@end deftypefun
+
+@deftypefun int starpu_mpi_irecv_detached (starpu_data_handle @var{data_handle}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm}, void (*@var{callback})(void *), void *@var{arg})
+Posts a receive into @var{data_handle} from the node @var{source} whose
+completion is handled internally by StarPU-MPI. Once the transfer is
+completed, the @var{callback} function is called with the argument @var{arg}.
+@end deftypefun
+
+@deftypefun int starpu_mpi_wait (starpu_mpi_req *@var{req}, MPI_Status *@var{status})
+Blocks until the transfer associated with @var{req} has completed, like
+@code{MPI_Wait}.
+@end deftypefun
+
+@deftypefun int starpu_mpi_test (starpu_mpi_req *@var{req}, int *@var{flag}, MPI_Status *@var{status})
+Checks whether the transfer associated with @var{req} has completed and sets
+@var{flag} accordingly, like @code{MPI_Test}.
+@end deftypefun
+
+@deftypefun int starpu_mpi_barrier (MPI_Comm @var{comm})
+Blocks until all the processes of the communicator @var{comm} have reached
+this routine, like @code{MPI_Barrier}.
+@end deftypefun
+
+@deftypefun int starpu_mpi_isend_detached_unlock_tag (starpu_data_handle @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm}, starpu_tag @var{tag})
+When the transfer is completed, the tag @var{tag} is unlocked.
+@end deftypefun
+
+@deftypefun int starpu_mpi_irecv_detached_unlock_tag (starpu_data_handle @var{data_handle}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm}, starpu_tag @var{tag})
+Posts a detached receive of @var{data_handle}; when the transfer is completed,
+the tag @var{tag} is unlocked.
+@end deftypefun
+
+@deftypefun int starpu_mpi_isend_array_detached_unlock_tag (unsigned @var{array_size}, starpu_data_handle *@var{data_handle}, int *@var{dest}, int *@var{mpi_tag}, MPI_Comm *@var{comm}, starpu_tag @var{tag})
+Asynchronously sends an array of buffers, and unlocks the tag once all
+of them are transmitted.
+@end deftypefun
+
+@deftypefun int starpu_mpi_irecv_array_detached_unlock_tag (unsigned @var{array_size}, starpu_data_handle *@var{data_handle}, int *@var{source}, int *@var{mpi_tag}, MPI_Comm *@var{comm}, starpu_tag @var{tag})
+Asynchronously receives an array of buffers, and unlocks the tag once all
+of them are received.
+@end deftypefun
+
+@page
+@node Simple Example
+@section Simple Example
+
+@cartouche
+@smallexample
+void increment_token(void)
+@{
+    struct starpu_task *task = starpu_task_create();
+
+    task->cl = &increment_cl;
+    task->buffers[0].handle = token_handle;
+    task->buffers[0].mode = STARPU_RW;
+
+    starpu_task_submit(task);
+@}
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+int main(int argc, char **argv)
+@{
+    int rank, size;
+
+    starpu_init(NULL);
+    starpu_mpi_initialize_extended(&rank, &size);
+
+    starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+
+    unsigned nloops = NITER;
+    unsigned loop;
+
+    unsigned last_loop = nloops - 1;
+    unsigned last_rank = size - 1;
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+    for (loop = 0; loop < nloops; loop++) @{
+        int tag = loop*size + rank;
+
+        if (loop == 0 && rank == 0)
+        @{
+            token = 0;
+            fprintf(stdout, "Start with token value %d\n", token);
+        @}
+        else
+        @{
+            starpu_mpi_irecv_detached(token_handle, (rank+size-1)%size, tag,
+                    MPI_COMM_WORLD, NULL, NULL);
+        @}
+
+        increment_token();
+
+        if (loop == last_loop && rank == last_rank)
+        @{
+            starpu_data_acquire(token_handle, STARPU_R);
+            fprintf(stdout, "Finished : token value %d\n", token);
+            starpu_data_release(token_handle);
+        @}
+        else
+        @{
+            starpu_mpi_isend_detached(token_handle, (rank+1)%size, tag+1,
+                    MPI_COMM_WORLD, NULL, NULL);
+        @}
+    @}
+
+    starpu_task_wait_for_all();
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+    starpu_mpi_shutdown();
+    starpu_shutdown();
+
+    if (rank == last_rank)
+    @{
+        fprintf(stderr, "[%d] token = %d == %d * %d ?\n", rank, token, nloops, size);
+        STARPU_ASSERT(token == nloops*size);
+    @}
+@end smallexample
+@end cartouche
+
+@page
+@node MPI Insert Task Utility
+@section MPI Insert Task Utility
+
+To save the programmer from having to explicitly specify all communications,
+StarPU provides an "MPI Insert Task Utility". The principle is that the
+application decides a distribution of the data over the MPI nodes by
+allocating it and notifying StarPU of that decision, i.e. telling StarPU which
+MPI node "owns" which data. All MPI nodes then process the whole task graph,
+and StarPU automatically determines which node actually executes which task,
+as well as the required MPI transfers.
+
+@deftypefun int starpu_data_set_rank (starpu_data_handle @var{handle}, int @var{mpi_rank})
+Tell StarPU-MPI which MPI node "owns" a given data, that is, the node which will
+always keep an up-to-date value, and will by default execute tasks which write
+to it.
+@end deftypefun
+
+@deftypefun int starpu_data_get_rank (starpu_data_handle @var{handle})
+Returns the last value set by @code{starpu_data_set_rank}.
+@end deftypefun
+
+@deftypefun void starpu_mpi_insert_task (MPI_Comm @var{comm}, starpu_codelet *@var{cl}, ...)
+Create and submit a task corresponding to @var{cl} with the following
+arguments.  The argument list must be zero-terminated.
+
+The arguments following the codelet are of the same types as for the
+function @code{starpu_insert_task} defined in @ref{Insert Task
+Utility}. The extra argument @code{STARPU_EXECUTE_ON_NODE} followed by an
+integer allows one to specify the MPI node on which to execute the codelet. It
+is also possible to specify that the node owning a specific piece of data will
+execute the codelet, by using @code{STARPU_EXECUTE_ON_DATA} followed by a data
+handle.
+
+The internal algorithm is as follows:
+@enumerate
+@item Find out whether we (as an MPI node) are to execute the codelet
+because we own the data to be written to. If different nodes own data
+to be written to, the argument @code{STARPU_EXECUTE_ON_NODE} or
+@code{STARPU_EXECUTE_ON_DATA} has to be used to specify which MPI node will
+execute the task.
+@item Send and receive data as requested. Nodes owning data which need to be
+read by the task are sending them to the MPI node which will execute it. The
+latter receives them.
+@item Execute the codelet. This is done by the MPI node selected in the
+first step of the algorithm.
+@item In the case when different MPI nodes own data to be written to, send
+written data back to their owners.
+@end enumerate
+
+The algorithm also includes a cache mechanism that avoids sending data twice
+to the same MPI node, unless the data has been modified in the meantime.
+
+@end deftypefun
+
+@deftypefun void starpu_mpi_get_data_on_node (MPI_Comm @var{comm}, starpu_data_handle @var{data_handle}, int @var{node})
+Transfers @var{data_handle} to the MPI node @var{node}, so that it holds a
+valid copy of the data.
+@end deftypefun
+
+@page
+
+Here is a stencil example showing how to use @code{starpu_mpi_insert_task}. One
+first needs to define a distribution function which specifies the
+locality of the data. Note that the distribution information needs to
+be given to StarPU by calling @code{starpu_data_set_rank}.
+
+@cartouche
+@smallexample
+/* Returns the MPI node number where data is */
+int my_distrib(int x, int y, int nb_nodes) @{
+  /* Block distrib */
+  return ((int)(x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * sqrt(nb_nodes))) % nb_nodes;
+
+  // /* Other examples useful for other kinds of computations */
+  // /* / distrib */
+  // return (x+y) % nb_nodes;
+
+  // /* Block cyclic distrib */
+  // unsigned side = sqrt(nb_nodes);
+  // return x % side + (y % side) * side;
+@}
+@end smallexample
+@end cartouche
+
+Now the data can be registered within StarPU. Data which are not
+owned but will be needed for computations can be registered through
+the lazy allocation mechanism, i.e. with a @code{home_node} set to -1.
+StarPU will automatically allocate the memory when it is used for the
+first time.
+
+One can note an optimization here (the @code{else if} test): we only register
+data which will be needed by the tasks that we will execute.
+
+@cartouche
+@smallexample
+    unsigned matrix[X][Y];
+    starpu_data_handle data_handles[X][Y];
+
+    for(x = 0; x < X; x++) @{
+        for (y = 0; y < Y; y++) @{
+            int mpi_rank = my_distrib(x, y, size);
+             if (mpi_rank == my_rank)
+                /* Owning data */
+                starpu_variable_data_register(&data_handles[x][y], 0,
+                                              (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
+            else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
+                  || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
+                /* I don't own that index, but will need it for my computations */
+                starpu_variable_data_register(&data_handles[x][y], -1,
+                                              (uintptr_t)NULL, sizeof(unsigned));
+            else
+                /* I know it's useless to allocate anything for this */
+                data_handles[x][y] = NULL;
+            if (data_handles[x][y])
+                starpu_data_set_rank(data_handles[x][y], mpi_rank);
+        @}
+    @}
+@end smallexample
+@end cartouche
+
+Now @code{starpu_mpi_insert_task()} can be called for the different
+steps of the application.
+
+@cartouche
+@smallexample
+    for(loop=0 ; loop<niter; loop++)
+        for (x = 1; x < X-1; x++)
+            for (y = 1; y < Y-1; y++)
+                starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl,
+                                       STARPU_RW, data_handles[x][y],
+                                       STARPU_R, data_handles[x-1][y],
+                                       STARPU_R, data_handles[x+1][y],
+                                       STARPU_R, data_handles[x][y-1],
+                                       STARPU_R, data_handles[x][y+1],
+                                       0);
+    starpu_task_wait_for_all();
+@end smallexample
+@end cartouche
+
+That is, all MPI nodes process the whole task graph, but as mentioned above, for
+each task, only the MPI node which owns the data being written to (here,
+@code{data_handles[x][y]}) will actually run the task. The other MPI nodes will
+automatically send the required data.
+
+@node MPI Collective Operations
+@section MPI Collective Operations
+
+@deftypefun int starpu_mpi_scatter_detached (starpu_data_handle *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm})
+Scatter data among processes of the communicator based on the ownership of
+the data. For each data of the array @var{data_handles}, the
+process @var{root} sends the data to the process owning this data.
+Processes receiving data must have valid data handles to receive them.
+@end deftypefun
+
+@deftypefun int starpu_mpi_gather_detached (starpu_data_handle *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm})
+Gather data from the different processes of the communicator onto the
+process @var{root}. Each process owning data handle in the array
+@var{data_handles} will send them to the process @var{root}. The
+process @var{root} must have valid data handles to receive the data.
+@end deftypefun
+
+@page
+@cartouche
+@smallexample
+if (rank == root)
+@{
+    /* Allocate the vector */
+    vector = malloc(nblocks * sizeof(float *));
+    for(x=0 ; x<nblocks ; x++)
+    @{
+        starpu_malloc((void **)&vector[x], block_size*sizeof(float));
+    @}
+@}
+
+/* Allocate data handles and register data to StarPU */
+data_handles = malloc(nblocks*sizeof(starpu_data_handle *));
+for(x = 0; x < nblocks ;  x++)
+@{
+    int mpi_rank = my_distrib(x, nodes);
+    if (rank == root) @{
+        starpu_vector_data_register(&data_handles[x], 0, (uintptr_t)vector[x],
+                                    block_size, sizeof(float));
+    @}
+    else if ((mpi_rank == rank) || ((rank == mpi_rank+1 || rank == mpi_rank-1))) @{
+        /* I own that index, or I will need it for my computations */
+        starpu_vector_data_register(&data_handles[x], -1, (uintptr_t)NULL,
+                                   block_size, sizeof(float));
+    @}
+    else @{
+        /* I know it's useless to allocate anything for this */
+        data_handles[x] = NULL;
+    @}
+    if (data_handles[x]) @{
+        starpu_data_set_rank(data_handles[x], mpi_rank);
+    @}
+@}
+
+/* Scatter the matrix among the nodes */
+starpu_mpi_scatter_detached(data_handles, nblocks, root, MPI_COMM_WORLD);
+
+/* Calculation */
+for(x = 0; x < nblocks ;  x++) @{
+    if (data_handles[x]) @{
+        int owner = starpu_data_get_rank(data_handles[x]);
+        if (owner == rank) @{
+            starpu_insert_task(&cl, STARPU_RW, data_handles[x], 0);
+        @}
+    @}
+@}
+
+/* Gather the matrix back on the root node */
+starpu_mpi_gather_detached(data_handles, nblocks, root, MPI_COMM_WORLD);
+@end smallexample
+@end cartouche
+
+

+ 368 - 0
doc/perf-feedback.texi

@@ -0,0 +1,368 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@node Performance feedback
+@chapter Performance feedback
+
+@menu
+* On-line::       On-line performance feedback
+* Off-line::      Off-line performance feedback
+* Codelet performance::      Performance of codelets
+@end menu
+
+@node On-line
+@section On-line performance feedback
+
+@menu
+* Enabling monitoring::     Enabling on-line performance monitoring
+* Task feedback::           Per-task feedback
+* Codelet feedback::        Per-codelet feedback
+* Worker feedback::         Per-worker feedback
+* Bus feedback::            Bus-related feedback
+* StarPU-Top::              StarPU-Top interface
+@end menu
+
+@node Enabling monitoring
+@subsection Enabling on-line performance monitoring
+
+In order to enable online performance monitoring, the application can call
+@code{starpu_profiling_status_set(STARPU_PROFILING_ENABLE)}. It is possible to
+detect whether monitoring is already enabled or not by calling
+@code{starpu_profiling_status_get()}. Enabling monitoring also reinitialize all
+previously collected feedback. The @code{STARPU_PROFILING} environment variable
+can also be set to 1 to achieve the same effect.
+
+Likewise, performance monitoring is stopped by calling
+@code{starpu_profiling_status_set(STARPU_PROFILING_DISABLE)}. Note that this
+does not reset the performance counters so that the application may consult
+them later on.
+
+More details about the performance monitoring API are available in section
+@ref{Profiling API}.
+
+@node Task feedback
+@subsection Per-task feedback
+
+If profiling is enabled, a pointer to a @code{starpu_task_profiling_info}
+structure is put in the @code{.profiling_info} field of the @code{starpu_task}
+structure when a task terminates.
+This structure is automatically destroyed when the task structure is destroyed,
+either automatically or by calling @code{starpu_task_destroy}.
+
+The @code{starpu_task_profiling_info} structure indicates the date when the
+task was submitted (@code{submit_time}), started (@code{start_time}), and
+terminated (@code{end_time}), relative to the initialization of
+StarPU with @code{starpu_init}. It also specifies the identifier of the worker
+that has executed the task (@code{workerid}).
+These dates are stored as @code{timespec} structures which the user may convert
+into micro-seconds using the @code{starpu_timing_timespec_to_us} helper
+function.
+
+It is worth noting that the application may directly access this structure from
+the callback executed at the end of the task. The @code{starpu_task} structure
+associated to the callback currently being executed is indeed accessible with
+the @code{starpu_get_current_task()} function.
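+
+For instance, here is a sketch of such a callback, using only the fields and
+helpers described above:
+
+@example
+void my_callback(void *arg)
+@{
+    struct starpu_task *task = starpu_get_current_task();
+    struct starpu_task_profiling_info *info = task->profiling_info;
+
+    /* convert the timespec dates into micro-seconds */
+    double start = starpu_timing_timespec_to_us(&info->start_time);
+    double end = starpu_timing_timespec_to_us(&info->end_time);
+
+    fprintf(stderr, "task ran for %f us on worker %d\n",
+            end - start, info->workerid);
+@}
+@end example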
+
+@node Codelet feedback
+@subsection Per-codelet feedback
+
+The @code{per_worker_stats} field of the @code{starpu_codelet_t} structure is
+an array of counters. The i-th entry of the array is incremented every time a
+task implementing the codelet is executed on the i-th worker.
+This array is not reinitialized when profiling is enabled or disabled.
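+
+For instance, a sketch printing these counters after execution (@code{cl}
+being the application's codelet; the cast assumes the counters are plain
+integer types, and @code{starpu_worker_get_count} is assumed to return the
+number of workers):
+
+@example
+unsigned worker;
+for (worker = 0; worker < starpu_worker_get_count(); worker++)
+    fprintf(stderr, "%lu executions on worker %u\n",
+            (unsigned long) cl.per_worker_stats[worker], worker);
+@end example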
+
+@node Worker feedback
+@subsection Per-worker feedback
+
+The second argument returned by the @code{starpu_worker_get_profiling_info}
+function is a @code{starpu_worker_profiling_info} structure that gives
+statistics about the specified worker. This structure specifies when StarPU
+started collecting profiling information for that worker (@code{start_time}),
+the duration of the profiling measurement interval (@code{total_time}), the
+time spent executing kernels (@code{executing_time}), the time spent sleeping
+because there is no task to execute at all (@code{sleeping_time}), and the
+number of tasks that were executed while profiling was enabled.
+These values give an estimation of the proportion of time spent doing real
+work, and of the time spent either sleeping because there are not enough
+executable tasks, or simply wasted in pure StarPU overhead.
+
+Calling @code{starpu_worker_get_profiling_info} resets the profiling
+information associated to a worker.
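+
+For instance, a sketch computing the proportion of time a worker spent
+executing kernels (assuming the worker identifier is passed as the first
+argument, and that the fields are @code{timespec} structures, as for per-task
+feedback):
+
+@example
+struct starpu_worker_profiling_info info;
+starpu_worker_get_profiling_info(workerid, &info);
+
+double total = starpu_timing_timespec_to_us(&info.total_time);
+double executing = starpu_timing_timespec_to_us(&info.executing_time);
+
+fprintf(stderr, "worker %d spent %.2f%% of its time executing kernels\n",
+        workerid, 100. * executing / total);
+@end example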
+
+When an FxT trace is generated (see @ref{Generating traces}), it is also
+possible to use the @code{starpu_top} script (described in @ref{starpu-top}) to
+generate a graphic showing the evolution of these values over time, for
+the different workers.
+
+@node Bus feedback
+@subsection Bus-related feedback 
+
+TODO
+
+@c how to enable/disable performance monitoring
+
+@c what kind of information do we get ?
+
+The bus speed measured by StarPU can be displayed by using the
+@code{starpu_machine_display} tool, for instance:
+
+@example
+StarPU has found :
+        3 CUDA devices
+                CUDA 0 (Tesla C2050 02:00.0)
+                CUDA 1 (Tesla C2050 03:00.0)
+                CUDA 2 (Tesla C2050 84:00.0)
+from    to RAM          to CUDA 0       to CUDA 1       to CUDA 2
+RAM     0.000000        5176.530428     5176.492994     5191.710722
+CUDA 0  4523.732446     0.000000        2414.074751     2417.379201
+CUDA 1  4523.718152     2414.078822     0.000000        2417.375119
+CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
+@end example
+
+@node StarPU-Top
+@subsection StarPU-Top interface
+
+StarPU-Top is an interface which remotely displays the on-line state of a StarPU
+application and permits the user to change parameters on the fly.
+
+Variables to be monitored can be registered by calling the
+@code{starputop_add_data_boolean}, @code{starputop_add_data_integer},
+@code{starputop_add_data_float} functions, e.g.:
+
+@example
+starputop_data *data = starputop_add_data_integer("mynum", 0, 100, 1);
+@end example
+
+The application should then call @code{starputop_init_and_wait} to give its name
+and wait for StarPU-Top to get a start request from the user. The name is used
+by StarPU-Top to quickly reload a previously-saved layout of parameter display.
+
+@example
+starputop_init_and_wait("the application");
+@end example
+
+The new values can then be provided thanks to
+@code{starputop_update_data_boolean}, @code{starputop_update_data_integer},
+@code{starputop_update_data_float}, e.g.:
+
+@example
+starputop_update_data_integer(data, mynum);
+@end example
+
+Updateable parameters can be registered thanks to @code{starputop_register_parameter_boolean}, @code{starputop_register_parameter_integer}, @code{starputop_register_parameter_float}, e.g.:
+
+@example
+float alpha;
+starputop_register_parameter_float("alpha", &alpha, 0, 10, modif_hook);
+@end example
+
+@code{modif_hook} is a function which will be called when the parameter is being modified; it can for instance print the new value:
+
+@example
+void modif_hook(struct starputop_param_t *d) @{
+    fprintf(stderr,"%s has been modified: %f\n", d->name, alpha);
+@}
+@end example
+
+Task schedulers should notify StarPU-Top when they have decided when a task
+will be scheduled, so that it can be shown in the Gantt chart, for instance:
+
+@example
+starputop_task_prevision(task, workerid, begin, end);
+@end example
+
+Starting StarPU-Top and the application can be done two ways:
+
+@itemize
+@item The application is started by hand on some machine (and thus already
+waiting for the start event). In the Preference dialog of StarPU-Top, the SSH
+checkbox should be unchecked, and the hostname and port (default is 2011) on
+which the application is already running should be specified. Clicking on the
+connection button will thus connect to the already-running application.
+@item StarPU-Top is started first, and clicking on the connection button will
+start the application itself (possibly on a remote machine). The SSH checkbox
+should be checked, and a command line provided, e.g.:
+
+@example
+ssh myserver STARPU_SCHED=heft ./application
+@end example
+
+If port 2011 of the remote machine cannot be accessed directly, an SSH port
+bridge should be added:
+
+@example
+ssh -L 2011:localhost:2011 myserver STARPU_SCHED=heft ./application
+@end example
+
+and "localhost" should be used as IP Address to connect to.
+@end itemize
+
+@node Off-line
+@section Off-line performance feedback
+
+@menu
+* Generating traces::       Generating traces with FxT
+* Gantt diagram::           Creating a Gantt Diagram
+* DAG::                     Creating a DAG with graphviz
+* starpu-top::              Monitoring activity
+@end menu
+
+@node Generating traces
+@subsection Generating traces with FxT
+
+StarPU can use the FxT library (see
+@indicateurl{https://savannah.nongnu.org/projects/fkt/}) to generate traces
+with a limited runtime overhead.
+
+You can either get a tarball:
+@example
+% wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.2.tar.gz
+@end example
+
+or use the FxT library from CVS (autotools are required):
+@example
+% cvs -d :pserver:anonymous@@cvs.sv.gnu.org:/sources/fkt co FxT
+% ./bootstrap
+@end example
+
+Compiling and installing the FxT library in the @code{$FXTDIR} path is
+done following the standard procedure:
+@example
+% ./configure --prefix=$FXTDIR
+% make
+% make install
+@end example
+
+In order to have StarPU generate traces, StarPU should be configured with
+the @code{--with-fxt} option:
+@example
+$ ./configure --with-fxt=$FXTDIR
+@end example
+
+Or you can simply point @code{PKG_CONFIG_PATH} to
+@code{$FXTDIR/lib/pkgconfig} and pass @code{--with-fxt} to @code{./configure}.
+
+When FxT is enabled, a trace is generated when StarPU is terminated by calling
+@code{starpu_shutdown()}. The trace is a binary file whose name has the form
+@code{prof_file_XXX_YYY} where @code{XXX} is the user name, and
+@code{YYY} is the pid of the process that used StarPU. This file is saved in the
+@code{/tmp/} directory by default, or in the directory specified by
+the @code{STARPU_FXT_PREFIX} environment variable.
+
+@node Gantt diagram
+@subsection Creating a Gantt Diagram
+
+When the FxT trace file @code{filename} has been generated, it is possible to
+generate a trace in the Paje format by calling:
+@example
+% starpu_fxt_tool -i filename
+@end example
+
+Or alternatively, setting the @code{STARPU_GENERATE_TRACE} environment variable
+to 1 before application execution will make StarPU do it automatically at
+application shutdown.
+
+This will create a @code{paje.trace} file in the current directory that can be
+inspected with ViTE, an open-source trace visualization tool. More information
+about ViTE is available at @indicateurl{http://vite.gforge.inria.fr/}. It is
+possible to open the @code{paje.trace} file with ViTE by using the following
+command:
+@example
+% vite paje.trace
+@end example
+
+@node DAG
+@subsection Creating a DAG with graphviz
+
+When the FxT trace file @code{filename} has been generated, it is possible to
+generate a task graph in the DOT format by calling:
+@example
+$ starpu_fxt_tool -i filename
+@end example
+
+This will create a @code{dag.dot} file in the current directory. This file is a
+task graph described using the DOT language. It is possible to get a
+graphical output of the graph by using the graphviz library:
+@example
+$ dot -Tpdf dag.dot -o output.pdf
+@end example
+
+@node starpu-top
+@subsection Monitoring activity
+
+When the FxT trace file @code{filename} has been generated, it is possible to
+generate an activity trace by calling:
+@example
+$ starpu_fxt_tool -i filename
+@end example
+
+This will create an @code{activity.data} file in the current
+directory. A profile of the application showing the activity of StarPU
+during the execution of the program can be generated:
+@example
+$ starpu_top activity.data
+@end example
+
+This will create a file named @code{activity.eps} in the current directory.
+This picture is composed of two parts.
+The first part shows the activity of the different workers. The green sections
+indicate which proportion of the time was spent executing kernels on the
+processing unit. The red sections indicate the proportion of time spent in
+StarPU: an important overhead may indicate that the granularity is too
+low, and that bigger tasks may be appropriate to use the processing unit more
+efficiently. The black sections indicate that the processing unit was blocked
+because there was no task to process: this may indicate a lack of parallelism,
+which may be alleviated by creating more tasks when possible.
+
+The second part of the @code{activity.eps} picture is a graph showing the
+evolution of the number of tasks available in the system during the execution.
+Ready tasks are shown in black, and tasks that are submitted but not
+schedulable yet are shown in grey.
+
+@node Codelet performance
+@section Performance of codelets
+
+The performance model of codelets (described in @ref{Performance model example}) can be examined by using the
+@code{starpu_perfmodel_display} tool:
+
+@example
+$ starpu_perfmodel_display -l
+file: <malloc_pinned.hannibal>
+file: <starpu_slu_lu_model_21.hannibal>
+file: <starpu_slu_lu_model_11.hannibal>
+file: <starpu_slu_lu_model_22.hannibal>
+file: <starpu_slu_lu_model_12.hannibal>
+@end example
+
+Here, the codelets of the lu example are available. We can examine the
+performance of the 22 kernel:
+
+@example
+$ starpu_perfmodel_display -s starpu_slu_lu_model_22
+performance model for cpu
+# hash      size       mean          dev           n
+57618ab0    19660800   2.851069e+05  1.829369e+04  109
+performance model for cuda_0
+# hash      size       mean          dev           n
+57618ab0    19660800   1.164144e+04  1.556094e+01  315
+performance model for cuda_1
+# hash      size       mean          dev           n
+57618ab0    19660800   1.164271e+04  1.330628e+01  360
+performance model for cuda_2
+# hash      size       mean          dev           n
+57618ab0    19660800   1.166730e+04  3.390395e+02  456
+@end example
+
+We can see that for the given size, over a sample of a few hundred
+executions, the GPUs are about 20 times faster than the CPUs (numbers are in
+us). The standard deviation is extremely low for the GPUs, and less than 10% for
+the CPUs.
+
+The @code{starpu_regression_display} tool does the same for regression-based
+performance models. It also writes a @code{.gp} file in the current directory,
+to be run in the @code{gnuplot} tool, which shows the corresponding curve.
+

+ 276 - 0
doc/perf-optimization.texi

@@ -0,0 +1,276 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@node Performance optimization
+@chapter How to optimize performance with StarPU
+
+TODO: improve!
+
+@menu
+* Data management::
+* Task submission::
+* Task priorities::
+* Task scheduling policy::
+* Performance model calibration::
+* Task distribution vs Data transfer::
+* Data prefetch::
+* Power-based scheduling::
+* Profiling::
+* CUDA-specific optimizations::
+@end menu
+
+Simply encapsulating application kernels into tasks already makes it possible
+to seamlessly support CPUs and GPUs at the same time. To achieve good
+performance, a few additional changes are needed.
+
+@node Data management
+@section Data management
+
+When the application allocates data, whenever possible it should use the
+@code{starpu_malloc} function, which will ask CUDA or
+OpenCL to make the allocation itself and pin the corresponding allocated
+memory. This is needed to permit asynchronous data transfers, i.e. to let data
+transfers overlap with computations. Otherwise, the trace will show that the
+@code{DriverCopyAsync} state takes a lot of time; this is because CUDA or OpenCL
+then reverts to synchronous transfers.
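+
+A minimal sketch, assuming a vector of @code{NX} floats (@code{NX} being an
+illustrative size, and error checking omitted):
+
+@example
+float *vector;
+/* pinned allocation, so that later transfers can be asynchronous */
+starpu_malloc((void **)&vector, NX * sizeof(float));
+/* ... register the vector, submit tasks, unregister ... */
+starpu_free(vector);
+@end example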
+
+By default, StarPU leaves replicates of data wherever they were used, in case they
+will be re-used by other tasks, thus saving the data transfer time. When some
+task modifies some data, all the other replicates are invalidated, and only the
+processing unit which ran that task will have a valid replicate of the data. If the application knows
+that this data will not be re-used by further tasks, it should advise StarPU to
+immediately replicate it to a desired list of memory nodes (given through a
+bitmask). This is similar to the write-through mode of CPU caches.
+
+@example
+starpu_data_set_wt_mask(img_handle, 1<<0);
+@end example
+
+will for instance request to always automatically transfer a replicate into the
+main memory (node 0), as bit 0 of the write-through bitmask is set.
+
+@example
+starpu_data_set_wt_mask(img_handle, ~0U);
+@end example
+
+will request to always automatically broadcast the updated data to all memory
+nodes.
+
+@node Task submission
+@section Task submission
+
+To let StarPU make online optimizations, tasks should be submitted
+asynchronously as much as possible. Ideally, all the tasks should be
+submitted first, with a single call to @code{starpu_task_wait_for_all} or
+@code{starpu_data_unregister} made afterwards to wait for
+termination. StarPU will then be able to rework the whole schedule, overlap
+computation with communication, manage accelerator local memory usage, etc.
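+
+A minimal sketch of this pattern (assuming @code{ntasks} tasks already created
+and stored in a @code{tasks} array):
+
+@example
+unsigned i;
+for (i = 0; i < ntasks; i++)
+    /* asynchronous submission: returns as soon as the task is queued */
+    starpu_task_submit(tasks[i]);
+/* single synchronization point, once everything is submitted */
+starpu_task_wait_for_all();
+@end example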
+
+@node Task priorities
+@section Task priorities
+
+By default, StarPU will consider the tasks in the order they are submitted by
+the application. If the application programmer knows that some tasks should
+be performed with higher priority (for instance because their output is needed
+by many other tasks and may thus be a bottleneck if not executed early enough), the
+@code{priority} field of the task structure should be set to transmit the
+priority information to StarPU.
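+
+As a minimal sketch (assuming an existing codelet @code{cl}):
+
+@example
+struct starpu_task *task = starpu_task_create();
+task->cl = &cl;
+/* hint that this task should be scheduled as early as possible */
+task->priority = STARPU_MAX_PRIO;
+starpu_task_submit(task);
+@end example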
+
+@node Task scheduling policy
+@section Task scheduling policy
+
+By default, StarPU uses the @code{eager} simple greedy scheduler. This is
+because it provides correct load balance even if the application codelets do not
+have performance models. If your application codelets have performance models
+(@pxref{Performance model example} for examples showing how to do it),
+you should change the scheduler by setting the @code{STARPU_SCHED} environment
+variable, for instance @code{export STARPU_SCHED=dmda}. Setting it to
+@code{help} lists the available schedulers.
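+
+For instance, assuming a StarPU program @code{./application}:
+
+@example
+$ STARPU_SCHED=help ./application  # prints the list of schedulers
+$ STARPU_SCHED=dmda ./application  # runs with the dmda scheduler
+@end example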
+
+The @b{eager} scheduler uses a central task queue, from which workers draw tasks
+to work on. This however does not permit prefetching data, since the scheduling
+decision is taken late. If a task has a non-zero priority, it is put at the front of the queue.
+
+The @b{prio} scheduler also uses a central task queue, but sorts tasks by
+priority (between -5 and 5).
+
+The @b{random} scheduler distributes tasks randomly according to assumed worker
+overall performance.
+
+The @b{ws} (work stealing) scheduler schedules tasks on the local worker by
+default. When a worker becomes idle, it steals a task from the most loaded
+worker.
+
+The @b{dm} (deque model) scheduler takes task execution performance models into
+account to perform a HEFT-like scheduling strategy: it schedules tasks where
+their termination time will be minimal.
+
+The @b{dmda} (deque model data aware) scheduler is similar to dm, but it also
+takes data transfer time into account.
+
+The @b{dmdar} (deque model data aware ready) scheduler is similar to dmda, but
+it also sorts tasks on per-worker queues by the number of already-available data
+buffers.
+
+The @b{dmdas} (deque model data aware sorted) scheduler is similar to dmda, but
+it also supports arbitrary priority values.
+
+The @b{heft} (HEFT) scheduler is similar to dmda, but it also supports task bundles.
+
+The @b{pheft} (parallel HEFT) scheduler is similar to heft, but it also supports
+parallel tasks (still experimental).
+
+The @b{pgreedy} (parallel greedy) scheduler is similar to greedy, but it also
+supports parallel tasks (still experimental).
+
+@node Performance model calibration
+@section Performance model calibration
+
+Most schedulers are based on an estimation of codelet duration on each kind
+of processing unit. For this to be possible, the application programmer needs
+to configure a performance model for the codelets of the application (see
+@ref{Performance model example} for instance). History-based performance models
+use on-line calibration.  StarPU will automatically calibrate codelets
+which have never been calibrated yet, and save the result in
+@code{~/.starpu/sampling/codelets}.
+The models are indexed by machine name. To share the models between machines
+(e.g. for a homogeneous cluster), use @code{export
+STARPU_HOSTNAME=some_global_name}. To force continuing calibration, use
+@code{export STARPU_CALIBRATE=1}. This may be necessary if your application
+has not-so-stable performance. StarPU will force calibration (and thus ignore
+the current result) until 10 (@code{STARPU_CALIBRATION_MINIMUM}) measurements
+have been made on each architecture, to avoid badly scheduling tasks just
+because the first measurements were not so good. Details on the current
+performance model status can be obtained from the
+@code{starpu_perfmodel_display} command: the @code{-l} option lists the
+available performance models, and the @code{-s} option selects the performance
+model to be displayed. The result looks like:
+
+@example
+$ starpu_perfmodel_display -s starpu_dlu_lu_model_22
+performance model for cpu
+# hash    size     mean          dev           n
+880805ba  98304    2.731309e+02  6.010210e+01  1240
+b50b6605  393216   1.469926e+03  1.088828e+02  1240
+5c6c3401  1572864  1.125983e+04  3.265296e+03  1240
+@end example
+
+This shows that for the LU 22 kernel with a 1.5MiB matrix, the average
+execution time on CPUs was about 11ms, with a 3ms standard deviation, over
+1240 samples. It is a good idea to check this before doing actual performance
+measurements.
+
+A graph can be drawn by using the @code{starpu_perfmodel_plot} tool:
+
+@example
+$ starpu_perfmodel_plot -s starpu_dlu_lu_model_22
+98304 393216 1572864 
+$ gnuplot starpu_starpu_dlu_lu_model_22.gp
+$ gv starpu_starpu_dlu_lu_model_22.eps
+@end example
+
+If a kernel source code was modified (e.g. performance improvement), the
+calibration information is stale and should be dropped, so as to re-calibrate
+from scratch. This can be done by using @code{export STARPU_CALIBRATE=2}.
+
+Note: due to CUDA limitations, to be able to measure kernel duration,
+calibration mode needs to disable asynchronous data transfers. Calibration thus
+disables data transfer / computation overlapping, and should thus not be used
+for final benchmarks. Note 2: history-based performance models get calibrated
+only if a performance-model-based scheduler is chosen.
+
+@node Task distribution vs Data transfer
+@section Task distribution vs Data transfer
+
+Distributing tasks to balance the load induces a data transfer penalty. StarPU
+thus needs to find a balance between both. The target function that the
+@code{dmda} scheduler of StarPU
+tries to minimize is @code{alpha * T_execution + beta * T_data_transfer}, where
+@code{T_execution} is the estimated execution time of the codelet (usually
+accurate), and @code{T_data_transfer} is the estimated data transfer time. The
+latter is estimated based on bus calibration before execution starts,
+i.e. with an idle machine, thus without contention. You can force bus re-calibration by running
+@code{starpu_calibrate_bus}. The beta parameter defaults to 1, but it can be
+worth trying to tweak it by using @code{export STARPU_BETA=2} for instance,
+since during real application execution, contention makes transfer times bigger.
+This is of course imprecise, but in practice a rough estimation already gives
+the same good results as a precise estimation would.
+
+@node Data prefetch
+@section Data prefetch
+
+The @code{heft}, @code{dmda} and @code{pheft} scheduling policies perform data prefetch (see @ref{STARPU_PREFETCH}):
+as soon as a scheduling decision is taken for a task, requests are issued to
+transfer its required data to the target processing unit, if needed, so that
+when the processing unit actually starts the task, its data will hopefully
+already be available and it will not have to wait for the transfer to finish.
+
+The application may want to perform some manual prefetching, for several reasons
+such as excluding initial data transfers from performance measurements, or
+setting up an initial statically-computed data distribution on the machine
+before submitting tasks, which will thus guide StarPU toward an initial task
+distribution (since StarPU will try to avoid further transfers).
+
+This can be achieved by giving the @code{starpu_data_prefetch_on_node} function
+the handle and the desired target memory node.
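+
+A minimal sketch, assuming a registered handle @code{vector_handle} and that
+memory node 1 is the target device memory:
+
+@example
+/* issue an asynchronous prefetch of the data to memory node 1 */
+starpu_data_prefetch_on_node(vector_handle, 1, 1);
+@end example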
+
+@node Power-based scheduling
+@section Power-based scheduling
+
+If the application can provide some power performance model (through
+the @code{power_model} field of the codelet structure), StarPU will
+take it into account when distributing tasks. The target function that
+the @code{dmda} scheduler minimizes then becomes @code{alpha * T_execution +
+beta * T_data_transfer + gamma * Consumption}, where @code{Consumption}
+is the estimated task consumption in Joules. To tune this parameter, use
+@code{export STARPU_GAMMA=3000} for instance, to express that each Joule
+(i.e. kW during 1000us) is worth 3000us of execution time penalty. Setting
+@code{alpha} and @code{beta} to zero makes the scheduler take only power
+consumption into account.
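+
+As a hypothetical sketch, assuming a history-based power model declared with
+the same @code{starpu_perfmodel} structure as regular performance models
+(names are illustrative):
+
+@example
+static struct starpu_perfmodel power_model = @{
+    .type = STARPU_HISTORY_BASED,
+    .symbol = "my_codelet_power" /* illustrative model name */
+@};
+
+starpu_codelet cl = @{
+    /* ... where, functions, nbuffers ... */
+    .power_model = &power_model
+@};
+@end example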
+
+This is however not sufficient to correctly optimize power: the scheduler would
+simply tend to run all computations on the most energy-conservative processing
+unit. To account for the consumption of the whole machine (including idle
+processing units), the idle power of the machine should be given by setting
+@code{export STARPU_IDLE_POWER=200} for 200W, for instance. This value can often
+be obtained from the machine's power supplier.
+
+The power actually consumed by the total execution can be displayed by setting
+@code{export STARPU_PROFILING=1 STARPU_WORKER_STATS=1}.
+
+@node Profiling
+@section Profiling
+
+A quick view of how many tasks each worker has executed can be obtained by setting
+@code{export STARPU_WORKER_STATS=1}. This is a convenient way to check that
+execution did happen on accelerators, without penalizing performance with
+the profiling overhead.
+
+A quick view of how many data transfers have been issued can be obtained by setting
+@code{export STARPU_BUS_STATS=1}.
+
+More detailed profiling information can be enabled by using @code{export STARPU_PROFILING=1} or by
+calling @code{starpu_profiling_status_set} from the source code.
+Statistics on the execution can then be obtained by using @code{export
+STARPU_BUS_STATS=1} and @code{export STARPU_WORKER_STATS=1}.
+More details on performance feedback are provided in the next chapter.
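+
+A minimal sketch of enabling profiling from the source code rather than
+through the environment:
+
+@example
+starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
+/* ... submit tasks; profiling information can then be
+ * queried through the profiling API ... */
+@end example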
+
+@node CUDA-specific optimizations
+@section CUDA-specific optimizations
+
+Due to CUDA limitations, StarPU will have a hard time overlapping its own
+communications with the codelet computations if the application does not use a
+dedicated CUDA stream for its computations. StarPU provides one through
+@code{starpu_cuda_get_local_stream()}, which should be used by all CUDA codelet
+operations. For instance:
+
+@example
+func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
+cudaStreamSynchronize(starpu_cuda_get_local_stream());
+@end example
+
+StarPU already makes the appropriate calls for the CUBLAS library.
+
+Unfortunately, some CUDA libraries do not have stream variants of
+kernels. That will lower the potential for overlapping.

+ 51 - 0
doc/scaling-vector-example.texi

@@ -0,0 +1,51 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@node Full source code for the 'Scaling a Vector' example
+@appendix Full source code for the 'Scaling a Vector' example
+
+@menu
+* Main application::            
+* CPU Kernel::                 
+* CUDA Kernel::                
+* OpenCL Kernel::              
+@end menu
+
+@node Main application
+@section Main application
+
+@include vector_scal_c.texi
+
+@node CPU Kernel
+@section CPU Kernel
+
+@include vector_scal_cpu.texi
+
+@node CUDA Kernel
+@section CUDA Kernel
+
+@include vector_scal_cuda.texi
+
+@node OpenCL Kernel
+@section OpenCL Kernel
+
+@menu
+* Invoking the kernel::         
+* Source of the kernel::        
+@end menu
+
+@node Invoking the kernel
+@subsection Invoking the kernel
+
+@include vector_scal_opencl.texi
+
+@node Source of the kernel
+@subsection Source of the kernel
+
+@include vector_scal_opencl_codelet.texi
+

File diff suppressed because it is too large
+ 57 - 4878
doc/starpu.texi


+ 74 - 0
doc/tips-tricks.texi

@@ -0,0 +1,74 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@node Tips and Tricks
+@chapter Tips and Tricks to know about
+
+@menu
+* Per-worker library initialization::  How to initialize a computation library once for each worker?
+@end menu
+
+@node Per-worker library initialization
+@section How to initialize a computation library once for each worker?
+
+Some libraries need to be initialized once for each concurrent instance that
+may run on the machine. A typical example is a C++ computation class which is not
+thread-safe by itself, but for which several instantiated objects of that class
+can be used concurrently. Such a library can be used in StarPU by initializing one
+object per worker. For instance, the libstarpufft example does the following to
+be able to use FFTW.
+
+A global array stores the instantiated objects:
+
+@smallexample
+fftw_plan plan_cpu[STARPU_NMAXWORKERS];
+@end smallexample
+
+At initialization time of libstarpu, the objects are initialized:
+
+@smallexample
+int workerid;
+for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) @{
+    switch (starpu_worker_get_type(workerid)) @{
+        case STARPU_CPU_WORKER:
+            plan_cpu[workerid] = fftw_plan(...);
+            break;
+    @}
+@}
+@end smallexample
+
+And in the codelet body, they are used:
+
+@smallexample
+static void fft(void *descr[], void *_args)
+@{
+    int workerid = starpu_worker_get_id();
+    fftw_plan plan = plan_cpu[workerid];
+    ...
+
+    fftw_execute(plan, ...);
+@}
+@end smallexample
+
+Another approach, which may be required, is to execute some code from the workers
+themselves, thanks to @code{starpu_execute_on_each_worker}. This may be needed
+by CUDA to behave properly due to threading issues. For instance, StarPU's
+@code{starpu_helper_cublas_init} looks like the following to call
+@code{cublasInit} from the workers themselves:
+
+@smallexample
+static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
+@{
+    cublasStatus cublasst = cublasInit();
+    cublasSetKernelStream(starpu_cuda_get_local_stream());
+@}
+void starpu_helper_cublas_init(void)
+@{
+    starpu_execute_on_each_worker(init_cublas_func, NULL, STARPU_CUDA);
+@}
+@end smallexample

+ 110 - 0
doc/using.texi

@@ -0,0 +1,110 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@node Using StarPU
+@chapter Using StarPU
+
+@menu
+* Setting flags for compiling and linking applications::  
+* Running a basic StarPU application::  
+* Kernel threads started by StarPU::
+* Enabling OpenCL::
+@end menu
+
+@node Setting flags for compiling and linking applications
+@section Setting flags for compiling and linking applications
+
+Compiling and linking an application against StarPU may require specific
+flags or libraries (for instance @code{CUDA} or @code{libspe2}).
+To this end, it is possible to use the @code{pkg-config} tool.
+
+If StarPU was not installed at some standard location, the path of StarPU's
+library must be specified in the @code{PKG_CONFIG_PATH} environment variable so
+that @code{pkg-config} can find it. For example if StarPU was installed in
+@code{$prefix_dir}:
+
+@example
+% export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
+@end example
+
+The flags required to compile or link against StarPU are then
+accessible with the following commands:
+
+@example
+% pkg-config --cflags libstarpu  # options for the compiler
+% pkg-config --libs libstarpu    # options for the linker
+@end example
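+
+For instance, a hand compilation of a single-file program (here a hypothetical
+@code{vector_scal.c}) could look like:
+
+@example
+% gcc vector_scal.c -o vector_scal \
+    $(pkg-config --cflags libstarpu) \
+    $(pkg-config --libs libstarpu)
+@end example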
+
+@node Running a basic StarPU application
+@section Running a basic StarPU application
+
+Basic examples using StarPU are built in the directory
+@code{examples/basic_examples/} (and installed in
+@code{$prefix_dir/lib/starpu/examples/}). You can for example run the example
+@code{vector_scal}.
+
+@example
+% ./examples/basic_examples/vector_scal
+BEFORE : First element was 1.000000
+AFTER First element is 3.140000
+%
+@end example
+
+When StarPU is used for the first time, the directory
+@code{$HOME/.starpu/} is created; performance models will be stored in
+that directory.
+
+Please note that buses are benchmarked when StarPU is launched for the
+first time. This may take a few minutes, or less if @code{hwloc} is
+installed. This step is done only once per user and per machine.
+
+@node Kernel threads started by StarPU
+@section Kernel threads started by StarPU
+
+StarPU automatically binds one thread per CPU core. It does not use
+SMT/hyperthreading because kernels are usually already optimized for using a
+full core, and using hyperthreading would make kernel calibration rather random.
+
+Since driving GPUs is a CPU-consuming task, StarPU dedicates one core per GPU.
+
+While StarPU tasks are executing, the application is not supposed to do
+computations in the threads it starts itself, tasks should be used instead.
+
+TODO: add a StarPU function to bind an application thread (e.g. the main thread)
+to a dedicated core (and thus disable the corresponding StarPU CPU worker).
+
+@node Enabling OpenCL
+@section Enabling OpenCL
+
+When both CUDA and OpenCL drivers are enabled, StarPU will launch an
+OpenCL worker for NVIDIA GPUs only if CUDA is not already running on them.
+This design choice was necessary as OpenCL and CUDA cannot run at the
+same time on the same NVIDIA GPU, as there is currently no interoperability
+between them.
+
+To enable OpenCL, you need either to disable CUDA when configuring StarPU:
+
+@example
+% ./configure --disable-cuda
+@end example
+
+or when running applications:
+
+@example
+% STARPU_NCUDA=0 ./application
+@end example
+
+OpenCL will automatically be started on any device not yet used by
+CUDA. So on a machine running 4 GPUs, it is for instance possible to
+enable CUDA on 2 devices and OpenCL on the 2 other devices as
+follows:
+
+@example
+% STARPU_NCUDA=2 ./application
+@end example
+

+ 4 - 0
doc/version.texi

@@ -0,0 +1,4 @@
+@set UPDATED 17 November 2011
+@set UPDATED-MONTH November 2011
+@set EDITION 1.0.0
+@set VERSION 1.0.0