Pārlūkot izejas kodu

merge trunk@6801:6867

Nathalie Furmento 12 gadi atpakaļ
vecāks
revīzija
bbad3cd73b
41 mainītis faili ar 1298 papildinājumiem un 482 dzēšanām
  1. 6 0
      ChangeLog
  2. 15 1
      doc/Makefile.am
  3. 91 8
      doc/chapters/advanced-examples.texi
  4. 24 6
      doc/chapters/basic-api.texi
  5. 4 2
      doc/chapters/basic-examples.texi
  6. 4 4
      doc/chapters/configuration.texi
  7. 51 0
      doc/chapters/perf-optimization.texi
  8. 1 1
      doc/starpu.texi
  9. 2 2
      doc/tutorial/README
  10. 17 2
      examples/Makefile.am
  11. 1 1
      examples/openmp/vector_scal.c
  12. 254 0
      examples/pipeline/pipeline.c
  13. 4 1
      include/starpu_perfmodel.h
  14. 134 118
      mpi/examples/cholesky/mpi_cholesky.c
  15. 16 2
      mpi/examples/cholesky/mpi_cholesky.h
  16. 221 222
      mpi/starpu_mpi_insert_task.c
  17. 6 8
      mpi/starpu_mpi_insert_task_cache.c
  18. 3 2
      mpi/starpu_mpi_insert_task_cache.h
  19. 1 1
      socl/src/cl_createbuffer.c
  20. 2 2
      socl/src/devices.c
  21. 6 6
      socl/src/socl.c
  22. 2 0
      src/Makefile.am
  23. 1 1
      src/common/htable32.c
  24. 2 2
      src/common/htable32.h
  25. 106 0
      src/common/htable64.c
  26. 46 0
      src/common/htable64.h
  27. 2 2
      src/core/dependencies/htable.h
  28. 12 0
      src/core/jobs.c
  29. 2 2
      src/core/perfmodel/perfmodel.c
  30. 54 15
      src/core/perfmodel/perfmodel_bus.c
  31. 12 2
      src/core/perfmodel/perfmodel_history.c
  32. 17 6
      src/core/task.c
  33. 10 27
      src/core/workers.c
  34. 5 1
      src/drivers/cpu/driver_cpu.h
  35. 6 1
      src/drivers/cuda/driver_cuda.h
  36. 13 1
      src/drivers/opencl/driver_opencl.h
  37. 6 2
      src/drivers/opencl/driver_opencl_utils.c
  38. 5 1
      tests/Makefile.am
  39. 85 0
      tests/perfmodels/feed.c
  40. 1 1
      tools/dev/starpu_check_undocumented.sh
  41. 48 29
      tools/starpu_machine_display.c

+ 6 - 0
ChangeLog

@@ -21,10 +21,16 @@ New features:
   * OpenGL interoperability support.
   * Capability to store compiled OpenCL kernels on the file system
   * Capability to load compiled OpenCL kernels
+  * Performance models measurements can now be provided explicitly by
+    applications.
 
 Changes:
   * The FxT code can now be used on systems other than Linux.
 
+Small changes:
+  * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
+	still available for compatibility reasons.
+
 StarPU 1.0.0 (svn revision 6306)
 ==============================================
 The extensions-again release

+ 15 - 1
doc/Makefile.am

@@ -34,7 +34,8 @@ starpu_TEXINFOS = chapters/advanced-api.texi \
 	chapters/using.texi \
 	chapters/vector_scal_opencl.texi \
 	chapters/socl.texi \
-	chapters/sched_ctx_hypervisor.texi
+	chapters/sched_ctx_hypervisor.texi \
+	chapters/version.texi
 
 MAINTAINERCLEANFILES = starpu.pdf
 
@@ -46,6 +47,19 @@ AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers -
 uninstall-local:
 	$(RM) $(DESTDIR)$(infodir)/dir
 
+#TODO: when stat is not available on the machine, insert "unknown date"
+chapters/version.texi:
+	@for f in $(starpu_TEXINFOS) ; do \
+                if test -f $(top_srcdir)/doc/$$f ; then stat --format=%Y $(top_srcdir)/doc/$$f ; fi \
+        done | sort -r | head -1 > timestamp
+	@LC_ALL=C date --date=@`cat timestamp` +"%d %B %Y" > timestamp_updated
+	@LC_ALL=C date --date=@`cat timestamp` +"%B %Y" > timestamp_updated_month
+	@echo "@set UPDATED " `cat timestamp_updated` > $(top_srcdir)/doc/chapters/version.texi
+	@echo "@set UPDATED-MONTH" `cat timestamp_updated_month` >> $(top_srcdir)/doc/chapters/version.texi
+	@echo "@set EDITION $(VERSION)" >> $(top_srcdir)/doc/chapters/version.texi
+	@echo "@set VERSION $(VERSION)" >> $(top_srcdir)/doc/chapters/version.texi
+	@$(RM) timestamp timestamp_updated timestamp_updated_month
+
 #$(top_srcdir)/doc/starpu.texi: vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
 #vector_scal_c.texi: $(top_srcdir)/examples/basic_examples/vector_scal.c
 #	cat $< | sed 's/{/@{/g' | sed 's/}/@}/g' | sed 's/\t/    /g' > $@

+ 91 - 8
doc/chapters/advanced-examples.texi

@@ -15,6 +15,7 @@
 * Theoretical lower bound on execution time::  
 * Insert Task Utility::          
 * Data reduction::  
+* Temporary buffers::  
 * Parallel Tasks::
 * Debugging::
 * The multiformat interface::
@@ -302,7 +303,9 @@ a performance model, by defining a @code{starpu_perfmodel} structure and
 providing its address in the @code{model} field of the @code{struct starpu_codelet}
 structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel}
 are mandatory, to give a name to the model, and the type of the model, since
-there are several kinds of performance models.
+there are several kinds of performance models. For compatibility, make sure to
+initialize the whole structure to zero, either by using explicit memset, or by
+letting the compiler implicitly do it as exemplified below.
 
 @itemize
 @item
@@ -317,9 +320,13 @@ and ouput sizes as an index.
 It will also save it in @code{~/.starpu/sampling/codelets}
 for further executions, and can be observed by using the
 @code{starpu_perfmodel_display} command, or drawn by using
-the @code{starpu_perfmodel_plot}.  The models are indexed by machine name. To
+the @code{starpu_perfmodel_plot} (@pxref{Performance model calibration}).  The
+models are indexed by machine name. To
 share the models between machines (e.g. for a homogeneous cluster), use
-@code{export STARPU_HOSTNAME=some_global_name}. Measurements are only done when using a task scheduler which makes use of it, such as @code{heft} or @code{dmda}.
+@code{export STARPU_HOSTNAME=some_global_name}. Measurements are only done
+when using a task scheduler which makes use of it, such as @code{heft} or
+@code{dmda}. Measurements can also be provided explicitly by the application, by
+using the @code{starpu_perfmodel_update_history} function.
 
 The following is a small code example.
 
@@ -353,15 +360,33 @@ model type). This still assumes performance regularity, but works
 with various data input sizes, by applying regression over observed
 execution times. STARPU_REGRESSION_BASED uses an a*n^b regression
 form, STARPU_NL_REGRESSION_BASED uses an a*n^b+c (more precise than
-STARPU_REGRESSION_BASED, but costs a lot more to compute). For instance,
+STARPU_REGRESSION_BASED, but costs a lot more to compute).
+
+For instance,
 @code{tests/perfmodels/regression_based.c} uses a regression-based performance
-model for the @code{memset} operation. Of course, the application has to issue
+model for the @code{memset} operation.
+
+Of course, the application has to issue
 tasks with varying size so that the regression can be computed. StarPU will not
 trust the regression unless there is at least 10% difference between the minimum
-and maximum observed input size. For non-linear regression, since computing it
+and maximum observed input size. It can be useful to set the
+@code{STARPU_CALIBRATE} environment variable to @code{1} and run the application
+on varying input sizes, so as to feed the performance model for a variety of
+inputs, or to provide the measurements explicitly by using
+@code{starpu_perfmodel_update_history}. The @code{starpu_perfmodel_display} and
+@code{starpu_perfmodel_plot}
+tools can be used to observe how much the performance model is calibrated (@pxref{Performance model calibration}); when
+their output looks good, @code{STARPU_CALIBRATE} can be reset to @code{0} to let
+StarPU use the resulting performance model without recording new measures. If
+the data input sizes vary a lot, it is really important to set
+@code{STARPU_CALIBRATE} to @code{0}, otherwise StarPU will continue adding the
+measures, and result with a very big performance model, which will take a
+lot of time to load and save.
+
+For non-linear regression, since computing it
 is quite expensive, it is only done at termination of the application. This
-means that the first execution uses history-based performance model to perform
-scheduling.
+means that the first execution of the application will use only history-based
+performance model to perform scheduling, without using regression.
 
 @item
 Provided as an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_function} field),
@@ -668,6 +693,64 @@ int dots(starpu_data_handle_t v1, starpu_data_handle_t v2,
 The @code{cg} example also uses reduction for the blocked gemv kernel, leading
 to yet more relaxed dependencies and more parallelism.
 
+@node Temporary buffers
+@section Temporary buffers
+
+There are two kinds of temporary buffers: temporary data which just pass results
+from a task to another, and scratch data which are needed only internally by
+tasks.
+
+@subsection Temporary data
+
+Data can sometimes be entirely produced by a task, and entirely consumed by
+another task, without the need for other parts of the application to access
+it. In such case, registration can be done without prior allocation, by using
+the special -1 memory node number, and passing a zero pointer. StarPU will
+actually allocate memory only when the task creating the content gets scheduled,
+and destroy it on unregistration.
+
+In addition to that, it can be tedious for the application to have to unregister
+the data, since it will not use its content anyway. The unregistration can be
+done lazily by using the @code{starpu_data_unregister_lazy(handle)} function,
+which will record that no more tasks accessing the handle will be submitted, so
+that it can be freed as soon as the last task accessing it is over.
+
+The following code exemplifies both points: it registers the temporary
+data, submits three tasks accessing it, and records the data for automatic
+unregistration.
+
+@smallexample
+starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
+starpu_insert_task(&produce_data, STARPU_W, handle, 0);
+starpu_insert_task(&compute_data, STARPU_RW, handle, 0);
+starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
+starpu_data_unregister_lazy(handle);
+@end smallexample
+
+@subsection Scratch data
+
+Some kernels sometimes need temporary data to achieve the computations, i.e. a
+workspace. The application could allocate it at the start of the codelet
+function, and free it at the end, but that would be costly. It could also
+allocate one buffer per worker (similarly to @ref{Per-worker library
+initialization }), but that would make them systematic and permanent. A more
+optimized way is to use the SCRATCH data access mode, as exemplified below,
+which provides per-worker buffers without content consistency.
+
+@smallexample
+starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
+for (i = 0; i < N; i++)
+    starpu_insert_task(&compute, STARPU_R, input[i], STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
+@end smallexample
+
+StarPU will make sure that the buffer is allocated before executing the task,
+and make this allocation per-worker: for CPU workers, notably, each worker has
+its own buffer. This means that each task submitted above will actually have its
+own workspace, which will actually be the same for all tasks running one after
+the other on the same worker. Also, if for instance GPU memory becomes scarce,
+StarPU will notice that it can free such buffers easily, since the content does
+not matter.
+
 @node Parallel Tasks
 @section Parallel Tasks
 

+ 24 - 6
doc/chapters/basic-api.texi

@@ -72,7 +72,7 @@ if @code{sched_policy_name} is set.
 
 @item @code{int ncpus} (default = -1)
 This is the number of CPU cores that StarPU can use. This can also be
-specified with the @code{STARPU_NCPUS} environment variable.
+specified with the @code{STARPU_NCPU} environment variable.
 
 @item @code{int ncuda} (default = -1)
 This is the number of CUDA devices that StarPU can use. This can also
@@ -127,7 +127,7 @@ executing tasks. If this value is equal to -1, the default value is used. This
 can also be specified with the @code{STARPU_CALIBRATE} environment variable.
 
 @item @code{int single_combined_worker} (default = 0)
-By default, StarPU parallel tasks concurrently.
+By default, StarPU executes parallel tasks concurrently.
 Some parallel libraries (e.g. most OpenMP implementations) however do
 not support concurrent calls to parallel code. In such case, setting this flag
 makes StarPU only start one parallel task at a time.
@@ -615,8 +615,8 @@ Compressed Sparse Row Representation) sparse matrix interface.
 Register the sparse matrix made of @var{nnz} non-zero blocks of elements of size
 @var{elemsize} stored in @var{nzval} and initializes @var{handle} to represent
 it. Blocks have size @var{r} * @var{c}. @var{nrow} is the number of rows (in
-terms of blocks), @var{colind[i]} is the block-column index for block @code{i}
-in @code{nzval}, @var{rowptr[i]} is the block-index (in nzval) of the first block of row @code{i}.
+terms of blocks), @code{colind[i]} is the block-column index for block @code{i}
+in @code{nzval}, @code{rowptr[i]} is the block-index (in nzval) of the first block of row @code{i}.
 @var{firstentry} is the index of the first entry of the given arrays (usually 0
 or 1).
 @end deftypefun
@@ -1259,7 +1259,9 @@ always only define the field @code{opencl_funcs}.
 
 @deftp {Data Type} {struct starpu_codelet}
 The codelet structure describes a kernel that is possibly implemented on various
-targets. For compatibility, make sure to initialize the whole structure to zero.
+targets. For compatibility, make sure to initialize the whole structure to zero,
+either by using explicit memset, or by letting the compiler implicitly do it in
+e.g. static storage case.
 
 @table @asis
 @item @code{uint32_t where} (optional)
@@ -1826,7 +1828,11 @@ The possible values are:
 @anchor{struct starpu_perfmodel}
 contains all information about a performance model. At least the
 @code{type} and @code{symbol} fields have to be filled when defining a
-performance model for a codelet. If not provided, other fields have to be zero.
+performance model for a codelet. For compatibility, make sure to initialize the
+whole structure to zero, either by using explicit memset, or by letting the
+compiler implicitly do it in e.g. static storage case.
+
+If not provided, other fields have to be zero.
 
 @table @asis
 @item @code{type}
@@ -1952,6 +1958,18 @@ prints a list of all performance models on @var{output}.
 prints a matrix of bus bandwidths on @var{f}.
 @end deftypefun
 
+@deftypefun void starpu_bus_print_affinity ({FILE *}@var{f})
+prints the affinity devices on @var{f}.
+@end deftypefun
+
+@deftypefun void starpu_perfmodel_update_history ({struct starpu_perfmodel *}@var{model}, {struct starpu_task *}@var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{cpuid}, unsigned @var{nimpl}, double @var{measured});
+This feeds the performance model @var{model} with an explicit measurement
+@var{measured}, in addition to measurements done by StarPU itself. This can be
+useful when the application already has an existing set of measurements done
+in good conditions, that StarPU could benefit from instead of doing on-line
+measurements. An example of use can be seen in @ref{Performance model example}.
+@end deftypefun
+
 @node Profiling API
 @section Profiling API
 

+ 4 - 2
doc/chapters/basic-examples.texi

@@ -161,7 +161,9 @@ struct starpu_codelet cl =
 
 A codelet is a structure that represents a computational kernel. Such a codelet
 may contain an implementation of the same kernel on different architectures
-(e.g. CUDA, Cell's SPU, x86, ...).
+(e.g. CUDA, Cell's SPU, x86, ...). For compatibility, make sure that the whole
+structure is initialized to zero, either by using memset, or by letting the
+compiler implicitly do it as exemplified above.
 
 The @code{nbuffers} field specifies the number of data buffers that are
 manipulated by the codelet: here the codelet does not access or modify any data
@@ -970,7 +972,7 @@ and to execute it, with the default configuration:
 or for example, by disabling CPU devices:
 
 @smallexample
-% STARPU_NCPUS=0 ./vector_scal
+% STARPU_NCPU=0 ./vector_scal
 0.000000 3.000000 6.000000 9.000000 12.000000
 @end smallexample
 

+ 4 - 4
doc/chapters/configuration.texi

@@ -223,7 +223,7 @@ By default, it is disabled.
 @subsection Configuring workers
 
 @menu
-* STARPU_NCPUS::                	Number of CPU workers
+* STARPU_NCPU::                	Number of CPU workers
 * STARPU_NCUDA::                	Number of CUDA workers
 * STARPU_NOPENCL::              	Number of OpenCL workers
 * STARPU_NGORDON::              	Number of SPU workers (Cell)
@@ -236,8 +236,8 @@ By default, it is disabled.
 * STARPU_MAX_WORKERSIZE:: 		Maximum size of the combined workers
 @end menu
 
-@node STARPU_NCPUS
-@subsubsection @code{STARPU_NCPUS} -- Number of CPU workers
+@node STARPU_NCPU
+@subsubsection @code{STARPU_NCPU} -- Number of CPU workers
 
 Specify the number of CPU workers (thus not including workers dedicated to control acceleratores). Note that by default, StarPU will not allocate
 more CPU workers than there are physical CPUs, and that some CPUs are used to control
@@ -281,7 +281,7 @@ available.
 
 Note that the first workers correspond to the CUDA workers, then come the
 OpenCL and the SPU, and finally the CPU workers. For example if
-we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPUS=2}
+we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPU=2}
 and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
 by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
 the logical CPUs #1 and #3 will be used by the CPU workers.

+ 51 - 0
doc/chapters/perf-optimization.texi

@@ -244,6 +244,46 @@ disables data transfer / computation overlapping, and should thus not be used
 for eventual benchmarks. Note 2: history-based performance models get calibrated
 only if a performance-model-based scheduler is chosen.
 
+The history-based performance models can also be explicitly filled by the
+application without execution, if e.g. the application already has a series of
+measurements. This can be done by using @code{starpu_perfmodel_update_history},
+for instance:
+
+@example
+static struct starpu_perfmodel perf_model = @{
+    .type = STARPU_HISTORY_BASED,
+    .symbol = "my_perfmodel",
+@};
+
+struct starpu_codelet cl = @{
+    .where = STARPU_CUDA,
+    .cuda_funcs = @{ cuda_func1, cuda_func2, NULL @},
+    .nbuffers = 1,
+    .modes = @{STARPU_W@},
+    .model = &perf_model
+@};
+
+void feed(void) @{
+    struct my_measure *measure;
+    struct starpu_task task;
+    starpu_task_init(&task);
+
+    task.cl = &cl;
+
+    for (measure = &measures[0]; measure < measures[last]; measure++) @{
+        starpu_data_handle_t handle;
+	starpu_vector_data_register(&handle, -1, 0, measure->size, sizeof(float));
+	task.handles[0] = handle;
+	starpu_perfmodel_update_history(&perf_model, &task, STARPU_CUDA_DEFAULT + measure->cudadev, 0, measure->implementation, measure->time);
+	starpu_task_deinit(&task);
+	starpu_data_unregister(handle);
+    @}
+@}
+@end example
+
+Measurement has to be provided in milliseconds for the completion time models,
+and in Joules for the energy consumption models.
+
 @node Task distribution vs Data transfer
 @section Task distribution vs Data transfer
 
@@ -302,6 +342,17 @@ be obtained from the machine power supplier.
 The power actually consumed by the total execution can be displayed by setting
 @code{export STARPU_PROFILING=1 STARPU_WORKER_STATS=1} .
 
+On-line task consumption measurement is currently only supported through the
+@code{CL_PROFILING_POWER_CONSUMED} OpenCL extension, implemented in the MoviSim
+simulator. Applications can however provide explicit measurements by using the
+@code{starpu_perfmodel_update_history} function (exemplified in @ref{Performance
+model example} with the @code{power_model} performance model). Fine-grain
+measurement is often not feasible with the feedback provided by the hardware, so
+the user can for instance run a given task a thousand times, measure the global
+consumption for that series of tasks, divide it by a thousand, repeat for
+varying kinds of tasks and task sizes, and eventually feed StarPU
+with these manual measurements through @code{starpu_perfmodel_update_history}.
+
 @node Profiling
 @section Profiling
 

+ 1 - 1
doc/starpu.texi

@@ -5,7 +5,7 @@
 @settitle StarPU Handbook
 @c %**end of header
 
-@include version.texi
+@include chapters/version.texi
 
 @copying
 Copyright @copyright{} 2009--2011  Universit@'e de Bordeaux 1

+ 2 - 2
doc/tutorial/README

@@ -41,6 +41,6 @@ Instructions on how to compile and run StarPU examples
 % make vector_scal
 % ./vector_scal
 
-% STARPU_NCPUS=0 ./vector_scal
-% STARPU_NCPUS=0 STARPU_NCUDA=0 ./vector_scal
+% STARPU_NCPU=0 ./vector_scal
+% STARPU_NCPU=0 STARPU_NCUDA=0 ./vector_scal
 

+ 17 - 2
examples/Makefile.am

@@ -220,7 +220,8 @@ examplebin_PROGRAMS +=				\
 	lu/lu_implicit_example_float		\
 	lu/lu_implicit_example_double		\
 	heat/heat				\
-	cg/cg
+	cg/cg					\
+	pipeline/pipeline
 endif
 
 if MKL_BLAS_LIB
@@ -283,7 +284,8 @@ STARPU_EXAMPLES +=				\
 	lu/lu_implicit_example_float		\
 	lu/lu_implicit_example_double		\
 	heat/heat				\
-	cg/cg
+	cg/cg					\
+	pipeline/pipeline
 endif
 
 if MKL_BLAS_LIB
@@ -851,6 +853,19 @@ gl_interop_gl_interop_LDADD =			\
 	$(STARPU_OPENGL_RENDER_LDFLAGS)
 endif
 
+####################
+# pipeline example #
+####################
+
+if !NO_BLAS_LIB
+pipeline_pipeline_SOURCES	=	\
+	pipeline/pipeline.c		\
+	common/blas.c
+
+pipeline_pipeline_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
+endif
+
 showcheck:
 	-cat $(TEST_LOGS) /dev/null
 	for i in $(SUBDIRS) ; do \

+ 1 - 1
examples/openmp/vector_scal.c

@@ -83,7 +83,7 @@ int main(int argc, char **argv)
 	starpu_conf_init(&conf);
 
 	/* Most OpenMP implementations do not support concurrent parallel
-	 * sections, so only create one big worker */
+	 * sections, so only enable one combined worker at a time.  */
 	conf.single_combined_worker = 1;
 	conf.sched_policy_name = "pheft";
 

+ 254 - 0
examples/pipeline/pipeline.c

@@ -0,0 +1,254 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example shows how to submit a pipeline to StarPU with limited buffer
+ * use, and avoiding submitting all the tasks at once.
+ *
+ * This is a dumb example pipeline, depicted here:
+ *
+ * x--\
+ *     >==axpy-->sum
+ * y--/
+ *
+ * x and y produce vectors full of x and y values, axpy multiplies them, and sum
+ * sums it up. We thus have 3 temporary buffers
+ */
+
+#include <starpu.h>
+#include <stdint.h>
+#include <semaphore.h>
+#include <common/blas.h>
+
+#ifdef STARPU_USE_CUDA
+#include <cublas.h>
+#endif
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+/* Vector size */
+#ifdef STARPU_SLOW_MACHINE
+#define N 16
+#else
+#define N 1048576
+#endif
+
+/* Number of iteration buffers, and thus overlapped pipeline iterations */
+#define K 16
+
+/* Number of concurrently submitted pipeline iterations */
+#define C 64
+
+/* Number of iterations */
+#define L 256
+
+/* X / Y codelets */
+void pipeline_cpu_x(void *descr[], void *args)
+{
+	float x;
+	float *val = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
+	int n = STARPU_VECTOR_GET_NX(descr[0]);
+	int i;
+
+	starpu_codelet_unpack_args(args, &x);
+	for (i = 0; i < n ; i++)
+		val[i] = x;
+}
+
+static struct starpu_perfmodel pipeline_model_x =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "pipeline_model_x"
+};
+
+static struct starpu_codelet pipeline_codelet_x =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {pipeline_cpu_x, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_W},
+	.model = &pipeline_model_x
+};
+
+/* axpy codelets */
+void pipeline_cpu_axpy(void *descr[], void *arg)
+{
+	float *x = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
+	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
+	int n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	SAXPY(n, 1., x, 1, y, 1);
+}
+
+#ifdef STARPU_USE_CUDA
+void pipeline_cublas_axpy(void *descr[], void *arg)
+{
+	float *x = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
+	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
+	int n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	cublasSaxpy(n, 1., x, 1, y, 1);
+}
+#endif
+
+static struct starpu_perfmodel pipeline_model_axpy =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "pipeline_model_axpy"
+};
+
+static struct starpu_codelet pipeline_codelet_axpy =
+{
+	.where = STARPU_CPU
+#ifdef STARPU_USE_CUDA
+		| STARPU_CUDA
+#endif
+		,
+	.cpu_funcs = {pipeline_cpu_axpy, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {pipeline_cublas_axpy, NULL},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.model = &pipeline_model_axpy
+};
+
+/* sum codelet */
+void pipeline_cpu_sum(void *descr[], void *_args)
+{
+	float *x = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
+	int n = STARPU_VECTOR_GET_NX(descr[0]);
+	float y;
+
+	y = SASUM(n, x, 1);
+
+	FPRINTF(stderr,"CPU finished with %f\n", y);
+}
+
+#ifdef STARPU_USE_CUDA
+void pipeline_cublas_sum(void *descr[], void *arg)
+{
+	float *x = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
+	int n = STARPU_VECTOR_GET_NX(descr[0]);
+	float y;
+
+	y = cublasSasum(n, x, 1);
+
+	FPRINTF(stderr,"CUBLAS finished with %f\n", y);
+}
+#endif
+
+static struct starpu_perfmodel pipeline_model_sum =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "pipeline_model_sum"
+};
+
+static struct starpu_codelet pipeline_codelet_sum =
+{
+	.where = STARPU_CPU
+#ifdef STARPU_USE_CUDA
+		| STARPU_CUDA
+#endif
+		,
+	.cpu_funcs = {pipeline_cpu_sum, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {pipeline_cublas_sum, NULL},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_R},
+	.model = &pipeline_model_sum
+};
+
+int main(void)
+{
+	int ret = 0;
+	int k, l, c;
+	starpu_data_handle_t buffersX[K], buffersY[K], buffersP[K];
+	sem_t sems[C];
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		exit(77);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_helper_cublas_init();
+
+	/* Initialize the K temporary buffers. No need to allocate them ourselves,
+	 * since it's the X and Y kernels which will fill the initial values. */
+	for (k = 0; k < K; k++)
+	{
+		starpu_vector_data_register(&buffersX[k], -1, 0, N, sizeof(float));
+		starpu_vector_data_register(&buffersY[k], -1, 0, N, sizeof(float));
+		starpu_vector_data_register(&buffersP[k], -1, 0, N, sizeof(float));
+	}
+
+	/* Initialize way to wait for the C previous concurrent stages */
+	for (c = 0; c < C; c++)
+		sem_init(&sems[c], 0, 0);
+
+	/* Submits the l pipeline stages */
+	for (l = 0; l < L; l++)
+	{
+		float x = l;
+		float y = 2*l;
+		/* First wait for the C previous concurrent stages */
+		if (l >= C)
+			sem_wait(&sems[l%C]);
+
+		/* Now submit the next stage */
+		ret = starpu_insert_task(&pipeline_codelet_x,
+				STARPU_W, buffersX[l%K],
+				STARPU_VALUE, &x, sizeof(x),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task x");
+
+		ret = starpu_insert_task(&pipeline_codelet_x,
+				STARPU_W, buffersY[l%K],
+				STARPU_VALUE, &y, sizeof(y),
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task y");
+
+		ret = starpu_insert_task(&pipeline_codelet_axpy,
+				STARPU_R, buffersX[l%K],
+				STARPU_RW, buffersY[l%K],
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task axpy");
+
+		ret = starpu_insert_task(&pipeline_codelet_sum,
+				STARPU_R, buffersY[l%K],
+				STARPU_CALLBACK_WITH_ARG, (void (*)(void*))sem_post, &sems[l%C],
+				0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task sum");
+	}
+	starpu_task_wait_for_all();
+
+enodev:
+	for (k = 0; k < K; k++)
+	{
+		starpu_data_unregister(buffersX[k]);
+		starpu_data_unregister(buffersY[k]);
+		starpu_data_unregister(buffersP[k]);
+	}
+	starpu_shutdown();
+
+	return (ret == -ENODEV ? 77 : 0);
+}

+ 4 - 1
include/starpu_perfmodel.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
@@ -209,8 +209,11 @@ void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, enum starpu_
 void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch, char *archname, size_t maxlen, unsigned nimpl);
 int starpu_list_models(FILE *output);
 double starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, uint32_t footprint);
+void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *, enum starpu_perf_archtype arch, unsigned cpuid, unsigned nimpl, double measured);
+
 void starpu_force_bus_sampling(void);
 void starpu_bus_print_bandwidth(FILE *f);
+void starpu_bus_print_affinity(FILE *f);
 
 #ifdef __cplusplus
 }

+ 134 - 118
mpi/examples/cholesky/mpi_cholesky.c

@@ -63,7 +63,8 @@ static struct starpu_codelet cl22 =
 /* Returns the MPI node number where data indexes index is */
 int my_distrib(int x, int y, int nb_nodes)
 {
-        return (x+y) % nb_nodes;
+	//return (x+y) % nb_nodes;
+	return (x%dblockx)+(y%dblocky)*dblockx;
 }
 
 /*
@@ -74,93 +75,93 @@ static void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblo
 {
 	struct timeval start;
 	struct timeval end;
-        starpu_data_handle_t **data_handles;
-        int x, y;
+	starpu_data_handle_t **data_handles;
+	int x, y;
 
 	/* create all the DAG nodes */
 	unsigned i,j,k;
 
-        data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
-        for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
+	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
+	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
 
-        for(x = 0; x < nblocks ;  x++)
+	for(x = 0; x < nblocks ;  x++)
 	{
-                for (y = 0; y < nblocks; y++)
+		for (y = 0; y < nblocks; y++)
 		{
-                        int mpi_rank = my_distrib(x, y, nodes);
-                        if (mpi_rank == rank)
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (mpi_rank == rank)
 			{
-                                //fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
-                                starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)matA[x][y],
-                                                            ld, size/nblocks, size/nblocks, sizeof(float));
-                        }
+				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)matA[x][y],
+						ld, size/nblocks, size/nblocks, sizeof(float));
+			}
 			/* TODO: make better test to only registering what is needed */
-                        else
+			else
 			{
-                                /* I don't own that index, but will need it for my computations */
-                                //fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
-                                starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)NULL,
-                                                            ld, size/nblocks, size/nblocks, sizeof(float));
-                        }
-                        if (data_handles[x][y])
+				/* I don't own that index, but will need it for my computations */
+				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)NULL,
+						ld, size/nblocks, size/nblocks, sizeof(float));
+			}
+			if (data_handles[x][y])
 			{
-                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
-                                starpu_data_set_tag(data_handles[x][y], (y*nblocks)+x);
+				starpu_data_set_rank(data_handles[x][y], mpi_rank);
+				starpu_data_set_tag(data_handles[x][y], (y*nblocks)+x);
 			}
-                }
-        }
+		}
+	}
 
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	gettimeofday(&start, NULL);
 
 	for (k = 0; k < nblocks; k++)
-        {
-                int prio = STARPU_DEFAULT_PRIO;
-                if (!noprio) prio = STARPU_MAX_PRIO;
+	{
+		int prio = STARPU_DEFAULT_PRIO;
+		if (!noprio) prio = STARPU_MAX_PRIO;
 
-                starpu_mpi_insert_task(MPI_COMM_WORLD, &cl11,
-                                       STARPU_PRIORITY, prio,
-                                       STARPU_RW, data_handles[k][k],
-                                       0);
+		starpu_mpi_insert_task(MPI_COMM_WORLD, &cl11,
+				STARPU_PRIORITY, prio,
+				STARPU_RW, data_handles[k][k],
+				0);
 
 		for (j = k+1; j<nblocks; j++)
 		{
-                        prio = STARPU_DEFAULT_PRIO;
-                        if (!noprio&& (j == k+1)) prio = STARPU_MAX_PRIO;
-                        starpu_mpi_insert_task(MPI_COMM_WORLD, &cl21,
-                                               STARPU_PRIORITY, prio,
-                                               STARPU_R, data_handles[k][k],
-                                               STARPU_RW, data_handles[k][j],
-                                               0);
+			prio = STARPU_DEFAULT_PRIO;
+			if (!noprio&& (j == k+1)) prio = STARPU_MAX_PRIO;
+			starpu_mpi_insert_task(MPI_COMM_WORLD, &cl21,
+					STARPU_PRIORITY, prio,
+					STARPU_R, data_handles[k][k],
+					STARPU_RW, data_handles[k][j],
+					0);
 
 			for (i = k+1; i<nblocks; i++)
 			{
 				if (i <= j)
-                                {
-                                        prio = STARPU_DEFAULT_PRIO;
-                                        if (!noprio && (i == k + 1) && (j == k +1) ) prio = STARPU_MAX_PRIO;
-                                        starpu_mpi_insert_task(MPI_COMM_WORLD, &cl22,
-                                                               STARPU_PRIORITY, prio,
-                                                               STARPU_R, data_handles[k][i],
-                                                               STARPU_R, data_handles[k][j],
-                                                               STARPU_RW, data_handles[i][j],
-                                                               0);
-                                }
+				{
+					prio = STARPU_DEFAULT_PRIO;
+					if (!noprio && (i == k + 1) && (j == k +1) ) prio = STARPU_MAX_PRIO;
+					starpu_mpi_insert_task(MPI_COMM_WORLD, &cl22,
+							STARPU_PRIORITY, prio,
+							STARPU_R, data_handles[k][i],
+							STARPU_R, data_handles[k][j],
+							STARPU_RW, data_handles[i][j],
+							0);
+				}
 			}
 		}
-        }
+	}
 
-        starpu_task_wait_for_all();
+	starpu_task_wait_for_all();
 
-        for(x = 0; x < nblocks ;  x++)
+	for(x = 0; x < nblocks ;  x++)
 	{
-                for (y = 0; y < nblocks; y++)
+		for (y = 0; y < nblocks; y++)
 		{
-                        if (data_handles[x][y])
-                                starpu_data_unregister(data_handles[x][y]);
-                }
+			if (data_handles[x][y])
+				starpu_data_unregister(data_handles[x][y]);
+		}
 		free(data_handles[x]);
-        }
+	}
 	free(data_handles);
 
 	starpu_mpi_barrier(MPI_COMM_WORLD);
@@ -185,7 +186,7 @@ int main(int argc, char **argv)
 	 * */
 
 	float ***bmat;
-        int rank, nodes;
+	int rank, nodes;
 
 	parse_args(argc, argv);
 
@@ -201,19 +202,34 @@ int main(int argc, char **argv)
 	starpu_mpi_initialize_extended(&rank, &nodes);
 	starpu_helper_cublas_init();
 
+	if (dblockx == -1 || dblocky == -1)
+	{
+	     int factor;
+	     dblockx = nodes;
+	     dblocky = 1;
+	     for(factor=sqrt(nodes) ; factor>1 ; factor--)
+	     {
+		  if (nodes % factor == 0)
+		  {
+		       dblockx = nodes/factor;
+		       dblocky = factor;
+		  }
+	     }
+	}
+
 	unsigned i,j,x,y;
-        bmat = malloc(nblocks * sizeof(float *));
-        for(x=0 ; x<nblocks ; x++)
+	bmat = malloc(nblocks * sizeof(float *));
+	for(x=0 ; x<nblocks ; x++)
 	{
-                bmat[x] = malloc(nblocks * sizeof(float *));
-                for(y=0 ; y<nblocks ; y++)
+		bmat[x] = malloc(nblocks * sizeof(float *));
+		for(y=0 ; y<nblocks ; y++)
 		{
-                        starpu_malloc((void **)&bmat[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
+			starpu_malloc((void **)&bmat[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
 			for (i = 0; i < BLOCKSIZE; i++)
 			{
 				for (j = 0; j < BLOCKSIZE; j++)
 				{
-                                        bmat[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
+					bmat[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
 					//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
 				}
 			}
@@ -221,15 +237,15 @@ int main(int argc, char **argv)
 	}
 
 
-        if (display)
+	if (display)
 	{
-                printf("[%d] Input :\n", rank);
+		printf("[%d] Input :\n", rank);
 
 		for(y=0 ; y<nblocks ; y++)
 		{
 			for(x=0 ; x<nblocks ; x++)
 			{
-                                printf("Block %d,%d :\n", x, y);
+				printf("Block %d,%d :\n", x, y);
 				for (j = 0; j < BLOCKSIZE; j++)
 				{
 					for (i = 0; i < BLOCKSIZE; i++)
@@ -253,14 +269,14 @@ int main(int argc, char **argv)
 
 	starpu_mpi_shutdown();
 
-        if (display)
+	if (display)
 	{
-                printf("[%d] Results :\n", rank);
+		printf("[%d] Results :\n", rank);
 		for(y=0 ; y<nblocks ; y++)
 		{
 			for(x=0 ; x<nblocks ; x++)
 			{
-                                printf("Block %d,%d :\n", x, y);
+				printf("Block %d,%d :\n", x, y);
 				for (j = 0; j < BLOCKSIZE; j++)
 				{
 					for (i = 0; i < BLOCKSIZE; i++)
@@ -281,19 +297,19 @@ int main(int argc, char **argv)
 	}
 
 	float *rmat = malloc(size*size*sizeof(float));
-        for(x=0 ; x<nblocks ; x++)
+	for(x=0 ; x<nblocks ; x++)
 	{
-                for(y=0 ; y<nblocks ; y++)
+		for(y=0 ; y<nblocks ; y++)
 		{
-                        for (i = 0; i < BLOCKSIZE; i++)
+			for (i = 0; i < BLOCKSIZE; i++)
 			{
-                                for (j = 0; j < BLOCKSIZE; j++)
+				for (j = 0; j < BLOCKSIZE; j++)
 				{
-                                        rmat[j+(y*BLOCKSIZE)+(i+(x*BLOCKSIZE))*size] = bmat[x][y][j +i*BLOCKSIZE];
-                                }
-                        }
-                }
-        }
+					rmat[j+(y*BLOCKSIZE)+(i+(x*BLOCKSIZE))*size] = bmat[x][y][j +i*BLOCKSIZE];
+				}
+			}
+		}
+	}
 
 	fprintf(stderr, "[%d] compute explicit LLt ...\n", rank);
 	for (j = 0; j < size; j++)
@@ -310,62 +326,62 @@ int main(int argc, char **argv)
 	STARPU_ASSERT(test_mat);
 
 	SSYRK("L", "N", size, size, 1.0f,
-				rmat, size, 0.0f, test_mat, size);
+			rmat, size, 0.0f, test_mat, size);
 
 	fprintf(stderr, "[%d] comparing results ...\n", rank);
-        if (display)
+	if (display)
 	{
-                for (j = 0; j < size; j++)
+		for (j = 0; j < size; j++)
 		{
-                        for (i = 0; i < size; i++)
+			for (i = 0; i < size; i++)
 			{
-                                if (i <= j)
+				if (i <= j)
 				{
-                                        printf("%2.2f\t", test_mat[j +i*size]);
-                                }
-                                else
+					printf("%2.2f\t", test_mat[j +i*size]);
+				}
+				else
 				{
-                                        printf(".\t");
-                                }
-                        }
-                        printf("\n");
-                }
-        }
+					printf(".\t");
+				}
+			}
+			printf("\n");
+		}
+	}
 
 	int correctness = 1;
-        for(x = 0; x < nblocks ;  x++)
+	for(x = 0; x < nblocks ;  x++)
 	{
-                for (y = 0; y < nblocks; y++)
+		for (y = 0; y < nblocks; y++)
 		{
-                        int mpi_rank = my_distrib(x, y, nodes);
-                        if (mpi_rank == rank)
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (mpi_rank == rank)
 			{
-                                for (i = (size/nblocks)*x ; i < (size/nblocks)*x+(size/nblocks); i++)
-                                {
-                                        for (j = (size/nblocks)*y ; j < (size/nblocks)*y+(size/nblocks); j++)
-                                        {
-                                                if (i <= j)
-                                                {
-                                                        float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-                                                        float err = abs(test_mat[j +i*size] - orig);
-                                                        if (err > 0.00001)
+				for (i = (size/nblocks)*x ; i < (size/nblocks)*x+(size/nblocks); i++)
+				{
+					for (j = (size/nblocks)*y ; j < (size/nblocks)*y+(size/nblocks); j++)
+					{
+						if (i <= j)
+						{
+							float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+							float err = abs(test_mat[j +i*size] - orig);
+							if (err > 0.00001)
 							{
-                                                                fprintf(stderr, "[%d] Error[%d, %d] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
+								fprintf(stderr, "[%d] Error[%d, %d] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
 								correctness = 0;
 								break;
-                                                        }
-                                                }
-                                        }
-                                }
-                        }
-                }
-        }
-
-        for(x=0 ; x<nblocks ; x++)
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+	for(x=0 ; x<nblocks ; x++)
 	{
-                for(y=0 ; y<nblocks ; y++)
+		for(y=0 ; y<nblocks ; y++)
 		{
-                        starpu_free((void *)bmat[x][y]);
+			starpu_free((void *)bmat[x][y]);
 		}
 		free(bmat[x]);
 	}
@@ -373,7 +389,7 @@ int main(int argc, char **argv)
 	free(rmat);
 	free(test_mat);
 
-        starpu_helper_cublas_shutdown();
+	starpu_helper_cublas_shutdown();
 	starpu_shutdown();
 
 	assert(correctness);

+ 16 - 2
mpi/examples/cholesky/mpi_cholesky.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -34,9 +34,11 @@
 
 static unsigned size = 4*1024;
 static unsigned nblocks = 16;
-static unsigned nbigblocks = 8;
+static unsigned nbigblocks = 2;
 static unsigned noprio = 0;
 static unsigned display = 0;
+static unsigned dblockx = -1;
+static unsigned dblocky = -1;
 
 void chol_cpu_codelet_update_u11(void **, void *);
 void chol_cpu_codelet_update_u21(void **, void *);
@@ -59,6 +61,18 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 			size = strtol(argv[++i], &argptr, 10);
 		}
 
+		if (strcmp(argv[i], "-dblockx") == 0)
+		{
+		        char *argptr;
+			dblockx = strtol(argv[++i], &argptr, 10);
+		}
+		
+		if (strcmp(argv[i], "-dblocky") == 0)
+		{
+		        char *argptr;
+			dblocky = strtol(argv[++i], &argptr, 10);
+		}
+	
 		if (strcmp(argv[i], "-nblocks") == 0)
 		{
 		        char *argptr;

+ 221 - 222
mpi/starpu_mpi_insert_task.c

@@ -21,31 +21,30 @@
 #include <starpu.h>
 #include <starpu_data.h>
 #include <common/utils.h>
-#include <starpu_hash.h>
-#include <common/htable32.h>
+#include <common/htable64.h>
 #include <util/starpu_insert_task_utils.h>
 #include <datawizard/coherency.h>
 
-//#define STARPU_MPI_VERBOSE	1
+//#define STARPU_MPI_VERBOSE 1
 #include <starpu_mpi_private.h>
 
 /* Whether we are allowed to keep copies of remote data. Does not work
  * yet: the sender has to know whether the receiver has it, keeping it
  * in an array indexed by node numbers. */
-//#define MPI_CACHE
+//#define MPI_CACHE 1
 #include <starpu_mpi_insert_task_cache.h>
 
 static void _starpu_mpi_tables_init()
 {
-        if (sent_data == NULL) {
-                int nb_nodes;
+	if (sent_data == NULL) {
+		int nb_nodes;
 		int i;
 
-                MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
-		_STARPU_MPI_DEBUG("Initialising hash table for cache\n");
-		sent_data = malloc(nb_nodes * sizeof(struct starpu_htbl32_node *));
+		MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
+		_STARPU_MPI_DEBUG("Initialising htable for cache\n");
+		sent_data = malloc(nb_nodes * sizeof(struct starpu_htbl64_node *));
 		for(i=0 ; i<nb_nodes ; i++) sent_data[i] = NULL;
-		received_data = malloc(nb_nodes * sizeof(struct starpu_htbl32_node *));
+		received_data = malloc(nb_nodes * sizeof(struct starpu_htbl64_node *));
 		for(i=0 ; i<nb_nodes ; i++) received_data[i] = NULL;
 	}
 }
@@ -59,15 +58,15 @@ void _starpu_data_deallocate(starpu_data_handle_t data_handle)
 
 int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 {
-        int arg_type;
-        va_list varg_list;
-        int me, do_execute, xrank, nb_nodes;
+	int arg_type;
+	va_list varg_list;
+	int me, do_execute, xrank, nb_nodes;
 	size_t *size_on_nodes;
 	size_t arg_buffer_size = 0;
 	char *arg_buffer;
-        int dest=0, inconsistent_execute;
+	int dest=0, inconsistent_execute;
 
-        _STARPU_MPI_LOG_IN();
+	_STARPU_MPI_LOG_IN();
 
 	MPI_Comm_rank(comm, &me);
 	MPI_Comm_size(comm, &nb_nodes);
@@ -76,78 +75,78 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
 	_starpu_mpi_tables_init();
 
-        /* Get the number of buffers and the size of the arguments */
+	/* Get the number of buffers and the size of the arguments */
 	va_start(varg_list, codelet);
-        arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
+	arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
 
 	va_start(varg_list, codelet);
 	_starpu_codelet_pack_args(arg_buffer_size, &arg_buffer, varg_list);
 
 	/* Find out whether we are to execute the data because we own the data to be written to. */
-        inconsistent_execute = 0;
-        do_execute = -1;
+	inconsistent_execute = 0;
+	do_execute = -1;
 	xrank = -1;
 	va_start(varg_list, codelet);
 	while ((arg_type = va_arg(varg_list, int)) != 0) {
 		if (arg_type==STARPU_EXECUTE_ON_NODE) {
-                        xrank = va_arg(varg_list, int);
+			xrank = va_arg(varg_list, int);
 			_STARPU_MPI_DEBUG("Executing on node %d\n", xrank);
 			do_execute = 1;
-                }
+		}
 		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
-                        xrank = starpu_data_get_rank(data);
+			xrank = starpu_data_get_rank(data);
 			_STARPU_MPI_DEBUG("Executing on data node %d\n", xrank);
 			STARPU_ASSERT(xrank <= nb_nodes);
 			do_execute = 1;
-                }
+		}
 		else if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
-                        starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 
-                        if (data && arg_type & STARPU_R) {
+			if (data && arg_type & STARPU_R) {
 				int rank = starpu_data_get_rank(data);
 				struct starpu_data_interface_ops *ops;
 				ops = data->ops;
 				size_on_nodes[rank] += ops->get_size(data);
 			}
 
-                        if (arg_type & STARPU_W) {
-                                if (!data) {
-                                        /* We don't have anything allocated for this.
-                                         * The application knows we won't do anything
-                                         * about this task */
-                                        /* Yes, the app could actually not call
-                                         * insert_task at all itself, this is just a
-                                         * safeguard. */
-                                        _STARPU_MPI_DEBUG("oh oh\n");
-                                        _STARPU_MPI_LOG_OUT();
+			if (arg_type & STARPU_W) {
+				if (!data) {
+					/* We don't have anything allocated for this.
+					 * The application knows we won't do anything
+					 * about this task */
+					/* Yes, the app could actually not call
+					 * insert_task at all itself, this is just a
+					 * safeguard. */
+					_STARPU_MPI_DEBUG("oh oh\n");
+					_STARPU_MPI_LOG_OUT();
 					free(size_on_nodes);
-                                        return -EINVAL;
-                                }
-                                int mpi_rank = starpu_data_get_rank(data);
-                                if (mpi_rank == me) {
-                                        if (do_execute == 0) {
-                                                inconsistent_execute = 1;
-                                        }
-                                        else {
-                                                do_execute = 1;
-                                        }
-                                }
-                                else if (mpi_rank != -1) {
-                                        if (do_execute == 1) {
-                                                inconsistent_execute = 1;
-                                        }
-                                        else {
-                                                do_execute = 0;
-                                                dest = mpi_rank;
-                                                /* That's the rank which needs the data to be sent to */
-                                        }
-                                }
-                                else {
-                                        _STARPU_ERROR("rank invalid\n");
-                                }
-                        }
-                }
+					return -EINVAL;
+				}
+				int mpi_rank = starpu_data_get_rank(data);
+				if (mpi_rank == me) {
+					if (do_execute == 0) {
+						inconsistent_execute = 1;
+					}
+					else {
+						do_execute = 1;
+					}
+				}
+				else if (mpi_rank != -1) {
+					if (do_execute == 1) {
+						inconsistent_execute = 1;
+					}
+					else {
+						do_execute = 0;
+						dest = mpi_rank;
+						/* That's the rank which needs the data to be sent to */
+					}
+				}
+				else {
+					_STARPU_ERROR("rank invalid\n");
+				}
+			}
+		}
 		else if (arg_type==STARPU_VALUE) {
 			va_arg(varg_list, void *);
 			va_arg(varg_list, size_t);
@@ -193,70 +192,69 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
 	STARPU_ASSERT(do_execute != -1 && "StarPU needs to see a W or a REDUX data which will tell it where to execute the task");
 
-        if (inconsistent_execute == 1) {
-                if (xrank == -1) {
-                        _STARPU_MPI_DEBUG("Different tasks are owning W data. Needs to specify which one is to execute the codelet, using STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA\n");
+	if (inconsistent_execute == 1) {
+		if (xrank == -1) {
+			_STARPU_MPI_DEBUG("Different tasks are owning W data. Needs to specify which one is to execute the codelet, using STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA\n");
 			free(size_on_nodes);
 			return -EINVAL;
-                }
-                else {
-                        do_execute = (me == xrank);
-                        dest = xrank;
-                }
-        }
+		}
+		else {
+			do_execute = (me == xrank);
+			dest = xrank;
+		}
+	}
 	else if (xrank != -1) {
 		do_execute = (me == xrank);
 		dest = xrank;
 	}
 
-        /* Send and receive data as requested */
+	/* Send and receive data as requested */
 	va_start(varg_list, codelet);
 	while ((arg_type = va_arg(varg_list, int)) != 0) {
 		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH) {
-                        starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
-                        if (data && arg_type & STARPU_R) {
-                                int mpi_rank = starpu_data_get_rank(data);
+			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+			if (data && arg_type & STARPU_R) {
+				int mpi_rank = starpu_data_get_rank(data);
 				int mpi_tag = starpu_data_get_tag(data);
 				STARPU_ASSERT(mpi_tag >= 0 && "StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank");
-                                /* The task needs to read this data */
-                                if (do_execute && mpi_rank != me && mpi_rank != -1) {
-                                        /* I will have to execute but I don't have the data, receive */
+				/* The task needs to read this data */
+				if (do_execute && mpi_rank != me && mpi_rank != -1) {
+					/* I will have to execute but I don't have the data, receive */
 #ifdef MPI_CACHE
-                                        uint32_t key = starpu_crc32_be((uintptr_t)data, 0);
-                                        void *already_received = _starpu_htbl_search_32(received_data[mpi_rank], key);
-                                        if (!already_received) {
-                                                _starpu_htbl_insert_32(&received_data[mpi_rank], key, data);
-                                        }
-                                        else {
-                                                _STARPU_MPI_DEBUG("Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
-                                        }
-                                        if (!already_received)
+					void *already_received = _starpu_htbl_search_64(received_data[mpi_rank], (uintptr_t) data);
+					if (!already_received) {
+						_starpu_htbl_insert_64(&received_data[mpi_rank], (uintptr_t) data, data);
+					}
+					else {
+						_STARPU_MPI_DEBUG("Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
+					}
+					if (!already_received)
 #endif
 					{
 						_STARPU_MPI_DEBUG("Receive data %p from %d\n", data, mpi_rank);
 						starpu_mpi_irecv_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
 					}
-                                }
-                                if (!do_execute && mpi_rank == me) {
-                                        /* Somebody else will execute it, and I have the data, send it. */
+				}
+				if (!do_execute && mpi_rank == me) {
+					/* Somebody else will execute it, and I have the data, send it. */
 #ifdef MPI_CACHE
-                                        uint32_t key = starpu_crc32_be((uintptr_t)data, 0);
-                                        void *already_sent = _starpu_htbl_search_32(sent_data[dest], key);
-                                        if (!already_sent) {
-                                                _starpu_htbl_insert_32(&sent_data[dest], key, data);
-                                        }
-                                        else {
-                                                _STARPU_MPI_DEBUG("Do not sent data %p to node %d as it has already been sent\n", data, dest);
-                                        }
-                                        if (!already_sent)
+					void *already_sent = _starpu_htbl_search_64(sent_data[dest], (uintptr_t) data);
+
+					if (!already_sent) {
+						_starpu_htbl_insert_64(&sent_data[dest], (uintptr_t) data, data);
+					}
+					else {
+						_STARPU_MPI_DEBUG("Do not send data %p to node %d as it has already been sent\n", data, dest);
+					}
+					if (!already_sent)
 #endif
 					{
 						_STARPU_MPI_DEBUG("Send data %p to %d\n", data, dest);
 						starpu_mpi_isend_detached(data, dest, mpi_tag, comm, NULL, NULL);
 					}
-                                }
-                        }
-                }
+				}
+			}
+		}
 		else if (arg_type==STARPU_VALUE) {
 			va_arg(varg_list, void *);
 			va_arg(varg_list, size_t);
@@ -280,107 +278,107 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
 			va_arg(varg_list, starpu_data_handle_t);
 		}
-        }
+	}
 	va_end(varg_list);
 
 	if (do_execute) {
-                _STARPU_MPI_DEBUG("Execution of the codelet %p (%s)\n", codelet, codelet->name);
-                va_start(varg_list, codelet);
-                struct starpu_task *task = starpu_task_create();
-                int ret = _starpu_insert_task_create_and_submit(arg_buffer, codelet, &task, varg_list);
-                _STARPU_MPI_DEBUG("ret: %d\n", ret);
-                STARPU_ASSERT(ret==0);
-        }
-
-        if (inconsistent_execute) {
-                va_start(varg_list, codelet);
-                while ((arg_type = va_arg(varg_list, int)) != 0) {
-                        if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH) {
-                                starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
-                                if (arg_type & STARPU_W) {
-                                        int mpi_rank = starpu_data_get_rank(data);
+		_STARPU_MPI_DEBUG("Execution of the codelet %p (%s)\n", codelet, codelet->name);
+		va_start(varg_list, codelet);
+		struct starpu_task *task = starpu_task_create();
+		int ret = _starpu_insert_task_create_and_submit(arg_buffer, codelet, &task, varg_list);
+		_STARPU_MPI_DEBUG("ret: %d\n", ret);
+		STARPU_ASSERT(ret==0);
+	}
+
+	if (inconsistent_execute) {
+		va_start(varg_list, codelet);
+		while ((arg_type = va_arg(varg_list, int)) != 0) {
+			if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH) {
+				starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+				if (arg_type & STARPU_W) {
+					int mpi_rank = starpu_data_get_rank(data);
 					int mpi_tag = starpu_data_get_tag(data);
 					STARPU_ASSERT(mpi_tag >= 0 && "StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank");
-                                        if (mpi_rank == me) {
-                                                if (xrank != -1 && me != xrank) {
-                                                        _STARPU_MPI_DEBUG("Receive data %p back from the task %d which executed the codelet ...\n", data, dest);
-                                                        starpu_mpi_irecv_detached(data, dest, mpi_tag, comm, NULL, NULL);
-                                                }
-                                        }
-                                        else if (do_execute) {
-                                                _STARPU_MPI_DEBUG("Send data %p back to its owner %d...\n", data, mpi_rank);
-                                                starpu_mpi_isend_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
-                                        }
-                                }
-                        }
-                        else if (arg_type==STARPU_VALUE) {
-                                va_arg(varg_list, void *);
+					if (mpi_rank == me) {
+						if (xrank != -1 && me != xrank) {
+							_STARPU_MPI_DEBUG("Receive data %p back from the task %d which executed the codelet ...\n", data, dest);
+							starpu_mpi_irecv_detached(data, dest, mpi_tag, comm, NULL, NULL);
+						}
+					}
+					else if (do_execute) {
+						_STARPU_MPI_DEBUG("Send data %p back to its owner %d...\n", data, mpi_rank);
+						starpu_mpi_isend_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
+					}
+				}
+			}
+			else if (arg_type==STARPU_VALUE) {
+				va_arg(varg_list, void *);
 				va_arg(varg_list, size_t);
-                        }
-                        else if (arg_type==STARPU_CALLBACK) {
-                                va_arg(varg_list, void (*)(void *));
-                        }
-                        else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
-                                va_arg(varg_list, void (*)(void *));
-                                va_arg(varg_list, void *);
-                        }
-                        else if (arg_type==STARPU_CALLBACK_ARG) {
-                                va_arg(varg_list, void *);
-                        }
-                        else if (arg_type==STARPU_PRIORITY) {
-                                va_arg(varg_list, int);
-                        }
-                        else if (arg_type==STARPU_EXECUTE_ON_NODE) {
-                                va_arg(varg_list, int);
-                        }
-                        else if (arg_type==STARPU_EXECUTE_ON_DATA) {
-                                va_arg(varg_list, starpu_data_handle_t);
-                        }
-                }
-                va_end(varg_list);
-        }
+			}
+			else if (arg_type==STARPU_CALLBACK) {
+				va_arg(varg_list, void (*)(void *));
+			}
+			else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+				va_arg(varg_list, void (*)(void *));
+				va_arg(varg_list, void *);
+			}
+			else if (arg_type==STARPU_CALLBACK_ARG) {
+				va_arg(varg_list, void *);
+			}
+			else if (arg_type==STARPU_PRIORITY) {
+				va_arg(varg_list, int);
+			}
+			else if (arg_type==STARPU_EXECUTE_ON_NODE) {
+				va_arg(varg_list, int);
+			}
+			else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+				va_arg(varg_list, starpu_data_handle_t);
+			}
+		}
+		va_end(varg_list);
+	}
 
 	va_start(varg_list, codelet);
 	while ((arg_type = va_arg(varg_list, int)) != 0) {
 		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH) {
-                        starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 #ifdef MPI_CACHE
-                        if (arg_type & STARPU_W) {
-                                uint32_t key = starpu_crc32_be((uintptr_t)data, 0);
-                                if (do_execute) {
-                                        /* Note that all copies I've sent to neighbours are now invalid */
-                                        int n, size;
-                                        MPI_Comm_size(comm, &size);
-                                        for(n=0 ; n<size ; n++) {
-                                                void *already_sent = _starpu_htbl_search_32(sent_data[n], key);
-                                                if (already_sent) {
-                                                        _STARPU_MPI_DEBUG("Posting request to clear send cache for data %p\n", data);
-                                                        _starpu_mpi_clear_cache_request(data, n, _STARPU_MPI_CLEAR_SENT_DATA);
-                                                }
-                                        }
-                                }
-                                else {
-                                        int mpi_rank = starpu_data_get_rank(data);
-                                        void *already_received = _starpu_htbl_search_32(received_data[mpi_rank], key);
-                                        if (already_received) {
-                                                /* Somebody else will write to the data, so discard our cached copy if any */
-                                                /* TODO: starpu_mpi could just remember itself. */
-                                                _STARPU_MPI_DEBUG("Posting request to clear receive cache for data %p\n", data);
-                                                _starpu_mpi_clear_cache_request(data, mpi_rank, _STARPU_MPI_CLEAR_RECEIVED_DATA);
-                                                _starpu_data_deallocate(data);
-                                        }
-                                }
-                        }
+			if (arg_type & STARPU_W) {
+				if (do_execute) {
+					/* Note that all copies I've sent to neighbours are now invalid */
+					int n, size;
+					MPI_Comm_size(comm, &size);
+					for(n=0 ; n<size ; n++) {
+						void *already_sent = _starpu_htbl_search_64(sent_data[n], (uintptr_t) data);
+
+						if (already_sent) {
+							_STARPU_MPI_DEBUG("Posting request to clear send cache for data %p\n", data);
+							_starpu_mpi_clear_cache_request(data, n, _STARPU_MPI_CLEAR_SENT_DATA);
+						}
+					}
+				}
+				else {
+					int mpi_rank = starpu_data_get_rank(data);
+					void *already_received=_starpu_htbl_search_64(received_data[mpi_rank], (uintptr_t) data);
+					if (already_received) {
+						/* Somebody else will write to the data, so discard our cached copy if any */
+						/* TODO: starpu_mpi could just remember itself. */
+						_STARPU_MPI_DEBUG("Posting request to clear receive cache for data %p\n", data);
+						_starpu_mpi_clear_cache_request(data, mpi_rank, _STARPU_MPI_CLEAR_RECEIVED_DATA);
+						_starpu_data_deallocate(data);
+					}
+				}
+			}
 #else
-                        /* We allocated a temporary buffer for the received data, now drop it */
-                        if ((arg_type & STARPU_R) && do_execute) {
-                                int mpi_rank = starpu_data_get_rank(data);
-                                if (mpi_rank != me && mpi_rank != -1) {
-                                        _starpu_data_deallocate(data);
-                                }
-                        }
+			/* We allocated a temporary buffer for the received data, now drop it */
+			if ((arg_type & STARPU_R) && do_execute) {
+				int mpi_rank = starpu_data_get_rank(data);
+				if (mpi_rank != me && mpi_rank != -1) {
+					_starpu_data_deallocate(data);
+				}
+			}
 #endif
-                }
+		}
 		else if (arg_type==STARPU_VALUE) {
 			va_arg(varg_list, void *);
 			va_arg(varg_list, size_t);
@@ -404,59 +402,60 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
 			va_arg(varg_list, starpu_data_handle_t);
 		}
-        }
+	}
+
 	va_end(varg_list);
-        _STARPU_MPI_LOG_OUT();
-        return 0;
+	_STARPU_MPI_LOG_OUT();
+	return 0;
 }
 
 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
 {
-        int me, rank, tag;
+	int me, rank, tag;
 
-        rank = starpu_data_get_rank(data_handle);
-        tag = starpu_data_get_tag(data_handle);
+	rank = starpu_data_get_rank(data_handle);
+	tag = starpu_data_get_tag(data_handle);
 	MPI_Comm_rank(comm, &me);
 
-        if (node == rank) return;
+	if (node == rank) return;
 
-        if (me == node)
-        {
+	if (me == node)
+	{
 		starpu_mpi_irecv_detached(data_handle, rank, tag, comm, callback, arg);
-        }
-        else if (me == rank)
-        {
+	}
+	else if (me == rank)
+	{
 		starpu_mpi_isend_detached(data_handle, node, tag, comm, NULL, NULL);
-        }
+	}
 }
 
 void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node)
 {
-        int me, rank, tag;
+	int me, rank, tag;
 
-        rank = starpu_data_get_rank(data_handle);
-        tag = starpu_data_get_tag(data_handle);
+	rank = starpu_data_get_rank(data_handle);
+	tag = starpu_data_get_tag(data_handle);
 	MPI_Comm_rank(comm, &me);
 
-        if (node == rank) return;
-
-        if (me == node)
-        {
-                MPI_Status status;
-                starpu_mpi_recv(data_handle, rank, tag, comm, &status);
-        }
-        else if (me == rank)
-        {
-                starpu_mpi_send(data_handle, node, tag, comm);
-        }
+	if (node == rank) return;
+
+	if (me == node)
+	{
+		MPI_Status status;
+		starpu_mpi_recv(data_handle, rank, tag, comm, &status);
+	}
+	else if (me == rank)
+	{
+		starpu_mpi_send(data_handle, node, tag, comm);
+	}
 }
 
 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 {
-        int me, rank, tag, nb_nodes;
+	int me, rank, tag, nb_nodes;
 
-        rank = starpu_data_get_rank(data_handle);
-        tag = starpu_data_get_tag(data_handle);
+	rank = starpu_data_get_rank(data_handle);
+	tag = starpu_data_get_tag(data_handle);
 
 	MPI_Comm_rank(comm, &me);
 	MPI_Comm_size(comm, &nb_nodes);
@@ -477,9 +476,9 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 
 				starpu_mpi_irecv_detached(new_handle, i, tag, comm, NULL, NULL);
 				starpu_insert_task(data_handle->redux_cl,
-						   STARPU_RW, data_handle,
-						   STARPU_R, new_handle,
-						   0);
+						STARPU_RW, data_handle,
+						STARPU_R, new_handle,
+						0);
 			}
 		}
 	}

+ 6 - 8
mpi/starpu_mpi_insert_task_cache.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011-2012  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,8 +17,7 @@
 
 #include <starpu_mpi_private.h>
 #include <starpu_mpi_insert_task_cache.h>
-#include <starpu_hash.h>
-#include <common/htable32.h>
+#include <common/htable64.h>
 
 typedef struct _starpu_mpi_clear_cache_s {
         starpu_data_handle_t data;
@@ -26,21 +25,20 @@ typedef struct _starpu_mpi_clear_cache_s {
         int mode;
 } _starpu_mpi_clear_cache_t;
 
-struct starpu_htbl32_node **sent_data = NULL;
-struct starpu_htbl32_node **received_data = NULL;
+struct starpu_htbl64_node **sent_data = NULL;
+struct starpu_htbl64_node **received_data = NULL;
 
 void _starpu_mpi_clear_cache_callback(void *callback_arg)
 {
         _starpu_mpi_clear_cache_t *clear_cache = (_starpu_mpi_clear_cache_t *)callback_arg;
-        uint32_t key = starpu_crc32_be((uintptr_t)clear_cache->data, 0);
 
         if (clear_cache->mode == _STARPU_MPI_CLEAR_SENT_DATA) {
                 _STARPU_MPI_DEBUG("Clearing sent cache for data %p and rank %d\n", clear_cache->data, clear_cache->rank);
-                _starpu_htbl_insert_32(&sent_data[clear_cache->rank], key, NULL);
+                _starpu_htbl_insert_64(&sent_data[clear_cache->rank], (uintptr_t)clear_cache->data, NULL);
         }
         else if (clear_cache->mode == _STARPU_MPI_CLEAR_RECEIVED_DATA) {
                 _STARPU_MPI_DEBUG("Clearing received cache for data %p and rank %d\n", clear_cache->data, clear_cache->rank);
-                _starpu_htbl_insert_32(&received_data[clear_cache->rank], key, NULL);
+                _starpu_htbl_insert_64(&received_data[clear_cache->rank], (uintptr_t)clear_cache->data, NULL);
         }
 
         free(clear_cache);

+ 3 - 2
mpi/starpu_mpi_insert_task_cache.h

@@ -16,11 +16,12 @@
  */
 
 #include <starpu.h>
+#include <common/list.h>
 
 #define _STARPU_MPI_CLEAR_SENT_DATA     0
 #define _STARPU_MPI_CLEAR_RECEIVED_DATA 1
 
-extern struct starpu_htbl32_node **sent_data;
-extern struct starpu_htbl32_node **received_data;
+extern struct starpu_htbl64_node **sent_data;
+extern struct starpu_htbl64_node **received_data;
 
 void _starpu_mpi_clear_cache_request(starpu_data_handle_t data_handle, int rank, int mode);

+ 1 - 1
socl/src/cl_createbuffer.c

@@ -26,7 +26,7 @@ static void release_callback_memobject(void * e) {
   mem_object_release(mem);
 
   /* Destruct object */
-  starpu_data_unregister_no_coherency(mem->handle);
+  starpu_data_unregister_lazy(mem->handle);
 
   if (!(mem->flags & CL_MEM_USE_HOST_PTR))
     free(mem->ptr);

+ 2 - 2
socl/src/devices.c

@@ -18,9 +18,9 @@
 #include "devices.h"
 
 // OpenCL 1.0 : Mandatory format: major_number.minor_number
-const char * SOCL_DRIVER_VERSION = "0.1";
+const char * __attribute__ ((aligned (16))) SOCL_DRIVER_VERSION = "0.1";
 
-const cl_uint SOCL_DEVICE_VENDOR_ID = 666;
+const cl_uint __attribute__ ((aligned (16))) SOCL_DEVICE_VENDOR_ID = 666;
 
 const struct _cl_device_id socl_devices[] = {
    { 

+ 6 - 6
socl/src/socl.c

@@ -18,15 +18,15 @@
 
 struct _cl_platform_id socl_platform = {};
 
-const char * SOCL_PROFILE = "FULL_PROFILE";
-const char * SOCL_VERSION = "OpenCL 1.0 StarPU Edition (0.0.1)";
-const char * SOCL_PLATFORM_NAME    = "StarPU Platform";
-const char * SOCL_VENDOR  = "INRIA";
-const char * SOCL_PLATFORM_EXTENSIONS = "";
+const char * __attribute__ ((aligned (16))) SOCL_PROFILE = "FULL_PROFILE" ;
+const char * __attribute__ ((aligned (16))) SOCL_VERSION = "OpenCL 1.0 StarPU Edition (0.0.1)";
+const char * __attribute__ ((aligned (16))) SOCL_PLATFORM_NAME    = "StarPU Platform";
+const char * __attribute__ ((aligned (16))) SOCL_VENDOR  = "INRIA";
+const char * __attribute__ ((aligned (16))) SOCL_PLATFORM_EXTENSIONS = "";
 
 
 /* Command queues with profiling enabled
  * This allows us to disable StarPU profiling it
  * is equal to 0
  */
-int profiling_queue_count = 0;
+int __attribute__ ((aligned (16))) profiling_queue_count = 0;

+ 2 - 0
src/Makefile.am

@@ -92,6 +92,7 @@ noinst_HEADERS = 						\
 	common/barrier.h					\
 	common/timing.h						\
 	common/htable32.h					\
+	common/htable64.h					\
 	common/list.h						\
 	common/rwlock.h						\
 	common/starpu_spinlock.h				\
@@ -123,6 +124,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
 	common/barrier_counter.c				\
 	common/hash.c 						\
 	common/htable32.c					\
+	common/htable64.c					\
 	common/rwlock.c						\
 	common/starpu_spinlock.c				\
 	common/timing.c						\

+ 1 - 1
src/common/htable32.c

@@ -106,7 +106,7 @@ void *_starpu_htbl_insert_32(struct starpu_htbl32_node **htbl, uint32_t key, voi
 static void _starpu_htbl_destroy_32_bit(struct starpu_htbl32_node *htbl, unsigned bit, void (*remove)(void*))
 {
 	unsigned keysize = sizeof(uint32_t)*8;
-	int i;
+	unsigned i;
 
 	if (!htbl)
 		return;

+ 2 - 2
src/common/htable32.h

@@ -23,9 +23,9 @@
 #include <stdio.h>
 #include <assert.h>
 
-#define _STARPU_HTBL32_NODE_SIZE	16
+#define _STARPU_HTBL32_NODE_SIZE	8
 
-/* Hierarchical table: all nodes have a 2^16 arity . */
+/* Hierarchical table: all nodes have a 2^8 arity . */
 /* Note: this struct is used in include/starpu_perfmodel.h */
 struct starpu_htbl32_node {
 	unsigned nentries;

+ 106 - 0
src/common/htable64.c

@@ -0,0 +1,106 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <common/config.h>
+#include <common/htable64.h>
+#include <stdint.h>
+#include <string.h>
+
+void *_starpu_htbl_search_64(struct starpu_htbl64_node *htbl, uint64_t key)
+{
+	unsigned currentbit;
+	unsigned keysize = sizeof(uint64_t)*8;
+
+	struct starpu_htbl64_node *current_htbl = htbl;
+	uint64_t mask = (1ULL<<_STARPU_HTBL64_NODE_SIZE)-1;
+
+	for(currentbit = 0; currentbit < keysize; currentbit+=_STARPU_HTBL64_NODE_SIZE)
+	{
+		if (STARPU_UNLIKELY(current_htbl == NULL))
+			return NULL;
+
+		unsigned last_currentbit =
+			keysize - (currentbit + _STARPU_HTBL64_NODE_SIZE);
+		uint64_t offloaded_mask = mask << last_currentbit;
+		unsigned current_index =
+			(key & (offloaded_mask)) >> (last_currentbit);
+
+		current_htbl = current_htbl->children[current_index];
+	}
+	return current_htbl;
+}
+
+/*
+ * returns the previous value of the tag, or NULL else
+ */
+
+void *_starpu_htbl_insert_64(struct starpu_htbl64_node **htbl, uint64_t key, void *entry)
+{
+	unsigned currentbit;
+	unsigned keysize = sizeof(uint64_t)*8;
+	struct starpu_htbl64_node **current_htbl_ptr = htbl;
+
+	uint64_t mask = (1ULL<<_STARPU_HTBL64_NODE_SIZE)-1;
+	for(currentbit = 0; currentbit < keysize; currentbit+=_STARPU_HTBL64_NODE_SIZE)
+	{
+		if (*current_htbl_ptr == NULL)
+		{
+			*current_htbl_ptr = (struct starpu_htbl64_node*)calloc(sizeof(struct starpu_htbl64_node), 1);
+			STARPU_ASSERT(*current_htbl_ptr);
+		}
+
+		unsigned last_currentbit =
+			keysize - (currentbit + _STARPU_HTBL64_NODE_SIZE);
+		uint64_t offloaded_mask = mask << last_currentbit;
+		unsigned current_index =
+			(key & (offloaded_mask)) >> (last_currentbit);
+
+		current_htbl_ptr =
+			&((*current_htbl_ptr)->children[current_index]);
+	}
+	void *old_entry = *current_htbl_ptr;
+	*current_htbl_ptr = (struct starpu_htbl64_node *) entry;
+
+	return old_entry;
+}
+
+static void _starpu_htbl_destroy_64_bit(struct starpu_htbl64_node *htbl, unsigned bit, void (*remove)(void*))
+{
+	unsigned keysize = sizeof(uint64_t)*8;
+	unsigned i;
+
+	if (!htbl)
+		return;
+
+	if (bit >= keysize) {
+		/* entry, delete it */
+		if (remove)
+			remove(htbl);
+		return;
+	}
+
+	for (i = 0; i < 1ULL<<_STARPU_HTBL64_NODE_SIZE; i++) {
+		_starpu_htbl_destroy_64_bit(htbl->children[i], bit+_STARPU_HTBL64_NODE_SIZE, remove);
+	}
+
+	free(htbl);
+}
+void _starpu_htbl_destroy_64(struct starpu_htbl64_node *htbl, void (*remove)(void*))
+{
+	_starpu_htbl_destroy_64_bit(htbl, 0, remove);
+}

+ 46 - 0
src/common/htable64.h

@@ -0,0 +1,46 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __GENERIC_HTABLE_H__
+#define __GENERIC_HTABLE_H__
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#define _STARPU_HTBL64_NODE_SIZE	8 
+
+/* Hierarchical table: all nodes have a 2^8 arity . */
+struct starpu_htbl64_node {
+	unsigned nentries;
+	struct starpu_htbl64_node *children[1ULL<<_STARPU_HTBL64_NODE_SIZE];
+};
+
+/* Look for a 64bit key into the hierchical table. Returns the entry if
+ * something is found, NULL otherwise. */
+void *_starpu_htbl_search_64(struct starpu_htbl64_node *htbl, uint64_t key);
+
+/* Insert an entry indexed by the 64bit key into the hierarchical table.
+ * Returns the entry that was previously associated to that key if any, NULL
+ * otherwise. */
+void *_starpu_htbl_insert_64(struct starpu_htbl64_node **htbl, uint64_t key, void *entry);
+
+/* Delete the content of the table, `remove' being called on each element */
+void _starpu_htbl_destroy_64(struct starpu_htbl64_node *htbl, void (*remove)(void*));
+
+#endif // __GENERIC_HTABLE_H__

+ 2 - 2
src/core/dependencies/htable.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -28,7 +28,7 @@
 #include <assert.h>
 #include <core/dependencies/tags.h>
 
-#define _STARPU_HTBL_NODE_SIZE	16
+#define _STARPU_HTBL_NODE_SIZE	8
 
 struct _starpu_htbl_node
 {

+ 12 - 0
src/core/jobs.c

@@ -163,6 +163,18 @@ void _starpu_handle_job_termination(struct _starpu_job *j, int workerid)
 
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
+	/* We release handle reference count */
+	if (task->cl) {
+		unsigned i;
+		for (i=0; i<task->cl->nbuffers; i++) {
+			starpu_data_handle_t handle = task->handles[i];
+			_starpu_spin_lock(&handle->header_lock);
+			handle->busy_count--;
+			if (!_starpu_data_check_not_busy(handle))
+				_starpu_spin_unlock(&handle->header_lock);
+		}
+	}
+
 	/* Tell other tasks that we don't exist any more, thus no need for
 	 * implicit dependencies any more.  */
 	_starpu_release_task_enforce_sequential_consistency(j);

+ 2 - 2
src/core/perfmodel/perfmodel.c

@@ -225,7 +225,7 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 	unsigned i;
 	int err;
 	double sum = 0.0;
-	int node, cpu_node;
+	int node;
 
 	/* We need to get one node per archtype. This is kinda ugly,
 	 * but it does the job.
@@ -233,7 +233,7 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 	 * (err != 1 && err != -ERANGE)
 	 */
 #ifdef STARPU_USE_CPU
-	int cpu_worker;
+	int cpu_worker, cpu_node;
 	err = starpu_worker_get_ids_by_type(STARPU_CPU_WORKER,
 					    &cpu_worker, 1);
 	if (err != 1 && err != -ERANGE)

+ 54 - 15
src/core/perfmodel/perfmodel_bus.c

@@ -363,7 +363,7 @@ static int find_numa_node(hwloc_obj_t obj)
 }
 #endif
 
-static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *dev_timing_per_cpu, char type)
+static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *dev_timing_per_cpu, char *type)
 {
 	/* Either we have hwloc and we measure the bandwith between each GPU
 	 * and each NUMA node, or we don't have such NUMA information and we
@@ -423,11 +423,11 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 #endif
 
 #ifdef STARPU_USE_CUDA
-                if (type == 'C')
+                if (strncmp(type, "CUDA", 4) == 0)
                         measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(dev, cpu, dev_timing_per_cpu);
 #endif
 #ifdef STARPU_USE_OPENCL
-                if (type == 'O')
+                if (strncmp(type, "OpenCL", 6) == 0)
                         measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(dev, cpu, dev_timing_per_cpu);
 #endif
 
@@ -456,7 +456,7 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 }
 
 static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_timing_dtoh,
-                                                   struct dev_timing *dev_timing_per_cpu, char type)
+                                                   struct dev_timing *dev_timing_per_cpu, char *type)
 {
 	measure_bandwidth_between_cpus_and_dev(dev, dev_timing_per_cpu, type);
 
@@ -465,6 +465,9 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
               sizeof(struct dev_timing),
 			compar_dev_timing);
 
+#ifdef STARPU_DEVEL
+#  warning save timing_dtoh and timing_htod data to display them when calling starpu_machine_display ? (Brice would like that)
+#endif
 #ifdef STARPU_VERBOSE
         unsigned cpu;
 	for (cpu = 0; cpu < ncpus; cpu++)
@@ -475,12 +478,12 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
 
 		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
 
-		_STARPU_DISP("BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
+		_STARPU_DISP("(%10s) BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", type, dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
 	}
 
 	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
 
-	_STARPU_DISP("BANDWIDTH GPU %d BEST CPU %u\n", dev, best_cpu);
+	_STARPU_DISP("(%10s) BANDWIDTH GPU %d BEST CPU %u\n", type, dev, best_cpu);
 #endif
 
 	/* The results are sorted in a decreasing order, so that the best
@@ -529,16 +532,16 @@ static void benchmark_all_gpu_devices(void)
 	ncuda = _starpu_get_cuda_device_count();
 	for (i = 0; i < ncuda; i++)
 	{
-		fprintf(stderr," CUDA %d...", i);
+		_STARPU_DISP("CUDA %d...\n", i);
 		/* measure bandwidth between Host and Device i */
-		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, 'C');
+		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, "CUDA");
 	}
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	for (i = 0; i < ncuda; i++)
 		for (j = 0; j < ncuda; j++)
 			if (i != j)
 			{
-				fprintf(stderr," CUDA %d -> %d...", i, j);
+				_STARPU_DISP("CUDA %d -> %d...\n", i, j);
 				/* measure bandwidth between Host and Device i */
 				measure_bandwidth_between_dev_and_dev_cuda(i, j);
 			}
@@ -548,9 +551,9 @@ static void benchmark_all_gpu_devices(void)
         nopencl = _starpu_opencl_get_device_count();
 	for (i = 0; i < nopencl; i++)
 	{
-		fprintf(stderr," OpenCL %d...", i);
+		_STARPU_DISP("OpenCL %d...\n", i);
 		/* measure bandwith between Host and Device i */
-		measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, 'O');
+		measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, "OpenCL");
 	}
 #endif
 
@@ -579,7 +582,6 @@ static void benchmark_all_gpu_devices(void)
 static void get_bus_path(const char *type, char *path, size_t maxlen)
 {
 	_starpu_get_perf_model_dir_bus(path, maxlen);
-	strncat(path, type, maxlen);
 
 	char hostname[32];
 	char *forced_hostname = getenv("STARPU_HOSTNAME");
@@ -587,8 +589,9 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
 		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
 	else
 		gethostname(hostname, sizeof(hostname));
-	strncat(path, ".", maxlen);
 	strncat(path, hostname, maxlen);
+	strncat(path, ".", maxlen);
+	strncat(path, type, maxlen);
 }
 
 /*
@@ -766,6 +769,42 @@ int *_starpu_get_opencl_affinity_vector(unsigned gpuid)
 }
 #endif /* STARPU_USE_OPENCL */
 
+void starpu_bus_print_affinity(FILE *f)
+{
+	unsigned cpu;
+	int gpu;
+
+	fprintf(f, "# GPU\t");
+	for(cpu = 0 ; cpu < ncpus ; cpu++)
+	{
+		fprintf(f, "CPU%u\t", cpu);
+	}
+	fprintf(f, "\n");
+
+#ifdef STARPU_USE_CUDA
+	for(gpu = 0 ; gpu<ncuda ; gpu++)
+	{
+		fprintf(f, "%d\t", gpu);
+		for (cpu = 0; cpu < ncpus; cpu++)
+		{
+			fprintf(f, "%d\t", cuda_affinity_matrix[gpu][cpu]);
+		}
+		fprintf(f, "\n");
+	}
+#endif
+#ifdef STARPU_USE_OPENCL
+	for(gpu = 0 ; gpu<nopencl ; gpu++)
+	{
+		fprintf(f, "%d\t", gpu);
+		for (cpu = 0; cpu < ncpus; cpu++)
+		{
+			fprintf(f, "%d\t", opencl_affinity_matrix[gpu][cpu]);
+		}
+		fprintf(f, "\n");
+	}
+#endif
+}
+
 /*
  *	Latency
  */
@@ -1114,9 +1153,9 @@ static void check_bus_config_file()
         res = access(path, F_OK);
         if (res)
 	{
-		fprintf(stderr, "No performance model for the bus, calibrating...");
+		_STARPU_DISP("No performance model for the bus, calibrating...\n");
 		starpu_force_bus_sampling();
-		fprintf(stderr, "done\n");
+		_STARPU_DISP("... done\n");
         }
         else
 	{

+ 12 - 2
src/core/perfmodel/perfmodel_history.c

@@ -219,7 +219,7 @@ static void parse_arch(FILE *f, struct starpu_perfmodel *model, unsigned scan_hi
 		_STARPU_DEBUG("Parsing arch %u\n", arch);
 		_starpu_drop_comments(f);
 		ret = fscanf(f, "%d\n", &nimpls);
-		_STARPU_DEBUG("%u implementations\n", nimpls);
+		_STARPU_DEBUG("%d implementations\n", nimpls);
 		STARPU_ASSERT(ret == 1);
 		implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
 		skipimpl = nimpls - STARPU_MAXIMPLEMENTATIONS;
@@ -243,7 +243,7 @@ static void parse_arch(FILE *f, struct starpu_perfmodel *model, unsigned scan_hi
 		{
 			_STARPU_DEBUG("skipping arch %u\n", arch);
 			ret = fscanf(f, "%d\n", &nimpls);
-			_STARPU_DEBUG("%u implementations\n", nimpls);
+			_STARPU_DEBUG("%d implementations\n", nimpls);
 			STARPU_ASSERT(ret == 1);
 			implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
 			skipimpl = nimpls - STARPU_MAXIMPLEMENTATIONS;
@@ -1159,3 +1159,13 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 		_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 	}
 }
+
+void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perf_archtype arch, unsigned cpuid, unsigned nimpl, double measured) {
+	struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
+
+	_starpu_load_perfmodel(model);
+	/* Record measurement */
+	_starpu_update_perfmodel_history(job, model, arch, cpuid, measured, nimpl);
+	/* and save perfmodel on termination */
+	_starpu_set_calibrate_flag(1);
+}

+ 17 - 6
src/core/task.c

@@ -114,8 +114,10 @@ void starpu_task_deinit(struct starpu_task *task)
 
 	struct _starpu_job *j = (struct _starpu_job *)task->starpu_private;
 
-	if (j)
+	if (j) {
 		_starpu_job_destroy(j);
+		task->starpu_private = NULL;
+	}
 }
 
 struct starpu_task * __attribute__((malloc)) starpu_task_create(void)
@@ -215,6 +217,9 @@ struct _starpu_job *_starpu_get_job_associated_to_task(struct starpu_task *task)
  * already counted. */
 int _starpu_submit_job(struct _starpu_job *j)
 {
+
+	struct starpu_task *task = j->task;
+
         _STARPU_LOG_IN();
 	/* notify bound computation of a new task */
 	_starpu_bound_record(j);
@@ -232,6 +237,17 @@ int _starpu_submit_job(struct _starpu_job *j)
 	}
 #endif
 
+	/* We retain handle reference count */
+	if (task->cl) {
+		unsigned i;
+		for (i=0; i<task->cl->nbuffers; i++) {
+			starpu_data_handle_t handle = task->handles[i];
+			_starpu_spin_lock(&handle->header_lock);
+			handle->busy_count++;
+			_starpu_spin_unlock(&handle->header_lock);
+		}
+	}
+
 	_STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 
 	/* Need to atomically set submitted to 1 and check dependencies, since
@@ -665,11 +681,6 @@ starpu_drivers_request_termination(void)
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
 }
 
-void _starpu_check_nsubmitted_tasks(void)
-{
-
-}
-
 static void _starpu_increment_nsubmitted_tasks(void)
 {
 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);

+ 10 - 27
src/core/workers.c

@@ -28,6 +28,10 @@
 #include <profiling/profiling.h>
 #include <starpu_task_list.h>
 
+#include <drivers/cpu/driver_cpu.h>
+#include <drivers/cuda/driver_cuda.h>
+#include <drivers/opencl/driver_opencl.h>
+
 #ifdef __MINGW32__
 #include <windows.h>
 #endif
@@ -471,7 +475,11 @@ int starpu_conf_init(struct starpu_conf *conf)
 
 	/* Note that starpu_get_env_number returns -1 in case the variable is
 	 * not defined */
-	conf->ncpus = starpu_get_env_number("STARPU_NCPUS");
+	/* Backward compatibility: check the value of STARPU_NCPUS if
+	 * STARPU_NCPU is not set. */
+	conf->ncpus = starpu_get_env_number("STARPU_NCPU");
+	if (conf->ncpus == -1)
+		conf->ncpus = starpu_get_env_number("STARPU_NCPUS");
 	conf->ncuda = starpu_get_env_number("STARPU_NCUDA");
 	conf->nopencl = starpu_get_env_number("STARPU_NOPENCL");
 	conf->nspus = starpu_get_env_number("STARPU_NGORDON");
@@ -514,6 +522,7 @@ static void _starpu_conf_check_environment(struct starpu_conf *conf)
 	}
 
 	_starpu_conf_set_value_against_environment("STARPU_NCPUS", &conf->ncpus);
+	_starpu_conf_set_value_against_environment("STARPU_NCPU", &conf->ncpus);
 	_starpu_conf_set_value_against_environment("STARPU_NCUDA", &conf->ncuda);
 	_starpu_conf_set_value_against_environment("STARPU_NOPENCL", &conf->nopencl);
 	_starpu_conf_set_value_against_environment("STARPU_NGORDON", &conf->nspus);
@@ -1115,16 +1124,6 @@ struct _starpu_sched_ctx* _starpu_get_initial_sched_ctx(void)
 	return &config.sched_ctxs[0];
 }
 
-#ifdef STARPU_USE_CPU
-extern int _starpu_run_cpu(struct starpu_driver *);
-#endif
-#ifdef STARPU_USE_CUDA
-extern int _starpu_run_cuda(struct starpu_driver *);
-#endif
-#ifdef STARPU_USE_OPENCL
-extern int _starpu_run_opencl(struct starpu_driver *);
-#endif
-
 int
 starpu_driver_run(struct starpu_driver *d)
 {
@@ -1151,22 +1150,6 @@ starpu_driver_run(struct starpu_driver *d)
 	}
 }
 
-#ifdef STARPU_USE_CPU
-extern int _starpu_cpu_driver_init(struct starpu_driver *);
-extern int _starpu_cpu_driver_run_once(struct starpu_driver *);
-extern int _starpu_cpu_driver_deinit(struct starpu_driver *);
-#endif
-#ifdef STARPU_USE_CUDA
-extern int _starpu_cuda_driver_init(struct starpu_driver *);
-extern int _starpu_cuda_driver_run_once(struct starpu_driver *);
-extern int _starpu_cuda_driver_deinit(struct starpu_driver *);
-#endif
-#ifdef STARPU_USE_OPENCL
-extern int _starpu_opencl_driver_init(struct starpu_driver *);
-extern int _starpu_opencl_driver_run_once(struct starpu_driver *);
-extern int _starpu_opencl_driver_deinit(struct starpu_driver *);
-#endif
-
 int
 starpu_driver_init(struct starpu_driver *d)
 {

+ 5 - 1
src/drivers/cpu/driver_cpu.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,5 +29,9 @@
 #include <starpu.h>
 
 void *_starpu_cpu_worker(void *);
+int _starpu_run_cpu(struct starpu_driver *);
+int _starpu_cpu_driver_init(struct starpu_driver *);
+int _starpu_cpu_driver_run_once(struct starpu_driver *);
+int _starpu_cpu_driver_deinit(struct starpu_driver *);
 
 #endif //  __DRIVER_CPU_H__

+ 6 - 1
src/drivers/cuda/driver_cuda.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -44,6 +44,11 @@ unsigned _starpu_get_cuda_device_count(void);
 void _starpu_init_cuda(void);
 void *_starpu_cuda_worker(void *);
 cudaStream_t starpu_cuda_get_local_transfer_stream(void);
+
+int _starpu_run_cuda(struct starpu_driver *);
+int _starpu_cuda_driver_init(struct starpu_driver *);
+int _starpu_cuda_driver_run_once(struct starpu_driver *);
+int _starpu_cuda_driver_deinit(struct starpu_driver *);
 #endif
 
 #endif //  __DRIVER_CUDA_H__

+ 13 - 1
src/drivers/opencl/driver_opencl.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -60,5 +60,17 @@ void _starpu_opencl_init(void);
 extern
 void *_starpu_opencl_worker(void *);
 
+extern
+int _starpu_run_opencl(struct starpu_driver *);
+
+extern
+int _starpu_opencl_driver_init(struct starpu_driver *);
+
+extern
+int _starpu_opencl_driver_run_once(struct starpu_driver *);
+
+extern
+int _starpu_opencl_driver_deinit(struct starpu_driver *);
+
 #endif // STARPU_USE_OPENCL
 #endif //  __DRIVER_OPENCL_H__

+ 6 - 2
src/drivers/opencl/driver_opencl_utils.c

@@ -189,6 +189,7 @@ char *_starpu_opencl_load_program_binary(const char *filename, size_t *len)
 	return binary;
 }
 
+static
 void _starpu_opencl_create_binary_directory(char *path, size_t maxlen)
 {
 	static int _directory_created = 0;
@@ -217,6 +218,7 @@ char *_starpu_opencl_get_device_type_as_string(int id)
 	}
 }
 
+static
 int _starpu_opencl_get_binary_name(char *binary_file_name, size_t maxlen, const char *source_file_name, int dev, cl_device_id device)
 {
 	char binary_directory[1024];
@@ -240,6 +242,7 @@ int _starpu_opencl_get_binary_name(char *binary_file_name, size_t maxlen, const
 	return CL_SUCCESS;
 }
 
+static
 int _starpu_opencl_compile_or_load_opencl_from_string(const char *opencl_program_source, const char* build_options,
 						      struct starpu_opencl_program *opencl_programs, const char* source_file_name)
 {
@@ -262,7 +265,7 @@ int _starpu_opencl_compile_or_load_opencl_from_string(const char *opencl_program
                 starpu_opencl_get_context(dev, &context);
                 if (context == NULL)
 		{
-                        _STARPU_DEBUG("[%d] is not a valid OpenCL context\n", dev);
+                        _STARPU_DEBUG("[%u] is not a valid OpenCL context\n", dev);
                         continue;
                 }
 
@@ -347,6 +350,7 @@ void starpu_opencl_load_program_source(const char *source_file_name, char *locat
 	sprintf(opencl_program_source, "%s", source);
 }
 
+static
 int _starpu_opencl_compile_or_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char* build_options)
 {
 	int nb_devices;
@@ -418,7 +422,7 @@ int starpu_opencl_load_binary_opencl(const char *kernel_id, struct starpu_opencl
                 starpu_opencl_get_context(dev, &context);
                 if (context == NULL)
 		{
-                        _STARPU_DEBUG("[%d] is not a valid OpenCL context\n", dev);
+                        _STARPU_DEBUG("[%u] is not a valid OpenCL context\n", dev);
                         continue;
                 }
 

+ 5 - 1
tests/Makefile.am

@@ -218,7 +218,8 @@ noinst_PROGRAMS =				\
 	parallel_tasks/parallel_kernels		\
 	parallel_tasks/parallel_kernels_spmd	\
 	perfmodels/regression_based		\
-	perfmodels/non_linear_regression_based  \
+	perfmodels/non_linear_regression_based	\
+	perfmodels/feed				\
 	sched_policies/data_locality            \
 	sched_policies/execute_all_tasks        \
 	sched_policies/simple_deps              \
@@ -550,6 +551,9 @@ perfmodels_non_linear_regression_based_SOURCES+=\
 	perfmodels/opencl_memset.c
 endif
 
+perfmodels_feed_SOURCES=\
+	perfmodels/feed.c
+
 sched_policies_execute_all_tasks_LDFLAGS = -lm
 
 showcheck:

+ 85 - 0
tests/perfmodels/feed.c

@@ -0,0 +1,85 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Test the starpu_perfmodel_update_history function
+ */
+
+#include <config.h>
+#include <starpu.h>
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
+#include "../helper.h"
+
+static struct starpu_perfmodel model =
+{
+	.type = STARPU_REGRESSION_BASED,
+	.symbol = "feed"
+};
+
+static struct starpu_perfmodel nl_model =
+{
+	.type = STARPU_NL_REGRESSION_BASED,
+	.symbol = "nlfeed"
+};
+
+static struct starpu_codelet cl =
+{
+	.model = &model,
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+
+int main(int argc, char **argv)
+{
+	struct starpu_task task;
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_task_init(&task);
+	task.cl = &cl;
+
+	int size;
+	for (size = 1024; size < 16777216; size *= 2)
+	{
+		float measured_fast, measured_slow;
+		starpu_data_handle_t handle;
+		starpu_vector_data_register(&handle, -1, 0, size, sizeof(float));
+		task.handles[0] = handle;
+
+		/* Simulate Fast GPU. In real applications this would be
+		 * replaced by fetching from actual measurement */
+		measured_fast = 0.002+size*0.00000001;
+		measured_slow = 0.001+size*0.0000001;
+
+		/* Simulate Fast GPU */
+		starpu_perfmodel_update_history(&model, &task, STARPU_CUDA_DEFAULT, 0, 0, measured_fast);
+		starpu_perfmodel_update_history(&nl_model, &task, STARPU_CUDA_DEFAULT, 0, 0, measured_fast);
+		/* Simulate Slow GPU */
+		starpu_perfmodel_update_history(&model, &task, STARPU_CUDA_DEFAULT + 1, 0, 0, measured_slow);
+		starpu_perfmodel_update_history(&nl_model, &task, STARPU_CUDA_DEFAULT + 1, 0, 0, measured_slow);
+		starpu_task_deinit(&task);
+		starpu_data_unregister(handle);
+	}
+
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+}

+ 1 - 1
tools/dev/starpu_check_undocumented.sh

@@ -56,7 +56,7 @@ done
 
 echo
 
-macros=$(grep "define\b" include/*|grep -v deprecated|grep "#" | grep -v "__" | sed 's/#[ ]*/#/g' | awk '{print $2}' | awk -F'(' '{print $1}' | sort|uniq)
+macros=$(grep "define\b" include/*.h |grep -v deprecated|grep "#" | grep -v "__" | sed 's/#[ ]*/#/g' | awk '{print $2}' | awk -F'(' '{print $1}' | sort|uniq)
 for macro in $macros ; do
     x=$(grep "$macro\b" doc/starpu.texi doc/chapters/*texi | grep defmac)
     if test "$x" == "" ; then

+ 48 - 29
tools/starpu_machine_display.c

@@ -49,7 +49,7 @@ static void display_combined_worker(unsigned workerid)
 		char name[256];
 
 		starpu_worker_get_name(combined_workerid[i], name, 256);
-		
+
 		fprintf(stdout, "%s\t", name);
 	}
 
@@ -72,50 +72,65 @@ static void display_all_combined_workers(void)
 		display_combined_worker(nworkers + i);
 }
 
-static void parse_args(int argc, char **argv)
+static void parse_args(int argc, char **argv, int *force)
 {
+	int i;
+
 	if (argc == 1)
 		return;
 
-	if (argc > 2 || /* Argc should be either 1 or 2 */
-	    strncmp(argv[1], "--help", 6) == 0 ||
-	    strncmp(argv[1], "-h", 2) == 0)
+	for (i = 1; i < argc; i++)
 	{
-		(void) fprintf(stderr, "\
-Show the processing units that StarPU can use, and the \
-bandwitdh measured between the memory nodes.                  \n\
-                                                              \n\
-Usage: %s [OPTION]                                            \n\
-                                                              \n\
-Options:                                                      \n\
-	-h, --help       display this help and exit           \n\
-	-v, --version    output version information and exit  \n\
-                                                              \n\
-Report bugs to <" PACKAGE_BUGREPORT ">.",
+		if (strncmp(argv[i], "--force", 7) == 0 || strncmp(argv[i], "-f", 2) == 0)
+		{
+			*force = 1;
+		}
+		else if (strncmp(argv[i], "--help", 6) == 0 || strncmp(argv[i], "-h", 2) == 0)
+		{
+			(void) fprintf(stderr, "\
+Show the processing units that StarPU can use, and the	      \n	\
+bandwitdh and affinity measured between the memory nodes.     \n	\
+                                                              \n	\
+Usage: %s [OPTION]                                            \n	\
+                                                              \n	\
+Options:                                                      \n	\
+	-h, --help       display this help and exit           \n	\
+	-v, --version    output version information and exit  \n	\
+        -f, --force      force bus sampling                   \n	\
+                                                              \n	\
+Report bugs to <" PACKAGE_BUGREPORT ">.\n",
 PROGNAME);
+			exit(EXIT_FAILURE);
+		}
+		else if (strncmp(argv[i], "--version", 9) == 0 || strncmp(argv[i], "-v", 2) == 0)
+		{
+			(void) fprintf(stderr, "%s %d.%d\n",
+				       PROGNAME, STARPU_MAJOR_VERSION, STARPU_MINOR_VERSION);
+			exit(EXIT_FAILURE);
+		}
+		else
+		{
+			fprintf(stderr, "Unknown arg %s\n", argv[1]);
+			exit(EXIT_FAILURE);
+		}
 	}
-	else if (strncmp(argv[1], "--version", 9) == 0 ||
-		 strncmp(argv[1], "-v", 2) == 0)
-	{
-		(void) fprintf(stderr, "%s %d.%d\n",
-			PROGNAME, STARPU_MAJOR_VERSION, STARPU_MINOR_VERSION);
-	}
-	else
-	{
-		fprintf(stderr, "Unknown arg %s\n", argv[1]);
-	}
-
-	exit(EXIT_FAILURE);
 }
 
 int main(int argc, char **argv)
 {
-	parse_args(argc, argv);
+	int force = 0;
+
+	parse_args(argc, argv, &force);
 
 	/* Even if starpu_init returns -ENODEV, we should go on : we will just
 	 * print that we found no device. */
 	(void) starpu_init(NULL);
 
+	if (force)
+	{
+		starpu_force_bus_sampling();
+	}
+
 	unsigned ncpu = starpu_cpu_worker_get_count();
 	unsigned ncuda = starpu_cuda_worker_get_count();
 	unsigned nopencl = starpu_opencl_worker_get_count();
@@ -133,8 +148,12 @@ int main(int argc, char **argv)
 
 	display_all_combined_workers();
 
+	fprintf(stdout, "\nbandwidth ...\n");
 	starpu_bus_print_bandwidth(stdout);
 
+	fprintf(stdout, "\naffinity ...\n");
+	starpu_bus_print_affinity(stdout);
+
 	starpu_shutdown();
 
 	return 0;