瀏覽代碼

merge trunk

Thibaud Lambert 11 年之前
父節點
當前提交
4666e8726a
共有 100 個文件被更改,包括 4387 次插入353 次删除
  1. 1 0
      AUTHORS
  2. 16 0
      ChangeLog
  3. 1 0
      Makefile.am
  4. 54 38
      configure.ac
  5. 17 2
      doc/doxygen/Makefile.am
  6. 18 4
      doc/doxygen/chapters/advanced_examples.doxy
  7. 4 0
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  8. 4 4
      doc/doxygen/chapters/api/data_interfaces.doxy
  9. 48 0
      doc/doxygen/chapters/api/data_out_of_core.doxy
  10. 2 2
      doc/doxygen/chapters/api/data_partition.doxy
  11. 16 0
      doc/doxygen/chapters/api/mpi.doxy
  12. 176 91
      doc/doxygen/chapters/basic_examples.doxy
  13. 5 2
      doc/doxygen/chapters/building.doxy
  14. 179 0
      doc/doxygen/chapters/code/disk_compute.c
  15. 122 0
      doc/doxygen/chapters/code/disk_copy.c
  16. 1 1
      doc/doxygen/chapters/code/multiformat.c
  17. 2 2
      doc/doxygen/chapters/code/vector_scal_c.c
  18. 7 7
      doc/doxygen/chapters/configure_options.doxy
  19. 5 5
      doc/doxygen/chapters/environment_variables.doxy
  20. 1 0
      doc/doxygen/chapters/files.doxy
  21. 2 1
      doc/doxygen/chapters/introduction.doxy
  22. 3 3
      doc/doxygen/chapters/mpi_support.doxy
  23. 17 13
      doc/doxygen/chapters/optimize_performance.doxy
  24. 56 0
      doc/doxygen/chapters/out_of_core.doxy
  25. 7 0
      doc/doxygen/chapters/performance_feedback.doxy
  26. 1138 0
      doc/doxygen/chapters/starpu_non_linear_memset_regression_based.eps
  27. 二進制
      doc/doxygen/chapters/starpu_non_linear_memset_regression_based.pdf
  28. 二進制
      doc/doxygen/chapters/starpu_non_linear_memset_regression_based.png
  29. 1036 0
      doc/doxygen/chapters/starpu_starpu_slu_lu_model_11.eps
  30. 二進制
      doc/doxygen/chapters/starpu_starpu_slu_lu_model_11.pdf
  31. 二進制
      doc/doxygen/chapters/starpu_starpu_slu_lu_model_11.png
  32. 1253 0
      doc/doxygen/chapters/tasks_size_overhead.eps
  33. 二進制
      doc/doxygen/chapters/tasks_size_overhead.pdf
  34. 二進制
      doc/doxygen/chapters/tasks_size_overhead.png
  35. 2 0
      doc/doxygen/doxygen-config.cfg.in
  36. 2 1
      doc/doxygen/doxygen.cfg
  37. 9 0
      doc/doxygen/refman.tex
  38. 3 3
      doc/texinfo/chapters/advanced-examples.texi
  39. 6 6
      doc/texinfo/chapters/api.texi
  40. 2 2
      doc/texinfo/chapters/basic-examples.texi
  41. 3 3
      doc/texinfo/chapters/configuration.texi
  42. 3 3
      doc/texinfo/chapters/mpi-support.texi
  43. 3 3
      doc/texinfo/chapters/vector_scal_c.texi
  44. 2 2
      doc/tutorial/vector_scal.c
  45. 2 2
      examples/audio/starpu_audio_processing.c
  46. 4 4
      examples/axpy/axpy.c
  47. 1 1
      examples/basic_examples/block.c
  48. 1 1
      examples/basic_examples/dynamic_handles.c
  49. 6 6
      examples/basic_examples/mult.c
  50. 1 1
      examples/basic_examples/multiformat.c
  51. 1 1
      examples/basic_examples/variable.c
  52. 2 2
      examples/basic_examples/vector_scal.c
  53. 3 3
      examples/basic_examples/vector_scal_c.c
  54. 1 1
      examples/binary/binary.c
  55. 1 1
      examples/callback/callback.c
  56. 14 14
      examples/cg/cg.c
  57. 3 3
      examples/cholesky/cholesky_grain_tag.c
  58. 2 2
      examples/cholesky/cholesky_implicit.c
  59. 3 3
      examples/cholesky/cholesky_tag.c
  60. 1 1
      examples/cholesky/cholesky_tile_tag.c
  61. 1 1
      examples/cpp/incrementer_cpp.cpp
  62. 3 3
      examples/filters/custom_mf/custom_interface.c
  63. 2 2
      examples/filters/custom_mf/custom_mf_filter.c
  64. 2 2
      examples/filters/fblock.c
  65. 2 2
      examples/filters/fmatrix.c
  66. 2 2
      examples/filters/fvector.c
  67. 4 4
      examples/filters/shadow.c
  68. 4 4
      examples/filters/shadow2d.c
  69. 4 4
      examples/filters/shadow3d.c
  70. 2 2
      examples/heat/dw_factolu.c
  71. 3 3
      examples/heat/dw_factolu_grain.c
  72. 3 3
      examples/heat/dw_factolu_tag.c
  73. 6 6
      examples/heat/dw_sparse_cg.c
  74. 1 1
      examples/incrementer/incrementer.c
  75. 2 2
      examples/interface/complex.c
  76. 3 3
      examples/interface/complex_interface.c
  77. 2 1
      examples/lu/lu_example.c
  78. 2 2
      examples/lu/xlu.c
  79. 2 2
      examples/lu/xlu_implicit.c
  80. 3 3
      examples/lu/xlu_implicit_pivot.c
  81. 3 3
      examples/lu/xlu_pivot.c
  82. 1 1
      examples/mandelbrot/mandelbrot.c
  83. 3 3
      examples/matvecmult/matvecmult.c
  84. 6 6
      examples/mult/xgemm.c
  85. 1 1
      examples/openmp/vector_scal_omp.c
  86. 3 3
      examples/pi/pi.c
  87. 1 1
      examples/pi/pi_redux.c
  88. 6 6
      examples/ppm_downscaler/yuv_downscaler.c
  89. 3 3
      examples/reductions/dot_product.c
  90. 2 2
      examples/reductions/minmax_reduction.c
  91. 1 1
      examples/spmd/vector_scal_spmd.c
  92. 7 7
      examples/spmv/dw_block_spmv.c
  93. 5 5
      examples/spmv/spmv.c
  94. 1 1
      examples/stencil/stencil-blocks.c
  95. 1 1
      gcc-plugin/tests/output-pointer.c
  96. 2 2
      gcc-plugin/tests/pointers.c
  97. 1 0
      include/starpu.h
  98. 4 0
      include/starpu_data.h
  99. 11 11
      include/starpu_data_interfaces.h
  100. 0 0
      include/starpu_deprecated_api.h

+ 1 - 0
AUTHORS

@@ -15,6 +15,7 @@ Damien Pasqualinotto <dam.pasqualinotto@wanadoo.fr>
 Nguyen Quôc-Dinh <nguyen.quocdinh@gmail.com>
 Cyril Roelandt <cyril.roelandt@inria.fr>
 Anthony Roy <theanthony33@gmail.com>
+Corentin Salingue <corentin.salingue@gmail.com>
 Ludovic Stordeur <ludovic.stordeur@inria.fr>
 François Tessier <francois.tessier@inria.fr>
 Samuel Thibault <samuel.thibault@labri.fr>

+ 16 - 0
ChangeLog

@@ -32,6 +32,14 @@ New features:
 	  the envelope.
   * New STARPU_COMMUTE flag which can be passed along STARPU_W or STARPU_RW to
     let starpu commute write accesses.
+  * Out-of-core support, through registration of disk areas as additional memory
+    nodes.
+  * StarPU-MPI: new function
+    starpu_mpi_irecv_detached_sequential_consistency which allows to
+    enable or disable the sequential consistency for the given data
+    handle (sequential consistency will be enabled or disabled based
+    on the value of the function parameter and the value of the
+    sequential consistency defined for the given data)
 
 Small features:
   * Add cl_arg_free field to enable automatic free(cl_arg) on task
@@ -39,7 +47,15 @@ Small features:
   * New functions starpu_data_acquire_cb_sequential_consistency() and
     starpu_data_acquire_on_node_cb_sequential_consistency() which allows
     to enable or disable sequential consistency
+  * New configure option --enable-fxt-lock which enables additional
+    trace events focused on locks behaviour during the execution
 
+Changes:
+  * Fix of the livelock issue discovered while executing applications
+    on a CPU+GPU cluster of machines by adding a maximum trylock 
+    threshold before a blocking lock.
+  * Data interfaces (variable, vector, matrix and block) now define
+    pack und unpack functions
 
 StarPU 1.1.0 (svn revision xxxx)
 ==============================================

+ 1 - 0
Makefile.am

@@ -79,6 +79,7 @@ versinclude_HEADERS = 				\
 	include/starpu_deprecated_api.h         \
 	include/starpu_hash.h			\
 	include/starpu_rand.h			\
+	include/starpu_disk.h			\
 	include/starpu_cublas.h			\
 	include/starpu_driver.h			\
 	include/starpu_stdlib.h			\

+ 54 - 38
configure.ac

@@ -134,8 +134,13 @@ case "$target" in
   libext=a
   AC_DEFINE(STARPU_HAVE_WINDOWS, [], [Define this on windows.])
   ;;
+*-*-linux*)
+  starpu_linux=yes
+  AC_DEFINE(STARPU_LINUX_SYS, 1, [Define to 1 on Linux])
+  ;;
 esac
 AM_CONDITIONAL([STARPU_HAVE_WINDOWS], [test "x$starpu_windows" = "xyes"])
+AM_CONDITIONAL([STARPU_LINUX_SYS], [test "x$starpu_linux" = "xyes"])
 
 # on Darwin, GCC targets i386 by default, so we don't have atomic ops
 AC_CHECK_SIZEOF([void *])
@@ -959,13 +964,13 @@ AC_MSG_RESULT($nmaxmicdev)
 AC_DEFINE_UNQUOTED(STARPU_MAXMICDEVS, [$nmaxmicdev],
 	[maximum number of MIC devices])
 
-AC_MSG_CHECKING(maximum number of MIC cores)
-AC_ARG_ENABLE(maxmicdev, [AS_HELP_STRING([--enable-maxmiccore=<number>],
-			[maximum number of MIC cores])],
-			nmaxmiccore=$enableval, nmaxmiccore=128)
-AC_MSG_RESULT($nmaxmiccore)
+AC_MSG_CHECKING(maximum number of MIC threads)
+AC_ARG_ENABLE(maxmicthreads, [AS_HELP_STRING([--enable-maxmicthreads=<number>],
+			[maximum number of MIC threads])],
+			nmaxmicthreads=$enableval, nmaxmicthreads=128)
+AC_MSG_RESULT($nmaxmicthread)
 
-AC_DEFINE_UNQUOTED(STARPU_MAXMICCORES, [$nmaxmiccore],
+AC_DEFINE_UNQUOTED(STARPU_MAXMICCORES, [$nmaxmicthreads],
 	[maximum number of MIC cores])
 
 AC_ARG_WITH(coi-dir,
@@ -1379,38 +1384,48 @@ AC_MSG_RESULT($nmaxbuffers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 		[how many buffers can be manipulated per task])
 
-# TODO: add option to choose maxnodes
-if test x$enable_simgrid = xyes ; then
-	# We still need the room for the virtual CUDA/OpenCL devices
-	maxnodes=16
-else
-	# We have one memory node shared by all CPU workers, one node per GPU
-	# and per MIC device
-	nodes=1
-	if test x$enable_cuda = xyes ; then
-		# we could have used nmaxcudadev + 1, but this would certainly give an
-		# odd number.
-		nodes=`expr $nodes + $nmaxcudadev`
-	fi
-	if test x$enable_opencl = xyes ; then
-		# we could have used nmaxcudadev + 1, but this would certainly give an
-		# odd number.
-		nodes=`expr $nodes + $nmaxopencldev`
-	fi
-	if test x$enable_mic = xyes ; then
-		nodes=`expr $nodes + $nmaxmicdev`
-	fi
-	if test x$enable_rcce = xyes ; then
-		# Only 1 memory node for the shared memory.
-		nodes=`expr $nodes + 1`
-	fi
+AC_MSG_CHECKING(maximum number of nodes to use)
+AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
+			[maximum number of nodes])],
+			maxnodes=$enableval, maxnodes=0)
 
-	# set maxnodes to the next power of 2 greater than nodes
-	maxnodes=1
-	while test "$maxnodes" -lt "$nodes"
-	do
-		maxnodes=`expr $maxnodes \* 2`
-	done
+if test x$maxnodes = x0 ; then
+	if test x$enable_simgrid = xyes ; then
+		# We still need the room for the virtual CUDA/OpenCL devices
+		maxnodes=16
+	else
+		# We have one memory node shared by all CPU workers, one node per GPU
+		# and per MIC device
+		# we add nodes to use 4 memory disks
+		nodes=5
+		if test x$enable_cuda = xyes ; then
+			# we could have used nmaxcudadev + 1, but this would certainly give an
+			# odd number.
+			nodes=`expr $nodes + $nmaxcudadev`
+		fi
+		if test x$enable_opencl = xyes ; then
+			# we could have used nmaxcudadev + 1, but this would certainly give an
+			# odd number.
+			nodes=`expr $nodes + $nmaxopencldev`
+		fi
+		if test x$enable_mic = xyes ; then
+			nodes=`expr $nodes + $nmaxmicdev`
+		fi
+		if test x$enable_rcce = xyes ; then
+			# Only 1 memory node for the shared memory.
+			nodes=`expr $nodes + 1`
+		fi
+
+		# set maxnodes to the next power of 2 greater than nodes
+		maxnodes=1
+		while test "$maxnodes" -lt "$nodes"
+		do
+			maxnodes=`expr $maxnodes \* 2`
+		done
+ 	fi
+fi
+if test $maxnodes -gt 32 ; then
+	AC_MSG_ERROR([selected number of nodes ($maxnodes) can not be greater than 32])
 fi
 
 AC_MSG_CHECKING(maximum number of memory nodes)
@@ -1456,7 +1471,7 @@ AC_CHECK_FUNCS([clock_gettime])
 
 # Compute the maximum number of workers (we round it to 16 for alignment
 # purposes).
-nmaxworkers=`expr 16 \* \( \( $maxcpus + $nmaxcudadev + $nmaxopencldev + $nmaxmiccore + $nmaxsccdev + 15 \) / 16 \) `
+nmaxworkers=`expr 16 \* \( \( $maxcpus + $nmaxcudadev + $nmaxopencldev + $nmaxmicthreads + $nmaxsccdev + 15 \) / 16 \) `
 AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_RESULT($nmaxworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
@@ -2273,6 +2288,7 @@ AC_MSG_NOTICE([
 	Maximum number of CPUs:           $maxcpus
 	Maximum number of CUDA devices:   $nmaxcudadev
 	Maximum number of OpenCL devices: $nmaxopencldev
+	Maximum number of MIC threads:    $nmaxmicthreads
 	Maximum number of memory nodes:   $maxnodes
 	Maximum number of task buffers:   $nmaxbuffers
 

+ 17 - 2
doc/doxygen/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009, 2011  Université de Bordeaux 1
+# Copyright (C) 2009, 2011, 2013  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 #
 # Permission is granted to copy, distribute and/or modify this document
@@ -35,6 +35,7 @@ chapters =	\
 	chapters/performance_feedback.doxy \
 	chapters/scheduling_context_hypervisor.doxy \
 	chapters/scheduling_contexts.doxy \
+	chapters/out_of_core.doxy \
 	chapters/socl_opencl_extensions.doxy \
 	chapters/tips_and_tricks.doxy \
 	chapters/environment_variables.doxy \
@@ -58,11 +59,14 @@ chapters =	\
 	chapters/code/vector_scal_cuda.cu \
 	chapters/code/vector_scal_opencl.c \
 	chapters/code/vector_scal_opencl_codelet.cl \
+	chapters/code/disk_copy.c \
+	chapters/code/disk_compute.c \
 	chapters/api/codelet_and_tasks.doxy \
 	chapters/api/cuda_extensions.doxy \
 	chapters/api/data_interfaces.doxy \
 	chapters/api/data_management.doxy \
 	chapters/api/data_partition.doxy \
+	chapters/api/data_out_of_core.doxy \
 	chapters/api/expert_mode.doxy \
 	chapters/api/explicit_dependencies.doxy \
 	chapters/api/fft_support.doxy \
@@ -105,7 +109,7 @@ chapters/version.sty: $(chapters)
 	@if test -s timestamp_updated ; then \
 		echo "\newcommand{\STARPUUPDATED}{"`cat timestamp_updated`"}" > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
 	else \
-		echo "\newcommand{\STARPUUPDATED}{unknown_date}" > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
+		echo "\newcommand{\STARPUUPDATED}{unknown date}" > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
 	fi
 	@echo "\newcommand{\STARPUVERSION}{$(VERSION)}" >> $(top_srcdir)/doc/doxygen/chapters/version.sty
 	@-for f in timestamp timestamp_updated timestamp_updated_month ; do \
@@ -134,6 +138,15 @@ EXTRA_DIST	= 		\
 	$(chapters) 		\
 	chapters/version.sty	\
 	chapters/version.html	\
+	chapters/tasks_size_overhead.png	\
+	chapters/tasks_size_overhead.eps	\
+	chapters/tasks_size_overhead.pdf	\
+	chapters/starpu_non_linear_memset_regression_based.png	\
+	chapters/starpu_non_linear_memset_regression_based.eps	\
+	chapters/starpu_non_linear_memset_regression_based.pdf	\
+	chapters/starpu_starpu_slu_lu_model_11.png	\
+	chapters/starpu_starpu_slu_lu_model_11.eps	\
+	chapters/starpu_starpu_slu_lu_model_11.pdf	\
 	doxygen.cfg 		\
 	refman.tex
 
@@ -145,6 +158,7 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/include/starpu.h			\
 	$(top_srcdir)/include/starpu_data_filters.h	\
 	$(top_srcdir)/include/starpu_data_interfaces.h	\
+	$(top_srcdir)/include/starpu_disk.h		\
 	$(top_srcdir)/include/starpu_worker.h		\
 	$(top_srcdir)/include/starpu_task.h		\
 	$(top_srcdir)/include/starpu_task_bundle.h	\
@@ -189,6 +203,7 @@ dist_pdf_DATA = $(DOX_PDF)
 
 $(DOX_PDF): $(DOX_TAG) refman.tex
 	cp $(top_srcdir)/doc/doxygen/chapters/version.sty $(DOX_LATEX_DIR)
+	cp $(top_srcdir)/doc/doxygen/chapters/*pdf $(DOX_LATEX_DIR)
 	cd $(DOX_LATEX_DIR); \
 	rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \
 	sed -i -e 's/__env__/\\_Environment Variables!/' -e 's/\\-\\_\\-\\-\\_\\-env\\-\\_\\-\\-\\_\\-//' ExecutionConfigurationThroughEnvironmentVariables.tex ;\

+ 18 - 4
doc/doxygen/chapters/advanced_examples.doxy

@@ -201,7 +201,7 @@ int vector[NX];
 starpu_data_handle_t handle;
 
 /* Declare data to StarPU */
-starpu_vector_data_register(&handle, 0, (uintptr_t)vector,
+starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector,
                             NX, sizeof(vector[0]));
 
 /* Partition the vector in PARTS sub-vectors */
@@ -390,10 +390,25 @@ starpu_perfmodel::size_base however permits the application to
 override that, when for instance some of the data do not matter for
 task cost (e.g. mere reference table), or when using sparse
 structures (in which case it is the number of non-zeros which matter), or when
-there is some hidden parameter such as the number of iterations, etc.
+there is some hidden parameter such as the number of iterations, or when the application
+actually has a very good idea of the complexity of the algorithm, and just not
+the speed of the processor, etc.
 The example in the directory <c>examples/pi</c> uses this to include
 the number of iterations in the base.
 
+StarPU will automatically determine when the performance model is calibrated,
+or rather, it will assume the performance model is calibrated until the
+application submits a task for which the performance can not be predicted. For
+::STARPU_HISTORY_BASED, StarPU will require 10 (::_STARPU_CALIBRATION_MINIMUM)
+measurements for a given size before estimating that an average can be taken as
+estimation for further executions with the same size. For
+::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED, StarPU will require
+10 (::_STARPU_CALIBRATION_MINIMUM) measurements, and that the minimum measured
+data size is smaller than 90% of the maximum measured data size (i.e. the
+measurement interval is large enough for a regression to have a meaning).
+Calibration can also be forced by setting the \ref STARPU_CALIBRATE environment
+variable to <c>1</c>, or even reset by setting it to <c>2</c>.
+
 How to use schedulers which can benefit from such performance model is explained
 in \ref TaskSchedulingPolicy.
 
@@ -1106,8 +1121,7 @@ Complex data interfaces can then be registered to StarPU.
 
 \code{.c}
 double real = 45.0;
-double imaginary = 12.0;
-starpu_complex_data_register(&handle1, 0, &real, &imaginary, 1);
+double imaginary = 12.0;starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
 starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
 \endcode
 

+ 4 - 0
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -82,6 +82,10 @@ specify the codelet may be executed on a MIC processing unit.
 This macro is used when setting the field starpu_codelet::where to
 specify the codelet may be executed on an SCC processing unit.
 
+\def STARPU_MAIN_RAM
+\ingroup API_Codelet_And_Tasks
+This macro is used when the RAM memory node is specified.
+
 \def STARPU_MULTIPLE_CPU_IMPLEMENTATIONS
 \deprecated
 \ingroup API_Codelet_And_Tasks

+ 4 - 4
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -212,7 +212,7 @@ Here an example of how to use the function.
 \code{.c}
 float var;
 starpu_data_handle_t var_handle;
-starpu_variable_data_register(&var_handle, 0, (uintptr_t)&var, sizeof(var));
+starpu_variable_data_register(&var_handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
 \endcode
 
 \fn void starpu_vector_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, uint32_t nx, size_t elemsize)
@@ -223,7 +223,7 @@ Here an example of how to use the function.
 \code{.c}
 float vector[NX];
 starpu_data_handle_t vector_handle;
-starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
+starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
 \endcode
 
 \fn void starpu_matrix_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, uint32_t ld, uint32_t nx, uint32_t ny, size_t elemsize)
@@ -238,7 +238,7 @@ Here an example of how to use the function.
 float *matrix;
 starpu_data_handle_t matrix_handle;
 matrix = (float*)malloc(width * height * sizeof(float));
-starpu_matrix_data_register(&matrix_handle, 0, (uintptr_t)matrix, width, width, height, sizeof(float));
+starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix, width, width, height, sizeof(float));
 \endcode
 
 \fn void starpu_block_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx, uint32_t ny, uint32_t nz, size_t elemsize)
@@ -252,7 +252,7 @@ Here an example of how to use the function.
 float *block;
 starpu_data_handle_t block_handle;
 block = (float*)malloc(nx*ny*nz*sizeof(float));
-starpu_block_data_register(&block_handle, 0, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
+starpu_block_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
 \endcode
 
 \fn void starpu_bcsr_data_register(starpu_data_handle_t *handle, unsigned home_node, uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, uint32_t r, uint32_t c, size_t elemsize)

+ 48 - 0
doc/doxygen/chapters/api/data_out_of_core.doxy

@@ -0,0 +1,48 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2013 Corentin Salingue
+ * See the file version.doxy for copying conditions.
+ */
+
+
+/*! \defgroup API_Out_Of_Core Out Of Core
+
+
+
+\struct starpu_disk_ops
+\ingroup API_Out_Of_Core
+This is a set of functions to manipulate datas on disk.
+
+\fn int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, size_t size) 
+\ingroup API_Out_Of_Core
+Register a disk memory node with a set of functions to manipulate datas. <br />
+SUCCESS: return the disk node. <br />
+FAIL: return an error code. <br />
+The \p size must be at least 1 MB !
+
+\fn void * starpu_disk_open (unsigned node, void *pos, size_t size)
+\ingroup API_Out_Of_Core
+Add an existing file memory in a disk node. The \p pos is defined in the starpu_disk_ops. \p size: this is a size of your file.
+\p pos is the name of the file.
+
+\fn void starpu_disk_close (unsigned node, void *obj, size_t size)
+\ingroup API_Out_Of_Core
+Close an existing file memory opened with starpu_disk_open.
+
+\var starpu_disk_stdio_ops
+\ingroup API_Out_Of_Core
+This set uses the stdio library (fwrite, fread...) to read/write on disk. <br />
+<strong>Warning: It creates one file per allocation !</strong>  <br />
+
+\var starpu_disk_unistd_ops
+\ingroup API_Out_Of_Core
+This set uses the unistd library (write, read...) to read/write on disk. <br />
+<strong>Warning: It creates one file per allocation !</strong>  <br />
+
+\var starpu_disk_unistd_o_direct_ops
+\ingroup API_Out_Of_Core
+This set uses the unistd library (write, read...) to read/write on disk with the O_DIRECT flag. <br />
+<strong>Warning: It creates one file per allocation !</strong>  <br />
+Only available on Linux.
+
+*/

+ 2 - 2
doc/doxygen/chapters/api/data_partition.doxy

@@ -54,12 +54,12 @@ starpu_data_partition(A_handle, &f);
 \ingroup API_Data_Partition
 This unapplies one filter, thus unpartitioning the data. The
 pieces of data are collected back into one big piece in the
-\p gathering_node (usually 0). Tasks working on the partitioned data must
+\p gathering_node (usually STARPU_MAIN_RAM). Tasks working on the partitioned data must
 be already finished when calling starpu_data_unpartition().
 
 Here an example of how to use the function.
 \code{.c}
-starpu_data_unpartition(A_handle, 0);
+starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
 \endcode
 
 \fn int starpu_data_get_nb_children(starpu_data_handle_t handle)

+ 16 - 0
doc/doxygen/chapters/api/mpi.doxy

@@ -98,6 +98,22 @@ communication completes, its resources are automatically released back
 to the system, there is no need to test or to wait for the completion
 of the request.
 
+\fn int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg, int sequential_consistency)
+\ingroup API_MPI_Support
+Posts a nonblocking receive in \p data_handle from the node \p source
+using the message tag \p mpi_tag within the communicator \p comm. On
+completion, the \p callback function is called with the argument \p
+arg.
+The parameter \p sequential_consistency allows to enable or disable
+the sequential consistency for \p data handle (sequential consistency
+will be enabled or disabled based on the value of the parameter \p
+sequential_consistency and the value of the sequential consistency
+defined for \p data_handle).
+Similarly to the pthread detached functionality, when a detached
+communication completes, its resources are automatically released back
+to the system, there is no need to test or to wait for the completion
+of the request.
+
 \fn int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status)
 \ingroup API_MPI_Support
 Returns when the operation identified by request \p req is complete.

+ 176 - 91
doc/doxygen/chapters/basic_examples.doxy

@@ -60,64 +60,143 @@ The header starpu.h should be included in any code using StarPU.
 
 \subsection DefiningACodelet Defining A Codelet
 
+A codelet is a structure that represents a computational kernel. Such a codelet
+may contain an implementation of the same kernel on different architectures
+(e.g. CUDA, x86, ...). For compatibility, make sure that the whole
+structure is properly initialized to zero, either by using the
+function starpu_codelet_init(), or by letting the
+compiler implicitly do it as examplified above.
+
+The field starpu_codelet::nbuffers specifies the number of data buffers that are
+manipulated by the codelet: here the codelet does not access or modify any data
+that is controlled by our data management library.
+
+We create a codelet which may only be executed on the CPUs. When a CPU
+core will execute a codelet, it will call the function
+<c>cpu_func</c>, which \em must have the following prototype:
+
+\code{.c}
+void (*cpu_func)(void *buffers[], void *cl_arg);
+\endcode
+
+In this example, we can ignore the first argument of this function which gives a
+description of the input and output buffers (e.g. the size and the location of
+the matrices) since there is none. We also ignore the second argument
+which is a pointer to optional arguments for the codelet.
+
 \code{.c}
-struct params
-{
-    int i;
-    float f;
-};
 void cpu_func(void *buffers[], void *cl_arg)
 {
-    struct params *params = cl_arg;
-
-    printf("Hello world (params = {%i, %f} )\n", params->i, params->f);
+    printf("Hello world\n");
 }
 
 struct starpu_codelet cl =
 {
-    .where = STARPU_CPU,
     .cpu_funcs = { cpu_func, NULL },
-    .cpu_funcs_name = { "cpu_func", NULL },
     .nbuffers = 0
 };
 \endcode
 
-A codelet is a structure that represents a computational kernel. Such a codelet
-may contain an implementation of the same kernel on different architectures
-(e.g. CUDA, x86, ...). For compatibility, make sure that the whole
-structure is properly initialized to zero, either by using the
-function starpu_codelet_init(), or by letting the
-compiler implicitly do it as examplified above.
+\subsection SubmittingATask Submitting A Task
 
-The field starpu_codelet::nbuffers specifies the number of data buffers that are
-manipulated by the codelet: here the codelet does not access or modify any data
-that is controlled by our data management library. Note that the argument
-passed to the codelet (the parameter <c>cl_arg</c> of the function
-<c>cpu_func</c>) does not count as a buffer since it is not managed by
-our data management library, but just contain trivial parameters.
+Before submitting any tasks to StarPU, starpu_init() must be called. The
+<c>NULL</c> argument specifies that we use the default configuration. Tasks cannot
+be submitted after the termination of StarPU by a call to
+starpu_shutdown().
+
+In the example above, a task structure is allocated by a call to
+starpu_task_create(). This function only allocates and fills the
+corresponding structure with the default settings, but it does not
+submit the task to StarPU.
 
 \internal
-TODO need a crossref to the proper description of "where" see bla for more ...
+not really clear ;)
 \endinternal
 
-We create a codelet which may only be executed on the CPUs. The field
-starpu_codelet::where is a bitmask that defines where the codelet may
-be executed. Here, the value ::STARPU_CPU means that only CPUs can
-execute this codelet. Note that field starpu_codelet::where is
-optional, when unset its value is automatically set based on the
-availability of the different fields <c>XXX_funcs</c>.
-When a CPU core executes a codelet, it calls the function
-<c>cpu_func</c>, which \em must have the following prototype:
+The field starpu_task::cl is a pointer to the codelet which the task will
+execute: in other words, the codelet structure describes which computational
+kernel should be offloaded on the different architectures, and the task
+structure is a wrapper containing a codelet and the piece of data on which the
+codelet should operate.
+
+If the field starpu_task::synchronous is non-zero, task submission
+will be synchronous: the function starpu_task_submit() will not return
+until the task has been executed. Note that the function starpu_shutdown()
+does not guarantee that asynchronous tasks have been executed before
+it returns, starpu_task_wait_for_all() can be used to that effect, or
+data can be unregistered (starpu_data_unregister()), which will
+implicitly wait for all the tasks scheduled to work on it, unless
+explicitly disabled thanks to
+starpu_data_set_default_sequential_consistency_flag() or
+starpu_data_set_sequential_consistency_flag().
 
 \code{.c}
-void (*cpu_func)(void *buffers[], void *cl_arg);
+int main(int argc, char **argv)
+{
+    /* initialize StarPU */
+    starpu_init(NULL);
+
+    struct starpu_task *task = starpu_task_create();
+
+    task->cl = &cl; /* Pointer to the codelet defined above */
+
+    /* starpu_task_submit will be a blocking call. If unset,
+    starpu_task_wait() needs to be called after submitting the task. */
+    task->synchronous = 1;
+
+    /* submit the task to StarPU */
+    starpu_task_submit(task);
+
+    /* terminate StarPU */
+    starpu_shutdown();
+
+    return 0;
+}
 \endcode
 
-In this example, we can ignore the first argument of this function which gives a
-description of the input and output buffers (e.g. the size and the location of
-the matrices) since there is none.
-The second argument is a pointer to a buffer passed as an
-argument to the codelet by the means of the field starpu_task::cl_arg.
+\subsection ExecutionOfHelloWorld Execution Of Hello World
+
+\verbatim
+$ make hello_world
+cc $(pkg-config --cflags starpu-1.2)  $(pkg-config --libs starpu-1.2) hello_world.c -o hello_world
+$ ./hello_world
+Hello world
+\endverbatim
+
+\subsection PassingArgumentsToTheCodelet Passing Arguments To The Codelet
+
+The optional field starpu_task::cl_arg field is a pointer to a buffer
+(of size starpu_task::cl_arg_size) with some parameters for the kernel
+described by the codelet. For instance, if a codelet implements a
+computational kernel that multiplies its input vector by a constant,
+the constant could be specified by the means of this buffer, instead
+of registering it as a StarPU data. It must however be noted that
+StarPU avoids making copy whenever possible and rather passes the
+pointer as such, so the buffer which is pointed at must be kept allocated
+until the task terminates, and if several tasks are submitted with
+various parameters, each of them must be given a pointer to their
+own buffer.
+
+\code{.c}
+struct params
+{
+    int i;
+    float f;
+};
+
+void cpu_func(void *buffers[], void *cl_arg)
+{
+    struct params *params = cl_arg;
+
+    printf("Hello world (params = {%i, %f} )\n", params->i, params->f);
+}
+\endcode
+
+As said before, the field starpu_codelet::nbuffers specifies the
+number of data buffers that are manipulated by the codelet. It does
+not count the argument --- the parameter <c>cl_arg</c> of the function
+<c>cpu_func</c> --- since it is not managed by our data management
+library, but just contains trivial parameters.
 
 \internal
 TODO rewrite so that it is a little clearer ?
@@ -130,14 +209,7 @@ buffer will be modified as well: this for instance implies that the buffer
 cannot be used as a synchronization medium. If synchronization is needed, data
 has to be registered to StarPU, see \ref VectorScalingUsingStarPUAPI.
 
-\subsection SubmittingATask Submitting A Task
-
 \code{.c}
-void callback_func(void *callback_arg)
-{
-    printf("Callback function (arg %x)\n", callback_arg);
-}
-
 int main(int argc, char **argv)
 {
     /* initialize StarPU */
@@ -151,9 +223,6 @@ int main(int argc, char **argv)
     task->cl_arg = &params;
     task->cl_arg_size = sizeof(params);
 
-    task->callback_func = callback_func;
-    task->callback_arg = 0x42;
-
     /* starpu_task_submit will be a blocking call */
     task->synchronous = 1;
 
@@ -167,37 +236,14 @@ int main(int argc, char **argv)
 }
 \endcode
 
-Before submitting any tasks to StarPU, starpu_init() must be called. The
-<c>NULL</c> argument specifies that we use the default configuration. Tasks cannot
-be submitted after the termination of StarPU by a call to
-starpu_shutdown().
-
-In the example above, a task structure is allocated by a call to
-starpu_task_create(). This function only allocates and fills the
-corresponding structure with the default settings, but it does not
-submit the task to StarPU.
-
-\internal
-not really clear ;)
-\endinternal
+\verbatim
+$ make hello_world
+cc $(pkg-config --cflags starpu-1.2)  $(pkg-config --libs starpu-1.2) hello_world.c -o hello_world
+$ ./hello_world
+Hello world (params = {1, 2.000000} )
+\endverbatim
 
-The field starpu_task::cl is a pointer to the codelet which the task will
-execute: in other words, the codelet structure describes which computational
-kernel should be offloaded on the different architectures, and the task
-structure is a wrapper containing a codelet and the piece of data on which the
-codelet should operate.
-
-The optional field starpu_task::cl_arg field is a pointer to a buffer
-(of size starpu_task::cl_arg_size) with some parameters for the kernel
-described by the codelet. For instance, if a codelet implements a
-computational kernel that multiplies its input vector by a constant,
-the constant could be specified by the means of this buffer, instead
-of registering it as a StarPU data. It must however be noted that
-StarPU avoids making copy whenever possible and rather passes the
-pointer as such, so the buffer which is pointed at must be kept allocated
-until the task terminates, and if several tasks are submitted with
-various parameters, each of them must be given a pointer to their
-own buffer.
+\subsection DefiningACallback Defining A Callback
 
 Once a task has been executed, an optional callback function
 starpu_task::callback_func is called when defined.
@@ -210,27 +256,66 @@ function. The prototype of a callback function must be:
 void (*callback_function)(void *);
 \endcode
 
-If the field starpu_task::synchronous is non-zero, task submission
-will be synchronous: the function starpu_task_submit() will not return
-until the task has been executed. Note that the function starpu_shutdown()
-does not guarantee that asynchronous tasks have been executed before
-it returns, starpu_task_wait_for_all() can be used to that effect, or
-data can be unregistered (starpu_data_unregister()), which will
-implicitly wait for all the tasks scheduled to work on it, unless
-explicitly disabled thanks to
-starpu_data_set_default_sequential_consistency_flag() or
-starpu_data_set_sequential_consistency_flag().
+\code{.c}
+void callback_func(void *callback_arg)
+{
+    printf("Callback function (arg %x)\n", callback_arg);
+}
 
-\subsection ExecutionOfHelloWorld Execution Of Hello World
+int main(int argc, char **argv)
+{
+    /* initialize StarPU */
+    starpu_init(NULL);
+
+    struct starpu_task *task = starpu_task_create();
+
+    task->cl = &cl; /* Pointer to the codelet defined above */
+
+    task->callback_func = callback_func;
+    task->callback_arg = 0x42;
+
+    /* starpu_task_submit will be a blocking call */
+    task->synchronous = 1;
+
+    /* submit the task to StarPU */
+    starpu_task_submit(task);
+
+    /* terminate StarPU */
+    starpu_shutdown();
+
+    return 0;
+}
+\endcode
 
 \verbatim
 $ make hello_world
 cc $(pkg-config --cflags starpu-1.2)  $(pkg-config --libs starpu-1.2) hello_world.c -o hello_world
 $ ./hello_world
-Hello world (params = {1, 2.000000} )
+Hello world
 Callback function (arg 42)
 \endverbatim
 
+\subsection WhereToExecuteACodelet Where To Execute A Codelet
+
+\code{.c}
+struct starpu_codelet cl =
+{
+    .where = STARPU_CPU,
+    .cpu_funcs = { cpu_func, NULL },
+    .cpu_funcs_name = { "cpu_func", NULL },
+     .nbuffers = 0
+};
+\endcode
+
+We create a codelet which may only be executed on the CPUs. The
+optional field starpu_codelet::where is a bitmask that defines where
+the codelet may be executed. Here, the value ::STARPU_CPU means that
+only CPUs can execute this codelet. When the optional field
+starpu_codelet::where is unset, its value is automatically set based
+on the availability of the different fields <c>XXX_funcs</c>.
+
+TODO: explain starpu_codelet::cpu_funcs_name
+
 \section VectorScalingUsingTheCExtension Vector Scaling Using the C Extension
 
 The previous example has shown how to submit tasks. In this section,
@@ -444,14 +529,14 @@ The following lines show how to declare an array of <c>NX</c> elements of type
 float vector[NX];
 
 starpu_data_handle_t vector_handle;
-starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
+starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX,
                             sizeof(vector[0]));
 \endcode
 
 The first argument, called the <b>data handle</b>, is an opaque pointer which
 designates the array in StarPU. This is also the structure which is used to
 describe which data is used by a task. The second argument is the node number
-where the data originally resides. Here it is 0 since the array <c>vector</c> is in
+where the data originally resides. Here it is STARPU_MAIN_RAM since the array <c>vector</c> is in
 the main memory. Then comes the pointer <c>vector</c> where the data can be found in main memory,
 the number of elements in the vector and the size of each element.
 The following shows how to construct a StarPU task that will manipulate the

+ 5 - 2
doc/doxygen/chapters/building.doxy

@@ -266,11 +266,14 @@ schedulers, for instance <c>STARPU_SCHED=dmda</c>.
 
 \subsection TaskSizeOverhead Task Size Overhead
 
-This benchmark gives a glimpse into how big a size should be for StarPU overhead
-to be low enough.  Run <c>tasks_size_overhead.sh</c>, it will generate a plot
+This benchmark gives a glimpse into how long a task should be (in µs) for StarPU overhead
+to be low enough to keep efficiency.  Run <c>tasks_size_overhead.sh</c>, it will generate a plot
 of the speedup of tasks of various sizes, depending on the number of CPUs being
 used.
 
+\image html tasks_size_overhead.png
+\image latex tasks_size_overhead.eps "" width=\textwidth
+
 \subsection DataTransferLatency Data Transfer Latency
 
 <c>local_pingpong</c> performs a ping-pong between the first two CUDA nodes, and

+ 179 - 0
doc/doxygen/chapters/code/disk_compute.c

@@ -0,0 +1,179 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013 Corentin Salingue
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+//! [To be included]
+/* Try to write into disk memory
+ * Use mechanism to push datas from main ram to disk ram
+ */
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <math.h>
+
+#define NX (100)
+
+int main(int argc, char **argv)
+{
+	/* Initialize StarPU with default configuration */
+	int ret = starpu_init(NULL);
+
+	if (ret == -ENODEV) goto enodev;
+
+	/* Initialize path and name */
+	char pid_str[16];
+	int pid = getpid();
+	snprintf(pid_str, 16, "%d", pid);
+
+	char * base = "/tmp/";
+
+	char * name_file_start = malloc(128*sizeof(char));
+	strcpy(name_file_start, "STARPU_DISK_COMPUTE_DATA_");
+	strcat(name_file_start, pid_str);
+
+	char * name_file_end = malloc(128*sizeof(char));
+	strcpy(name_file_end, "STARPU_DISK_COMPUTE_DATA_RESULT_");
+	strcat(name_file_end, pid_str);
+
+	char * path_file_start = malloc(128*sizeof(char));
+	strcpy(path_file_start, base);
+	strcat(path_file_start, name_file_start);
+
+	char * path_file_end = malloc(128*sizeof(char));
+	strcpy(path_file_end, base);
+	strcat(path_file_end, name_file_end);
+
+
+	/* register a disk */
+	int new_dd = starpu_disk_register(&starpu_disk_stdio_ops, (void *) base, 1024*1024*1);
+	/* can't write on /tmp/ */
+	if (new_dd == -ENOENT) goto enoent;
+	
+	unsigned dd = (unsigned) new_dd;
+
+	printf("TEST DISK MEMORY \n");
+
+	/* Imagine, you want to compute datas */
+	int *A;
+	int *C;
+
+	starpu_malloc_flags((void **)&A, NX*sizeof(int), STARPU_MALLOC_COUNT);
+	starpu_malloc_flags((void **)&C, NX*sizeof(int), STARPU_MALLOC_COUNT);
+ 
+	unsigned int j;
+	/* you register them in a vector */
+	for(j = 0; j < NX; ++j)
+	{
+		A[j] = j;
+		C[j] = 0;
+	}
+
+
+
+
+	/* you create a file to store the vector ON the disk */
+	FILE * f = fopen(path_file_start, "wb+");
+	if (f == NULL)
+		goto enoent;
+
+	/* store it in the file */
+	fwrite(A, sizeof(int), NX, f);
+
+	/* close the file */
+	fclose(f);
+
+
+	/* create a file to store result */
+	f = fopen(path_file_end, "wb+");
+	if (f == NULL)
+		goto enoent;
+
+	/* replace all datas by 0 */
+	fwrite(C, sizeof(int), NX, f);
+
+	/* close the file */
+	fclose(f);
+
+	/* And now, you want to use your datas in StarPU */
+	/* Open the file ON the disk */
+	void * data = starpu_disk_open(dd, (void *) name_file_start, NX*sizeof(int));
+	void * data_result = starpu_disk_open(dd, (void *) name_file_end, NX*sizeof(int));
+
+
+	starpu_data_handle_t vector_handleA, vector_handleC;
+
+	/* register vector in starpu */
+	starpu_vector_data_register(&vector_handleA, dd, (uintptr_t) data, NX, sizeof(int));
+
+	/* and do what you want with it, here we copy it into an other vector */ 
+	starpu_vector_data_register(&vector_handleC, dd, (uintptr_t) data_result, NX, sizeof(int));	
+
+	starpu_data_cpy(vector_handleC, vector_handleA, 0, NULL, NULL);
+
+	/* free them */
+	starpu_data_unregister(vector_handleA);
+	starpu_data_unregister(vector_handleC);
+
+	/* close them in StarPU */
+	starpu_disk_close(dd, data, NX*sizeof(int));
+	starpu_disk_close(dd, data_result, NX*sizeof(int));
+
+	/* check results */	
+	f = fopen(path_file_end, "rb+");
+	if (f == NULL)
+		goto enoent;
+	/* take datas */
+	int size = fread(C, sizeof(int), NX, f);
+
+	/* close the file */
+	fclose(f);
+
+	int try = 1;
+	for (j = 0; j < NX; ++j)
+		if (A[j] != C[j])
+		{
+			printf("Fail A %d != C %d \n", A[j], C[j]);
+			try = 0;
+		}
+
+	starpu_free_flags(A, NX*sizeof(double), STARPU_MALLOC_COUNT);
+	starpu_free_flags(C, NX*sizeof(double), STARPU_MALLOC_COUNT);
+
+	unlink(path_file_start);
+	unlink(path_file_end);
+
+	free(name_file_start);
+	free(name_file_end);
+	free(path_file_start);
+	free(path_file_end);
+
+	/* terminate StarPU, no task can be submitted after */
+	starpu_shutdown();
+
+	if(try)
+		printf("TEST SUCCESS\n");
+	else
+		printf("TEST FAIL\n");
+	return (try ? EXIT_SUCCESS : EXIT_FAILURE);
+
+enodev:
+	return 77;
+enoent:
+	return 77;
+}
+//! [To be included]
+

+ 122 - 0
doc/doxygen/chapters/code/disk_copy.c

@@ -0,0 +1,122 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013 Corentin Salingue
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+//! [To be included]
+
+/* Try to write into disk memory
+ * Use mechanism to push datas from main ram to disk ram
+ */
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+/* size of one vector */
+#define	NX	(30*1000000/sizeof(double))
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+
+int main(int argc, char **argv)
+{
+	double * A,*B,*C,*D,*E,*F;
+
+	/* limit main ram to force to push in disk */
+	putenv("STARPU_LIMIT_CPU_MEM=160");
+
+	/* Initialize StarPU with default configuration */
+	int ret = starpu_init(NULL);
+
+	if (ret == -ENODEV) goto enodev;
+
+	/* register a disk */
+	int new_dd = starpu_disk_register(&starpu_disk_stdio_ops, (void *) "/tmp/", 1024*1024*200);
+	/* can't write on /tmp/ */
+	if (new_dd == -ENOENT) goto enoent;
+	
+	unsigned dd = (unsigned) new_dd;
+
+	/* allocate two memory spaces */
+	starpu_malloc_flags((void **)&A, NX*sizeof(double), STARPU_MALLOC_COUNT);
+	starpu_malloc_flags((void **)&F, NX*sizeof(double), STARPU_MALLOC_COUNT);
+
+	FPRINTF(stderr, "TEST DISK MEMORY \n");
+
+	unsigned int j;
+	/* initialization with bad values */
+	for(j = 0; j < NX; ++j)
+	{
+		A[j] = j;
+		F[j] = -j;
+	}
+
+	starpu_data_handle_t vector_handleA, vector_handleB, vector_handleC, vector_handleD, vector_handleE, vector_handleF;
+
+	/* register vector in starpu */
+	starpu_vector_data_register(&vector_handleA, STARPU_MAIN_RAM, (uintptr_t)A, NX, sizeof(double));
+	starpu_vector_data_register(&vector_handleB, -1, (uintptr_t) NULL, NX, sizeof(double));	
+	starpu_vector_data_register(&vector_handleC, -1, (uintptr_t) NULL, NX, sizeof(double));
+	starpu_vector_data_register(&vector_handleD, -1, (uintptr_t) NULL, NX, sizeof(double));
+	starpu_vector_data_register(&vector_handleE, -1, (uintptr_t) NULL, NX, sizeof(double));
+	starpu_vector_data_register(&vector_handleF, STARPU_MAIN_RAM, (uintptr_t)F, NX, sizeof(double));
+
+	/* copy vector A->B, B->C... */
+	starpu_data_cpy(vector_handleB, vector_handleA, 0, NULL, NULL);
+	starpu_data_cpy(vector_handleC, vector_handleB, 0, NULL, NULL);
+	starpu_data_cpy(vector_handleD, vector_handleC, 0, NULL, NULL);
+	starpu_data_cpy(vector_handleE, vector_handleD, 0, NULL, NULL);
+	starpu_data_cpy(vector_handleF, vector_handleE, 0, NULL, NULL);
+
+	/* StarPU does not need to manipulate the array anymore so we can stop
+ 	 * monitoring it */
+
+	/* free them */
+	starpu_data_unregister(vector_handleA);
+	starpu_data_unregister(vector_handleB);
+	starpu_data_unregister(vector_handleC);
+	starpu_data_unregister(vector_handleD);
+	starpu_data_unregister(vector_handleE);
+	starpu_data_unregister(vector_handleF);
+
+	/* check if computation is correct */
+	int try = 1;
+	for (j = 0; j < NX; ++j)
+		if (A[j] != F[j])
+		{
+			printf("Fail A %f != F %f \n", A[j], F[j]);
+			try = 0;
+		}
+
+	/* free last vectors */
+	starpu_free_flags(A, NX*sizeof(double), STARPU_MALLOC_COUNT);
+	starpu_free_flags(F, NX*sizeof(double), STARPU_MALLOC_COUNT);
+
+	/* terminate StarPU, no task can be submitted after */
+	starpu_shutdown();
+
+	if(try)
+		FPRINTF(stderr, "TEST SUCCESS\n");
+	else
+		FPRINTF(stderr, "TEST FAIL\n");
+	return (try ? EXIT_SUCCESS : EXIT_FAILURE);
+
+enodev:
+	return 77;
+enoent:
+	return 77;
+}
+
+//! [To be included]

+ 1 - 1
doc/doxygen/chapters/code/multiformat.c

@@ -57,5 +57,5 @@ struct starpu_multiformat_data_interface_ops format_ops = {
     ...
 };
 
-starpu_multiformat_data_register(handle, 0, &array_of_structs, NX, &format_ops);
+starpu_multiformat_data_register(handle, STARPU_MAIN_RAM, &array_of_structs, NX, &format_ops);
 //! [To be included]

+ 2 - 2
doc/doxygen/chapters/code/vector_scal_c.c

@@ -79,14 +79,14 @@ int main(int argc, char **argv)
      *  - the first argument of the registration method is a pointer to the
      *    handle that should describe the data
      *  - the second argument is the memory node where the data (ie. "vector")
-     *    resides initially: 0 stands for an address in main memory, as
+     *    resides initially: STARPU_MAIN_RAM stands for an address in main memory, as
      *    opposed to an adress on a GPU for instance.
      *  - the third argument is the adress of the vector in RAM
      *  - the fourth argument is the number of elements in the vector
      *  - the fifth argument is the size of each element.
      */
     starpu_data_handle_t vector_handle;
-    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
+    starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector,
                                 NX, sizeof(vector[0]));
 
     float factor = 3.14;

+ 7 - 7
doc/doxygen/chapters/configure_options.doxy

@@ -22,13 +22,6 @@ the following configure options.
 Enable debugging messages.
 </dd>
 
-<dt>--enable-debug</dt>
-<dd>
-\anchor enable-debug
-\addindex __configure__--enable-debug
-Enable debugging messages.
-</dd>
-
 <dt>--enable-fast</dt>
 <dd>
 \anchor enable-fast
@@ -363,6 +356,13 @@ Enable performance debugging through gprof.
 Enable performance model debugging.
 </dd>
 
+<dt>--enable-fxt-lock</dt>
+<dd>
+\anchor enable-fxt-lock
+\addindex __configure__--enable-fxt-lock
+Enable additional trace events which describes locks behaviour.
+</dd>
+
 <dt>--enable-stats</dt>
 <dd>
 \anchor enable-stats

+ 5 - 5
doc/doxygen/chapters/environment_variables.doxy

@@ -217,12 +217,12 @@ it is therefore necessary to disable asynchronous data transfers.
 Disable asynchronous copies between CPU and MIC devices.
 </dd>
 
-<dt>STARPU_DISABLE_CUDA_GPU_GPU_DIRECT</dt>
+<dt>STARPU_ENABLE_CUDA_GPU_GPU_DIRECT</dt>
 <dd>
-\anchor STARPU_DISABLE_CUDA_GPU_GPU_DIRECT
-\addindex __env__STARPU_DISABLE_CUDA_GPU_GPU_DIRECT
-Disable direct CUDA transfers from GPU to GPU, and let CUDA copy through RAM
-instead. This permits to test the performance effect of GPU-Direct.
+\anchor STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
+\addindex __env__STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
+Enable direct CUDA transfers from GPU to GPU, without copying through RAM.
+This permits to test the performance effect of GPU-Direct.
 </dd>
 
 </dl>

+ 1 - 0
doc/doxygen/chapters/files.doxy

@@ -12,6 +12,7 @@
 \file starpu.h
 \file starpu_data_filters.h
 \file starpu_data_interfaces.h
+\file starpu_disk.h
 \file starpu_worker.h
 \file starpu_task.h
 \file starpu_task_bundle.h

+ 2 - 1
doc/doxygen/chapters/introduction.doxy

@@ -145,7 +145,7 @@ simply replace calling the function with submitting a task.
 A \b codelet records pointers to various implementations of the same
 theoretical function.
 
-A <b>memory node</b> can be either the main RAM or GPU-embedded memory.
+A <b>memory node</b> can be either the main RAM, GPU-embedded memory or a disk memory.
 
 A \b bus is a link between memory nodes.
 
@@ -213,6 +213,7 @@ The documentation chapters include
 <li> \ref HowToOptimizePerformanceWithStarPU
 <li> \ref PerformanceFeedback
 <li> \ref TipsAndTricksToKnowAbout
+<li> \ref OutOfCore
 <li> \ref MPISupport
 <li> \ref FFTSupport
 <li> \ref MICSCCSupport

+ 3 - 3
doc/doxygen/chapters/mpi_support.doxy

@@ -52,7 +52,7 @@ int main(int argc, char **argv)
     starpu_init(NULL);
     starpu_mpi_initialize_extended(&rank, &size);
 
-    starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+    starpu_vector_data_register(&token_handle, STARPU_MAIN_RAM, (uintptr_t)&token, 1, sizeof(unsigned));
 
     unsigned nloops = NITER;
     unsigned loop;
@@ -273,7 +273,7 @@ data which will be needed by the tasks that we will execute.
             int mpi_rank = my_distrib(x, y, size);
              if (mpi_rank == my_rank)
                 /* Owning data */
-                starpu_variable_data_register(&data_handles[x][y], 0,
+                starpu_variable_data_register(&data_handles[x][y], STARPU_MAIN_RAM,
                                               (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
             else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
                   || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
@@ -339,7 +339,7 @@ for(x = 0; x < nblocks ;  x++)
 {
     int mpi_rank = my_distrib(x, nodes);
     if (rank == root) {
-        starpu_vector_data_register(&data_handles[x], 0, (uintptr_t)vector[x],
+        starpu_vector_data_register(&data_handles[x], STARPU_MAIN_RAM, (uintptr_t)vector[x],
                                     blocks_size, sizeof(float));
     }
     else if ((mpi_rank == rank) || ((rank == mpi_rank+1 || rank == mpi_rank-1))) {

+ 17 - 13
doc/doxygen/chapters/optimize_performance.doxy

@@ -198,28 +198,32 @@ option lists the available performance models, and the <c>-s</c> option permits
 to choose the performance model to be displayed. The result looks like:
 
 \verbatim
-$ starpu_perfmodel_display -s starpu_dlu_lu_model_22
-performance model for cpu
-# hash    size     mean          dev           n
-880805ba  98304    2.731309e+02  6.010210e+01  1240
-b50b6605  393216   1.469926e+03  1.088828e+02  1240
-5c6c3401  1572864  1.125983e+04  3.265296e+03  1240
+$ starpu_perfmodel_display -s starpu_slu_lu_model_11
+performance model for cpu_impl_0
+# hash    size     flops         mean          dev           n
+914f3bef  1048576  0.000000e+00  2.503577e+04  1.982465e+02  8
+3e921964  65536    0.000000e+00  5.527003e+02  1.848114e+01  7
+e5a07e31  4096     0.000000e+00  1.717457e+01  5.190038e+00  14
+...
 \endverbatim
 
-Which shows that for the LU 22 kernel with a 1.5MiB matrix, the average
-execution time on CPUs was about 11ms, with a 3ms standard deviation, over
-1240 samples. It is a good idea to check this before doing actual performance
+Which shows that for the LU 11 kernel with a 1MiB matrix, the average
+execution time on CPUs was about 25ms, with a 0.2ms standard deviation, over
+8 samples. It is a good idea to check this before doing actual performance
 measurements.
 
 A graph can be drawn by using the tool <c>starpu_perfmodel_plot</c>:
 
 \verbatim
-$ starpu_perfmodel_plot -s starpu_dlu_lu_model_22
-98304 393216 1572864
-$ gnuplot starpu_starpu_dlu_lu_model_22.gp
-$ gv starpu_starpu_dlu_lu_model_22.eps
+$ starpu_perfmodel_plot -s starpu_slu_lu_model_11
+4096 16384 65536 262144 1048576 4194304 
+$ gnuplot starpu_starpu_slu_lu_model_11.gp
+$ gv starpu_starpu_slu_lu_model_11.eps
 \endverbatim
 
+\image html starpu_starpu_slu_lu_model_11.png
+\image latex starpu_starpu_slu_lu_model_11.eps "" width=\textwidth
+
 If a kernel source code was modified (e.g. performance improvement), the
 calibration information is stale and should be dropped, to re-calibrate from
 start. This can be done by using <c>export STARPU_CALIBRATE=2</c>.

+ 56 - 0
doc/doxygen/chapters/out_of_core.doxy

@@ -0,0 +1,56 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2013 Corentin Salingue
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page OutOfCore Out Of Core
+
+\section Introduction Introduction
+
+When using StarPU, one may need to store more data than what the main memory
+(RAM) can store. This part describes the method to add a new memory node on a
+disk and to use it.
+
+\section UseANewDiskMemory Use a new disk memory
+
+To use a disk memory node, you have to register it with this function:
+
+\code{.c}
+	int new_dd = starpu_disk_register(&starpu_disk_stdio_ops, (void *) "/tmp/", 1024*1024*200);
+\endcode
+
+Here, we use the stdio library to realize the read/write operations, i.e.
+fread/fwrite. This structure must have a path where to store files, as well as
+the maximum size the software can afford storing on the disk.
+
+Don't forget to check if the result is correct!
+
+When the register function is called, StarPU will benchmark the disk. This can
+take some time.
+
+<strong>Warning: the size thus has to be at least 1 MB!</strong> 
+
+StarPU will automatically try to evict unused data to this new disk. One can
+also use the standard StarPU node API, see the \ref API_Standard_Memory_Library
+and the \ref API_Data_Interfaces .
+
+The disk is unregistered during the starpu_shutdown().
+
+\section DiskFunctions Disk functions
+
+There are various ways to operate a disk memory node, described by the structure
+starpu_disk_ops. For instance, the variable #starpu_disk_stdio_ops
+uses fread/fwrite functions.
+
+All structures are in \ref API_Out_Of_Core .
+
+\section ExampleDiskCopy Examples: disk_copy
+
+\snippet disk_copy.c To be included
+
+\section ExampleDiskCompute Examples: disk_compute
+
+\snippet disk_compute.c To be included
+
+*/

+ 7 - 0
doc/doxygen/chapters/performance_feedback.doxy

@@ -253,6 +253,10 @@ starpu_shutdown(). The trace is a binary file whose name has the form
 <c>/tmp/</c> directory by default, or by the directory specified by
 the environment variable \ref STARPU_FXT_PREFIX.
 
+The additional configure option \ref enable-fxt-lock "--enable-fxt-lock" can 
+be used to generate trace events which describes the locks behaviour during 
+the execution.
+
 \subsection CreatingAGanttDiagram Creating a Gantt Diagram
 
 When the FxT trace file <c>filename</c> has been generated, it is possible to
@@ -416,6 +420,9 @@ The tool <c>starpu_perfmodel_plot</c> can be used to draw performance
 models. It writes a <c>.gp</c> file in the current directory, to be
 run with the tool <c>gnuplot</c>, which shows the corresponding curve.
 
+\image html starpu_non_linear_memset_regression_based.png
+\image latex starpu_non_linear_memset_regression_based.eps "" width=\textwidth
+
 When the field starpu_task::flops is set, <c>starpu_perfmodel_plot</c> can
 directly draw a GFlops curve, by simply adding the <c>-f</c> option:
 

文件差異過大導致無法顯示
+ 1138 - 0
doc/doxygen/chapters/starpu_non_linear_memset_regression_based.eps


二進制
doc/doxygen/chapters/starpu_non_linear_memset_regression_based.pdf


二進制
doc/doxygen/chapters/starpu_non_linear_memset_regression_based.png


文件差異過大導致無法顯示
+ 1036 - 0
doc/doxygen/chapters/starpu_starpu_slu_lu_model_11.eps


二進制
doc/doxygen/chapters/starpu_starpu_slu_lu_model_11.pdf


二進制
doc/doxygen/chapters/starpu_starpu_slu_lu_model_11.png


文件差異過大導致無法顯示
+ 1253 - 0
doc/doxygen/chapters/tasks_size_overhead.eps


二進制
doc/doxygen/chapters/tasks_size_overhead.pdf


二進制
doc/doxygen/chapters/tasks_size_overhead.png


+ 2 - 0
doc/doxygen/doxygen-config.cfg.in

@@ -25,6 +25,7 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 			 @top_srcdir@/include/starpu_data_filters.h \
 			 @top_srcdir@/include/starpu_data.h \
 			 @top_srcdir@/include/starpu_data_interfaces.h \
+			 @top_srcdir@/include/starpu_disk.h \
 			 @top_srcdir@/include/starpu_deprecated_api.h \
 			 @top_srcdir@/include/starpu_driver.h \
 			 @top_srcdir@/include/starpu_expert.h \
@@ -62,3 +63,4 @@ INPUT_FILTER           = @top_builddir@/doc/doxygen/doxygen_filter.sh
 
 LATEX_HEADER           = @top_srcdir@/doc/doxygen/refman.tex
 
+IMAGE_PATH             = @top_srcdir@/doc/doxygen/chapters

+ 2 - 1
doc/doxygen/doxygen.cfg

@@ -774,7 +774,8 @@ EXAMPLE_RECURSIVE      = NO
 # directories that contain image that are included in the documentation (see
 # the \image command).
 
-IMAGE_PATH             =
+# From @INCLUDE, above
+#IMAGE_PATH             =
 
 # The INPUT_FILTER tag can be used to specify a program that doxygen should
 # invoke to filter for each input file. Doxygen will invoke the filter program

+ 9 - 0
doc/doxygen/refman.tex

@@ -129,6 +129,13 @@ Documentation License”.
 \hypertarget{TipsAndTricksToKnowAbout}{}
 \input{TipsAndTricksToKnowAbout}
 
+\chapter{Out Of Core}
+\label{OutOfCore}
+\hypertarget{OutOfCore}{}
+\input{OutOfCore}
+
+
+
 \chapter{MPI Support}
 \label{MPISupport}
 \hypertarget{MPISupport}{}
@@ -190,6 +197,7 @@ Documentation License”.
 \input{group__API__Data__Management}
 \input{group__API__Data__Interfaces}
 \input{group__API__Data__Partition}
+\input{group__API__Out__Of__Core}
 \input{group__API__Multiformat__Data__Interface}
 \input{group__API__Codelet__And__Tasks}
 \input{group__API__Insert__Task}
@@ -232,6 +240,7 @@ Documentation License”.
 \input{starpu__data__filters_8h}
 \input{starpu__data__interfaces_8h}
 \input{starpu__deprecated__api_8h}
+\input{starpu__disk_8h}
 \input{starpu__driver_8h}
 \input{starpu__expert_8h}
 \input{starpu__fxt_8h}

+ 3 - 3
doc/texinfo/chapters/advanced-examples.texi

@@ -235,7 +235,7 @@ int vector[NX];
 starpu_data_handle_t handle;
 
 /* Declare data to StarPU */
-starpu_vector_data_register(&handle, 0, (uintptr_t)vector,
+starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector,
                             NX, sizeof(vector[0]));
 
 /* Partition the vector in PARTS sub-vectors */
@@ -1002,7 +1002,7 @@ struct starpu_multiformat_data_interface_ops format_ops = @{
     .cpu_elemsize = 2 * sizeof(float),
     ...
 @};
-starpu_multiformat_data_register(handle, 0, &array_of_structs, NX, &format_ops);
+starpu_multiformat_data_register(handle, STARPU_MAIN_RAM, &array_of_structs, NX, &format_ops);
 @end smallexample
 @end cartouche
 
@@ -1248,7 +1248,7 @@ Complex data interfaces can then be registered to StarPU.
 @smallexample
 double real = 45.0;
 double imaginary = 12.0;
-starpu_complex_data_register(&handle1, 0, &real, &imaginary, 1);
+starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
 starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
 @end smallexample
 @end cartouche

+ 6 - 6
doc/texinfo/chapters/api.texi

@@ -783,7 +783,7 @@ item.
 @smallexample
 float var;
 starpu_data_handle_t var_handle;
-starpu_variable_data_register(&var_handle, 0, (uintptr_t)&var, sizeof(var));
+starpu_variable_data_register(&var_handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
 @end smallexample
 @end cartouche
 @end deftypefun
@@ -796,7 +796,7 @@ Register the @var{nx} @var{elemsize}-byte elements pointed to by
 @smallexample
 float vector[NX];
 starpu_data_handle_t vector_handle;
-starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
+starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX,
                             sizeof(vector[0]));
 @end smallexample
 @end cartouche
@@ -814,7 +814,7 @@ alignment purposes.
 float *matrix;
 starpu_data_handle_t matrix_handle;
 matrix = (float*)malloc(width * height * sizeof(float));
-starpu_matrix_data_register(&matrix_handle, 0, (uintptr_t)matrix,
+starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix,
                             width, width, height, sizeof(float));
 @end smallexample
 @end cartouche
@@ -831,7 +831,7 @@ between rows and between z planes.
 float *block;
 starpu_data_handle_t block_handle;
 block = (float*)malloc(nx*ny*nz*sizeof(float));
-starpu_block_data_register(&block_handle, 0, (uintptr_t)block,
+starpu_block_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block,
                            nx, nx*ny, nx, ny, nz, sizeof(float));
 @end smallexample
 @end cartouche
@@ -1584,11 +1584,11 @@ starpu_data_partition(A_handle, &f);
 
 @deftypefun void starpu_data_unpartition (starpu_data_handle_t @var{root_data}, unsigned @var{gathering_node})
 This unapplies one filter, thus unpartitioning the data. The pieces of data are
-collected back into one big piece in the @var{gathering_node} (usually 0). Tasks
+collected back into one big piece in the @var{gathering_node} (usually STARPU_MAIN_RAM). Tasks
 working on the partitioned data must be already finished when calling @code{starpu_data_unpartition}.
 @cartouche
 @smallexample
-starpu_data_unpartition(A_handle, 0);
+starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
 @end smallexample
 @end cartouche
 @end deftypefun

+ 2 - 2
doc/texinfo/chapters/basic-examples.texi

@@ -584,7 +584,7 @@ The following lines show how to declare an array of @code{NX} elements of type
 float vector[NX];
 
 starpu_data_handle_t vector_handle;
-starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
+starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX,
                             sizeof(vector[0]));
 @end smallexample
 @end cartouche
@@ -860,7 +860,7 @@ int main(int argc, char **argv)
 @cartouche
 @smallexample
     /* @b{Registering data within StarPU} */
-    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
+    starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector,
                                 NX, sizeof(vector[0]));
 
     /* @b{Definition of the task} */

+ 3 - 3
doc/texinfo/chapters/configuration.texi

@@ -477,9 +477,9 @@ it is therefore necessary to disable asynchronous data transfers.
 Disable asynchronous copies between CPU and MIC devices.
 @end defvr
 
-@defvr {Environment variable} STARPU_DISABLE_CUDA_GPU_GPU_DIRECT
-Disable direct CUDA transfers from GPU to GPU, and let CUDA copy through RAM
-instead. This permits to test the performance effect of GPU-Direct.
+@defvr {Environment variable} STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
+Enable direct CUDA transfers from GPU to GPU, without copying through RAM.
+This permits to test the performance effect of GPU-Direct.
 @end defvr
 
 @node Scheduling

+ 3 - 3
doc/texinfo/chapters/mpi-support.texi

@@ -64,7 +64,7 @@ int main(int argc, char **argv)
     starpu_init(NULL);
     starpu_mpi_initialize_extended(&rank, &size);
 
-    starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+    starpu_vector_data_register(&token_handle, STARPU_MAIN_RAM, (uintptr_t)&token, 1, sizeof(unsigned));
 
     unsigned nloops = NITER;
     unsigned loop;
@@ -310,7 +310,7 @@ data which will be needed by the tasks that we will execute.
             int mpi_rank = my_distrib(x, y, size);
              if (mpi_rank == my_rank)
                 /* Owning data */
-                starpu_variable_data_register(&data_handles[x][y], 0,
+                starpu_variable_data_register(&data_handles[x][y], STARPU_MAIN_RAM,
                                               (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
             else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
                   || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
@@ -381,7 +381,7 @@ for(x = 0; x < nblocks ;  x++)
 @{
     int mpi_rank = my_distrib(x, nodes);
     if (rank == root) @{
-        starpu_vector_data_register(&data_handles[x], 0, (uintptr_t)vector[x],
+        starpu_vector_data_register(&data_handles[x], STARPU_MAIN_RAM, (uintptr_t)vector[x],
                                     blocks_size, sizeof(float));
     @}
     else if ((mpi_rank == rank) || ((rank == mpi_rank+1 || rank == mpi_rank-1))) @{

+ 3 - 3
doc/texinfo/chapters/vector_scal_c.texi

@@ -2,7 +2,7 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009-2011, 2013  Université de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 @c See the file starpu.texi for copying conditions.
 
 @smallexample
@@ -69,14 +69,14 @@ int main(int argc, char **argv)
      *  - the first argument of the registration method is a pointer to the
      *    handle that should describe the data
      *  - the second argument is the memory node where the data (ie. "vector")
-     *    resides initially: 0 stands for an address in main memory, as
+     *    resides initially: STARPU_MAIN_RAM stands for an address in main memory, as
      *    opposed to an adress on a GPU for instance.
      *  - the third argument is the adress of the vector in RAM
      *  - the fourth argument is the number of elements in the vector
      *  - the fifth argument is the size of each element.
      */
     starpu_data_handle_t vector_handle;
-    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
+    starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector,
                                 NX, sizeof(vector[0]));
 
     float factor = 3.14;

+ 2 - 2
doc/tutorial/vector_scal.c

@@ -75,14 +75,14 @@ int main(int argc, char **argv)
 	 *  - the first argument of the registration method is a pointer to the
 	 *    handle that should describe the data
 	 *  - the second argument is the memory node where the data (ie. "vector")
-	 *    resides initially: 0 stands for an address in main memory, as
+	 *    resides initially: STARPU_MAIN_RAM stands for an address in main memory, as
 	 *    opposed to an adress on a GPU for instance.
 	 *  - the third argument is the adress of the vector in RAM
 	 *  - the fourth argument is the number of elements in the vector
 	 *  - the fifth argument is the size of each element.
 	 */
 	starpu_data_handle_t vector_handle;
-	starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
+	starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector,
 				    NX, sizeof(vector[0]));
 
 	float factor = 3.14;

+ 2 - 2
examples/audio/starpu_audio_processing.c

@@ -413,7 +413,7 @@ int main(int argc, char **argv)
 
 	starpu_cublas_init();
 
-	starpu_vector_data_register(&A_handle, 0, (uintptr_t)A, niter*nsamples, sizeof(float));
+	starpu_vector_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A, niter*nsamples, sizeof(float));
 
 	struct starpu_data_filter f =
 	{
@@ -458,7 +458,7 @@ int main(int argc, char **argv)
 		fprintf(stderr, "Writing output data\n");
 
 	/* make sure that the output is in RAM before quitting StarPU */
-	starpu_data_unpartition(A_handle, 0);
+	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
 	starpu_data_unregister(A_handle);
 
 	starpu_cublas_shutdown();

+ 4 - 4
examples/axpy/axpy.c

@@ -152,8 +152,8 @@ int main(int argc, char **argv)
 	FPRINTF(stderr, "BEFORE y[0] = %2.2f\n", _vec_y[0]);
 
 	/* Declare the data to StarPU */
-	starpu_vector_data_register(&_handle_x, 0, (uintptr_t)_vec_x, N, sizeof(TYPE));
-	starpu_vector_data_register(&_handle_y, 0, (uintptr_t)_vec_y, N, sizeof(TYPE));
+	starpu_vector_data_register(&_handle_x, STARPU_MAIN_RAM, (uintptr_t)_vec_x, N, sizeof(TYPE));
+	starpu_vector_data_register(&_handle_y, STARPU_MAIN_RAM, (uintptr_t)_vec_y, N, sizeof(TYPE));
 
 	/* Divide the vector into blocks */
 	struct starpu_data_filter block_filter =
@@ -194,8 +194,8 @@ int main(int argc, char **argv)
 	starpu_task_wait_for_all();
 
 enodev:
-	starpu_data_unpartition(_handle_x, 0);
-	starpu_data_unpartition(_handle_y, 0);
+	starpu_data_unpartition(_handle_x, STARPU_MAIN_RAM);
+	starpu_data_unpartition(_handle_y, STARPU_MAIN_RAM);
 	starpu_data_unregister(_handle_x);
 	starpu_data_unregister(_handle_y);
 

+ 1 - 1
examples/basic_examples/block.c

@@ -37,7 +37,7 @@ int execute_on(uint32_t where, device_func func, float *block, int pnx, int pny,
 	starpu_data_handle_t block_handle;
         int i;
 
-	starpu_block_data_register(&block_handle, 0, (uintptr_t)block, pnx, pnx*pny, pnx, pny, pnz, sizeof(float));
+	starpu_block_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, pnx, pnx*pny, pnx, pny, pnz, sizeof(float));
 
 	starpu_codelet_init(&cl);
 	cl.where = where;

+ 1 - 1
examples/basic_examples/dynamic_handles.c

@@ -83,7 +83,7 @@ int main(int argc, char **argv)
 	for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
 	     dummy_big_cl.dyn_modes[i] = STARPU_RW;
 
-	starpu_variable_data_register(&handle, 0, (uintptr_t)&val, sizeof(int));
+	starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&val, sizeof(int));
 
 	task = starpu_task_create();
 	task->synchronous = 1;

+ 6 - 6
examples/basic_examples/mult.c

@@ -174,11 +174,11 @@ static void partition_mult_data(void)
 	 * node in which resides the matrix: 0 means that the 3rd argument is
 	 * an adress in main memory.
 	 */
-	starpu_matrix_data_register(&A_handle, 0, (uintptr_t)A, 
+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A, 
 		ydim, ydim, zdim, sizeof(float));
-	starpu_matrix_data_register(&B_handle, 0, (uintptr_t)B, 
+	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B, 
 		zdim, zdim, xdim, sizeof(float));
-	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C, 
+	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C, 
 		ydim, ydim, xdim, sizeof(float));
 
 	/* A filter is a method to partition a data into disjoint chunks, it is
@@ -365,9 +365,9 @@ int main(STARPU_ATTRIBUTE_UNUSED int argc,
 	 * starpu_data_map_filters is called again on C_handle.
 	 * The second argument is the memory node where the different subsets
 	 * should be reassembled, 0 = main memory (RAM) */
-	starpu_data_unpartition(A_handle, 0);
-	starpu_data_unpartition(B_handle, 0);
-	starpu_data_unpartition(C_handle, 0);
+	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(B_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
 
 	/* stop monitoring matrix C : after this, it is not possible to pass C 
 	 * (or any subset of C) as a codelet input/output. This also implements

+ 1 - 1
examples/basic_examples/multiformat.c

@@ -126,7 +126,7 @@ static void
 register_data(void)
 {
 	starpu_multiformat_data_register(&array_of_structs_handle,
-					 0,
+					 STARPU_MAIN_RAM,
 					 &array_of_structs,
 					 N_ELEMENTS,
 					 &format_ops);

+ 1 - 1
examples/basic_examples/variable.c

@@ -50,7 +50,7 @@ int main(int argc, char **argv)
         if (argc == 2) niter = atoi(argv[1]);
         foo = 0.0f;
 
-	starpu_variable_data_register(&float_array_handle, 0 /* home node */,
+	starpu_variable_data_register(&float_array_handle, STARPU_MAIN_RAM /* home node */,
                                       (uintptr_t)&foo, sizeof(float));
 
 #ifdef STARPU_USE_OPENCL

+ 2 - 2
examples/basic_examples/vector_scal.c

@@ -141,14 +141,14 @@ int main(int argc, char **argv)
 	 *  - the first argument of the registration method is a pointer to the
 	 *    handle that should describe the data
 	 *  - the second argument is the memory node where the data (ie. "vector")
-	 *    resides initially: 0 stands for an address in main memory, as
+	 *    resides initially: STARPU_MAIN_RAM stands for an address in main memory, as
 	 *    opposed to an adress on a GPU for instance.
 	 *  - the third argument is the adress of the vector in RAM
 	 *  - the fourth argument is the number of elements in the vector
 	 *  - the fifth argument is the size of each element.
 	 */
 	starpu_data_handle_t vector_handle;
-	starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
+	starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
 
 	float factor = 3.14;
 

+ 3 - 3
examples/basic_examples/vector_scal_c.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2013  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -69,14 +69,14 @@ int compute_(int *F_NX, float *vector)
 	 *  - the first argument of the registration method is a pointer to the
 	 *    handle that should describe the data
 	 *  - the second argument is the memory node where the data (ie. "vector")
-	 *    resides initially: 0 stands for an address in main memory, as
+	 *    resides initially: STARPU_MAIN_RAM stands for an address in main memory, as
 	 *    opposed to an adress on a GPU for instance.
 	 *  - the third argument is the adress of the vector in RAM
 	 *  - the fourth argument is the number of elements in the vector
 	 *  - the fifth argument is the size of each element.
 	 */
 	starpu_data_handle_t vector_handle;
-	starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
+	starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
 
 	float factor = 3.14;
 

+ 1 - 1
examples/binary/binary.c

@@ -42,7 +42,7 @@ int compute(char *file_name, int load_as_file)
 	int ret = 0;
 	unsigned niter = 500;
 
-	starpu_vector_data_register(&float_array_handle, 0, (uintptr_t)&float_array, 4, sizeof(float));
+	starpu_vector_data_register(&float_array_handle, STARPU_MAIN_RAM, (uintptr_t)&float_array, 4, sizeof(float));
 
 #ifdef STARPU_USE_OPENCL
 	if (load_as_file)

+ 1 - 1
examples/callback/callback.c

@@ -59,7 +59,7 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_variable_data_register(&handle, 0, (uintptr_t)&v, sizeof(int));
+	starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&v, sizeof(int));
 
 	struct starpu_task *task = starpu_task_create();
 	task->cl = &cl;

+ 14 - 14
examples/cg/cg.c

@@ -137,16 +137,16 @@ static void free_data(void)
 
 static void register_data(void)
 {
-	starpu_matrix_data_register(&A_handle, 0, (uintptr_t)A, n, n, n, sizeof(TYPE));
-	starpu_vector_data_register(&b_handle, 0, (uintptr_t)b, n, sizeof(TYPE));
-	starpu_vector_data_register(&x_handle, 0, (uintptr_t)x, n, sizeof(TYPE));
+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A, n, n, n, sizeof(TYPE));
+	starpu_vector_data_register(&b_handle, STARPU_MAIN_RAM, (uintptr_t)b, n, sizeof(TYPE));
+	starpu_vector_data_register(&x_handle, STARPU_MAIN_RAM, (uintptr_t)x, n, sizeof(TYPE));
 
-	starpu_vector_data_register(&r_handle, 0, (uintptr_t)r, n, sizeof(TYPE));
-	starpu_vector_data_register(&d_handle, 0, (uintptr_t)d, n, sizeof(TYPE));
-	starpu_vector_data_register(&q_handle, 0, (uintptr_t)q, n, sizeof(TYPE));
+	starpu_vector_data_register(&r_handle, STARPU_MAIN_RAM, (uintptr_t)r, n, sizeof(TYPE));
+	starpu_vector_data_register(&d_handle, STARPU_MAIN_RAM, (uintptr_t)d, n, sizeof(TYPE));
+	starpu_vector_data_register(&q_handle, STARPU_MAIN_RAM, (uintptr_t)q, n, sizeof(TYPE));
 
-	starpu_variable_data_register(&dtq_handle, 0, (uintptr_t)&dtq, sizeof(TYPE));
-	starpu_variable_data_register(&rtr_handle, 0, (uintptr_t)&rtr, sizeof(TYPE));
+	starpu_variable_data_register(&dtq_handle, STARPU_MAIN_RAM, (uintptr_t)&dtq, sizeof(TYPE));
+	starpu_variable_data_register(&rtr_handle, STARPU_MAIN_RAM, (uintptr_t)&rtr, sizeof(TYPE));
 
 	if (use_reduction)
 	{
@@ -160,13 +160,13 @@ static void register_data(void)
 
 static void unregister_data(void)
 {
-	starpu_data_unpartition(A_handle, 0);
-	starpu_data_unpartition(b_handle, 0);
-	starpu_data_unpartition(x_handle, 0);
+	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(b_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(x_handle, STARPU_MAIN_RAM);
 
-	starpu_data_unpartition(r_handle, 0);
-	starpu_data_unpartition(d_handle, 0);
-	starpu_data_unpartition(q_handle, 0);
+	starpu_data_unpartition(r_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(d_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(q_handle, STARPU_MAIN_RAM);
 
 	starpu_data_unregister(A_handle);
 	starpu_data_unregister(b_handle);

+ 3 - 3
examples/cholesky/cholesky_grain_tag.c

@@ -189,7 +189,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
-	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
@@ -250,7 +250,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 	{
 		/* stall the application until the end of computations */
 		starpu_tag_wait(TAG11_AUX(nblocks-1, reclevel));
-		starpu_data_unpartition(dataA, 0);
+		starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
 		starpu_data_unregister(dataA);
 		return 0;
 	}
@@ -274,7 +274,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 		free(tag_array);
 
-		starpu_data_unpartition(dataA, 0);
+		starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
 		starpu_data_unregister(dataA);
 
 		float *newmatA = &matA[nbigblocks*(size/nblocks)*(ld+1)];

+ 2 - 2
examples/cholesky/cholesky_implicit.c

@@ -183,7 +183,7 @@ static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
-	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 
 	struct starpu_data_filter f =
 	{
@@ -201,7 +201,7 @@ static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
 	int ret = _cholesky(dataA, nblocks);
 
-	starpu_data_unpartition(dataA, 0);
+	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
 	starpu_data_unregister(dataA);
 
 	return ret;

+ 3 - 3
examples/cholesky/cholesky_tag.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -234,7 +234,7 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	/* stall the application until the end of computations */
 	starpu_tag_wait(TAG11(nblocks-1));
 
-	starpu_data_unpartition(dataA, 0);
+	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
 
 	end = starpu_timing_now();
 
@@ -279,7 +279,7 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
-	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 

+ 1 - 1
examples/cholesky/cholesky_tile_tag.c

@@ -301,7 +301,7 @@ int main(int argc, char **argv)
 	{
 		if (x <= y)
 		{
-			starpu_matrix_data_register(&A_state[y][x], 0, (uintptr_t)A[y][x], 
+			starpu_matrix_data_register(&A_state[y][x], STARPU_MAIN_RAM, (uintptr_t)A[y][x], 
 				BLOCKSIZE, BLOCKSIZE, BLOCKSIZE, sizeof(float));
 		}
 	}

+ 1 - 1
examples/cpp/incrementer_cpp.cpp

@@ -50,7 +50,7 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_vector_data_register(&float_array_handle, 0, (uintptr_t)&float_array, 4, sizeof(float));
+	starpu_vector_data_register(&float_array_handle, STARPU_MAIN_RAM, (uintptr_t)&float_array, 4, sizeof(float));
 
 #ifdef STARPU_USE_OPENCL
         ret = starpu_opencl_load_opencl_from_file("examples/incrementer/incrementer_kernels_opencl_kernel.cl", &opencl_program, NULL);

+ 3 - 3
examples/filters/custom_mf/custom_interface.c

@@ -230,7 +230,7 @@ static size_t custom_interface_get_size(starpu_data_handle_t handle)
 	struct custom_data_interface *data_interface;
 
 	data_interface = (struct custom_data_interface *)
-				starpu_data_get_interface_on_node(handle, 0);
+				starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 	size = data_interface->nx * data_interface->ops->cpu_elemsize;
 	return size;
 }
@@ -243,7 +243,7 @@ static uint32_t footprint_custom_interface_crc32(starpu_data_handle_t handle)
 static void display_custom_interface(starpu_data_handle_t handle, FILE *f)
 {
 	struct custom_data_interface *ci = (struct custom_data_interface *)
-		starpu_data_get_interface_on_node(handle, 0);
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 	fprintf(f, "Custom interface of size %d", ci->nx);
 }
 
@@ -252,7 +252,7 @@ custom_get_nx(starpu_data_handle_t handle)
 {
 	struct custom_data_interface *data_interface;
 	data_interface = (struct custom_data_interface *)
-				starpu_data_get_interface_on_node(handle, 0);
+				starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 	return data_interface->nx;
 }
 

+ 2 - 2
examples/filters/custom_mf/custom_mf_filter.c

@@ -110,7 +110,7 @@ register_and_partition_data(void)
 		_array_of_structs[i].x = i+1.0;
 		_array_of_structs[i].y = 42.0;
 	}
-	custom_data_register(&_handle, 0, &_array_of_structs, N, &format_ops);
+	custom_data_register(&_handle, STARPU_MAIN_RAM, &_array_of_structs, N, &format_ops);
 
 	struct starpu_data_filter f =
 	{
@@ -125,7 +125,7 @@ register_and_partition_data(void)
 static void
 unpartition_and_unregister_data(void)
 {
-	starpu_data_unpartition(_handle, 0);
+	starpu_data_unpartition(_handle, STARPU_MAIN_RAM);
 	starpu_data_unregister(_handle);
 }
 

+ 2 - 2
examples/filters/fblock.c

@@ -115,7 +115,7 @@ int main(int argc, char **argv)
 #endif
 
         /* Declare data to StarPU */
-        starpu_block_data_register(&handle, 0, (uintptr_t)block, NX, NX*NY, NX, NY, NZ, sizeof(int));
+        starpu_block_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)block, NX, NX*NY, NX, NY, NZ, sizeof(int));
         FPRINTF(stderr, "IN  Block\n");
         print_data(handle);
 
@@ -159,7 +159,7 @@ int main(int argc, char **argv)
         }
 
         /* Unpartition the data, unregister it from StarPU and shutdown */
-        starpu_data_unpartition(handle, 0);
+        starpu_data_unpartition(handle, STARPU_MAIN_RAM);
         print_data(handle);
         starpu_data_unregister(handle);
 

+ 2 - 2
examples/filters/fmatrix.c

@@ -75,7 +75,7 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* Declare data to StarPU */
-	starpu_matrix_data_register(&handle, 0, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0]));
+	starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0]));
 
         /* Partition the matrix in PARTS sub-matrices */
 	struct starpu_data_filter f =
@@ -102,7 +102,7 @@ int main(int argc, char **argv)
 	}
 
         /* Unpartition the data, unregister it from StarPU and shutdown */
-	starpu_data_unpartition(handle, 0);
+	starpu_data_unpartition(handle, STARPU_MAIN_RAM);
         starpu_data_unregister(handle);
 	starpu_shutdown();
 

+ 2 - 2
examples/filters/fvector.c

@@ -63,7 +63,7 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* Declare data to StarPU */
-	starpu_vector_data_register(&handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
+	starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
 
         /* Partition the vector in PARTS sub-vectors */
 	struct starpu_data_filter f =
@@ -91,7 +91,7 @@ int main(int argc, char **argv)
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
-	starpu_data_unpartition(handle, 0);
+	starpu_data_unpartition(handle, STARPU_MAIN_RAM);
         starpu_data_unregister(handle);
 	starpu_shutdown();
 

+ 4 - 4
examples/filters/shadow.c

@@ -121,10 +121,10 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* Declare source vector to StarPU */
-	starpu_vector_data_register(&handle, 0, (uintptr_t)vector, NX + 2*SHADOW, sizeof(vector[0]));
+	starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX + 2*SHADOW, sizeof(vector[0]));
 
 	/* Declare destination vector to StarPU */
-	starpu_vector_data_register(&handle2, 0, (uintptr_t)vector2, NX + PARTS*2*SHADOW, sizeof(vector[0]));
+	starpu_vector_data_register(&handle2, STARPU_MAIN_RAM, (uintptr_t)vector2, NX + PARTS*2*SHADOW, sizeof(vector[0]));
 
         /* Partition the source vector in PARTS sub-vectors with shadows */
 	/* NOTE: the resulting handles should only be used in read-only mode,
@@ -163,8 +163,8 @@ int main(int argc, char **argv)
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
-	starpu_data_unpartition(handle, 0);
-	starpu_data_unpartition(handle2, 0);
+	starpu_data_unpartition(handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(handle2, STARPU_MAIN_RAM);
         starpu_data_unregister(handle);
         starpu_data_unregister(handle2);
 	starpu_shutdown();

+ 4 - 4
examples/filters/shadow2d.c

@@ -202,10 +202,10 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* Declare source matrix to StarPU */
-	starpu_matrix_data_register(&handle, 0, (uintptr_t)matrix, NX + 2*SHADOWX, NX + 2*SHADOWX, NY + 2*SHADOWY, sizeof(matrix[0][0]));
+	starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX + 2*SHADOWX, NX + 2*SHADOWX, NY + 2*SHADOWY, sizeof(matrix[0][0]));
 
 	/* Declare destination matrix to StarPU */
-	starpu_matrix_data_register(&handle2, 0, (uintptr_t)matrix2, NX + PARTSX*2*SHADOWX, NX + PARTSX*2*SHADOWX, NY + PARTSY*2*SHADOWY, sizeof(matrix2[0][0]));
+	starpu_matrix_data_register(&handle2, STARPU_MAIN_RAM, (uintptr_t)matrix2, NX + PARTSX*2*SHADOWX, NX + PARTSX*2*SHADOWX, NY + PARTSY*2*SHADOWY, sizeof(matrix2[0][0]));
 
         /* Partition the source matrix in PARTSY*PARTSX sub-matrices with shadows */
 	/* NOTE: the resulting handles should only be used in read-only mode,
@@ -258,8 +258,8 @@ int main(int argc, char **argv)
 		}
 	}
 
-	starpu_data_unpartition(handle, 0);
-	starpu_data_unpartition(handle2, 0);
+	starpu_data_unpartition(handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(handle2, STARPU_MAIN_RAM);
         starpu_data_unregister(handle);
         starpu_data_unregister(handle2);
 	starpu_shutdown();

+ 4 - 4
examples/filters/shadow3d.c

@@ -214,13 +214,13 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* Declare source matrix to StarPU */
-	starpu_block_data_register(&handle, 0, (uintptr_t)matrix,
+	starpu_block_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix,
 			NX + 2*SHADOWX, (NX + 2*SHADOWX) * (NY + 2*SHADOWY),
 			NX + 2*SHADOWX, NY + 2*SHADOWY, NZ + 2*SHADOWZ,
 			sizeof(matrix[0][0][0]));
 
 	/* Declare destination matrix to StarPU */
-	starpu_block_data_register(&handle2, 0, (uintptr_t)matrix2,
+	starpu_block_data_register(&handle2, STARPU_MAIN_RAM, (uintptr_t)matrix2,
 			NX + PARTSX*2*SHADOWX, (NX + PARTSX*2*SHADOWX) * (NY + PARTSY*2*SHADOWY),
 			NX + PARTSX*2*SHADOWX, NY + PARTSY*2*SHADOWY, NZ + PARTSZ*2*SHADOWZ,
 			sizeof(matrix2[0][0][0]));
@@ -290,8 +290,8 @@ int main(int argc, char **argv)
 		}
 	}
 
-	starpu_data_unpartition(handle, 0);
-	starpu_data_unpartition(handle2, 0);
+	starpu_data_unpartition(handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(handle2, STARPU_MAIN_RAM);
         starpu_data_unregister(handle);
         starpu_data_unregister(handle2);
 	starpu_shutdown();

+ 2 - 2
examples/heat/dw_factolu.c

@@ -750,7 +750,7 @@ void dw_factoLU(float *matA, unsigned size,
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
-	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, 
+	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, 
 			size, size, sizeof(float));
 
 	struct starpu_data_filter f =
@@ -779,7 +779,7 @@ void dw_factoLU(float *matA, unsigned size,
 	}
 
 	/* gather all the data */
-	starpu_data_unpartition(dataA, 0);
+	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
 
 	starpu_data_unregister(dataA);
 

+ 3 - 3
examples/heat/dw_factolu_grain.c

@@ -213,7 +213,7 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 	 * (re)partition data
 	 */
 	starpu_data_handle_t dataA;
-	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 
 	STARPU_ASSERT((size % blocksize) == 0);
 	STARPU_ASSERT((inner_size % blocksize) == 0);
@@ -288,7 +288,7 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 	{
 		/* we wait for the last task and we are done */
 		starpu_tag_wait(TAG11(nblocks-1, tag_prefix));
-		starpu_data_unpartition(dataA, 0);		
+		starpu_data_unpartition(dataA, STARPU_MAIN_RAM);		
 		return;
 	}
 	else
@@ -312,7 +312,7 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 
 		free(tag_array);
 
-		starpu_data_unpartition(dataA, 0);
+		starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
 		starpu_data_unregister(dataA);
 
 		float *newmatA = &matA[inner_size*(ld+1)];

+ 3 - 3
examples/heat/dw_factolu_tag.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -297,7 +297,7 @@ void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, u
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
-	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 
 	struct starpu_data_filter f =
 	{
@@ -316,7 +316,7 @@ void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, u
 	dw_codelet_facto_v3(dataA, nblocks);
 
 	/* gather all the data */
-	starpu_data_unpartition(dataA, 0);
+	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
 
 	starpu_data_unregister(dataA);
 

+ 6 - 6
examples/heat/dw_sparse_cg.c

@@ -357,10 +357,10 @@ void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 	starpu_data_handle_t ds_vecr, ds_vecd, ds_vecq;
 
 	/* first the user-allocated data */
-	starpu_csr_data_register(&ds_matrixA, 0, nnz, nrow,
+	starpu_csr_data_register(&ds_matrixA, STARPU_MAIN_RAM, nnz, nrow,
 			(uintptr_t)nzvalA, colind, rowptr, 0, sizeof(float));
-	starpu_vector_data_register(&ds_vecx, 0, (uintptr_t)vecx, nrow, sizeof(float));
-	starpu_vector_data_register(&ds_vecb, 0, (uintptr_t)vecb, nrow, sizeof(float));
+	starpu_vector_data_register(&ds_vecx, STARPU_MAIN_RAM, (uintptr_t)vecx, nrow, sizeof(float));
+	starpu_vector_data_register(&ds_vecb, STARPU_MAIN_RAM, (uintptr_t)vecb, nrow, sizeof(float));
 
 	/* then allocate the algorithm intern data */
 	float *ptr_vecr, *ptr_vecd, *ptr_vecq;
@@ -380,9 +380,9 @@ void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 	FPRINTF(stdout, "nrow = %u \n", nrow);
 
 	/* and register them as well */
-	starpu_vector_data_register(&ds_vecr, 0, (uintptr_t)ptr_vecr, nrow, sizeof(float));
-	starpu_vector_data_register(&ds_vecd, 0, (uintptr_t)ptr_vecd, nrow, sizeof(float));
-	starpu_vector_data_register(&ds_vecq, 0, (uintptr_t)ptr_vecq, nrow, sizeof(float));
+	starpu_vector_data_register(&ds_vecr, STARPU_MAIN_RAM, (uintptr_t)ptr_vecr, nrow, sizeof(float));
+	starpu_vector_data_register(&ds_vecd, STARPU_MAIN_RAM, (uintptr_t)ptr_vecd, nrow, sizeof(float));
+	starpu_vector_data_register(&ds_vecq, STARPU_MAIN_RAM, (uintptr_t)ptr_vecq, nrow, sizeof(float));
 
 	/* we now have the complete problem */
 	struct cg_problem problem;

+ 1 - 1
examples/incrementer/incrementer.c

@@ -55,7 +55,7 @@ int main(int argc, char **argv)
 	float float_array[4] STARPU_ATTRIBUTE_ALIGNED(16) = { 0.0f, 0.0f, 0.0f, 0.0f};
 
 	starpu_data_handle_t float_array_handle;
-	starpu_vector_data_register(&float_array_handle, 0 /* home node */,
+	starpu_vector_data_register(&float_array_handle, STARPU_MAIN_RAM /* home node */,
 			(uintptr_t)&float_array, 4, sizeof(float));
 
 #ifdef STARPU_USE_OPENCL

+ 2 - 2
examples/interface/complex.c

@@ -92,8 +92,8 @@ int main(int argc, char **argv)
 						  &opencl_program, NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
 #endif
-	starpu_complex_data_register(&handle1, 0, &real, &imaginary, 1);
-	starpu_complex_data_register(&handle2, 0, &copy_real, &copy_imaginary, 1);
+	starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
+	starpu_complex_data_register(&handle2, STARPU_MAIN_RAM, &copy_real, &copy_imaginary, 1);
 
 	ret = starpu_insert_task(&cl_display, STARPU_VALUE, "handle1", strlen("handle1"), STARPU_R, handle1, 0);
 	if (ret == -ENODEV) goto end;

+ 3 - 3
examples/interface/complex_interface.c

@@ -21,7 +21,7 @@
 double *starpu_complex_get_real(starpu_data_handle_t handle)
 {
 	struct starpu_complex_interface *complex_interface =
-		(struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, 0);
+		(struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return complex_interface->real;
 }
@@ -29,7 +29,7 @@ double *starpu_complex_get_real(starpu_data_handle_t handle)
 double *starpu_complex_get_imaginary(starpu_data_handle_t handle)
 {
 	struct starpu_complex_interface *complex_interface =
-		(struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, 0);
+		(struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return complex_interface->imaginary;
 }
@@ -37,7 +37,7 @@ double *starpu_complex_get_imaginary(starpu_data_handle_t handle)
 int starpu_complex_get_nx(starpu_data_handle_t handle)
 {
 	struct starpu_complex_interface *complex_interface =
-		(struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, 0);
+		(struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	return complex_interface->nx;
 }

+ 2 - 1
examples/lu/lu_example.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -96,6 +96,7 @@ static void parse_args(int argc, char **argv)
 		else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
 		{
 			fprintf(stderr,"usage: lu [-size n] [-nblocks b] [-piv] [-no-stride] [-profile] [-bound] [-bounddeps] [-bounddepsprio]\n");
+			fprintf(stderr,"Default is size %lu and nblocks %u\n", size, nblocks);
 			exit(0);
 		}
 	}

+ 2 - 2
examples/lu/xlu.c

@@ -249,7 +249,7 @@ int STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigned
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
-	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
+	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
 
 	/* We already enforce deps by hand */
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
@@ -271,7 +271,7 @@ int STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigned
 	int ret = dw_codelet_facto_v3(dataA, nblocks);
 
 	/* gather all the data */
-	starpu_data_unpartition(dataA, 0);
+	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
 	starpu_data_unregister(dataA);
 
 	return ret;

+ 2 - 2
examples/lu/xlu_implicit.c

@@ -152,7 +152,7 @@ int STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigned
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
-	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
+	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
 
 	struct starpu_data_filter f =
 	{
@@ -171,7 +171,7 @@ int STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigned
 	int ret = dw_codelet_facto_v3(dataA, nblocks);
 
 	/* gather all the data */
-	starpu_data_unpartition(dataA, 0);
+	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
 	starpu_data_unregister(dataA);
 	return ret;
 }

+ 3 - 3
examples/lu/xlu_implicit_pivot.c

@@ -206,7 +206,7 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
-	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
+	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
 
 	struct starpu_data_filter f =
 	{
@@ -246,7 +246,7 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 
 	/* gather all the data */
-	starpu_data_unpartition(dataA, 0);
+	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
 	starpu_data_unregister(dataA);
 
 	free(piv_description);
@@ -270,7 +270,7 @@ int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, uns
 	for (bj = 0; bj < nblocks; bj++)
 	for (bi = 0; bi < nblocks; bi++)
 	{
-		starpu_matrix_data_register(&dataAp[bi+nblocks*bj], 0,
+		starpu_matrix_data_register(&dataAp[bi+nblocks*bj], STARPU_MAIN_RAM,
 			(uintptr_t)matA[bi+nblocks*bj], size/nblocks,
 			size/nblocks, size/nblocks, sizeof(TYPE));
 	}

+ 3 - 3
examples/lu/xlu_pivot.c

@@ -338,7 +338,7 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
-	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
+	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
 
 	/* We already enforce deps by hand */
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
@@ -390,7 +390,7 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 
 	/* gather all the data */
-	starpu_data_unpartition(dataA, 0);
+	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
 	free(piv_description);
 
 	return ret;
@@ -413,7 +413,7 @@ int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, uns
 	for (bj = 0; bj < nblocks; bj++)
 	for (bi = 0; bi < nblocks; bi++)
 	{
-		starpu_matrix_data_register(&dataAp[bi+nblocks*bj], 0,
+		starpu_matrix_data_register(&dataAp[bi+nblocks*bj], STARPU_MAIN_RAM,
 			(uintptr_t)matA[bi+nblocks*bj], size/nblocks,
 			size/nblocks, size/nblocks, sizeof(TYPE));
 

+ 1 - 1
examples/mandelbrot/mandelbrot.c

@@ -501,7 +501,7 @@ int main(int argc, char **argv)
 	for (iby = 0; iby < nblocks; iby++)
 	{
 		unsigned *data = &buffer[iby*block_size*width];
-		starpu_vector_data_register(&block_handles[iby], 0,
+		starpu_vector_data_register(&block_handles[iby], STARPU_MAIN_RAM,
                         (uintptr_t)data, block_size*width, sizeof(unsigned));
 	}
 

+ 3 - 3
examples/matvecmult/matvecmult.c

@@ -186,9 +186,9 @@ int main(int argc, char **argv)
         fillArray(mult, height);
         matVecMult(matrix, vector, width, height, correctResult);
 
-	starpu_matrix_data_register(&matrix_handle, 0, (uintptr_t)matrix, width, width, height, sizeof(float));
-	starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, width, sizeof(float));
-	starpu_vector_data_register(&mult_handle, 0, (uintptr_t)mult, height, sizeof(float));
+	starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix, width, width, height, sizeof(float));
+	starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, width, sizeof(float));
+	starpu_vector_data_register(&mult_handle, STARPU_MAIN_RAM, (uintptr_t)mult, height, sizeof(float));
 
 #ifdef STARPU_USE_OPENCL
         ret = starpu_opencl_load_opencl_from_file("examples/matvecmult/matvecmult_kernel.cl", &opencl_code, NULL);

+ 6 - 6
examples/mult/xgemm.c

@@ -110,11 +110,11 @@ static void init_problem_data(void)
 
 static void partition_mult_data(void)
 {
-	starpu_matrix_data_register(&A_handle, 0, (uintptr_t)A,
+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A,
 		ydim, ydim, zdim, sizeof(TYPE));
-	starpu_matrix_data_register(&B_handle, 0, (uintptr_t)B,
+	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B,
 		zdim, zdim, xdim, sizeof(TYPE));
-	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C,
+	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C,
 		ydim, ydim, xdim, sizeof(TYPE));
 
 	struct starpu_data_filter vert;
@@ -346,9 +346,9 @@ int main(int argc, char **argv)
 	FPRINTF(stderr, "GFlop/s: %.2f\n", flops/timing/1000.0);
 
 enodev:
-	starpu_data_unpartition(C_handle, 0);
-	starpu_data_unpartition(B_handle, 0);
-	starpu_data_unpartition(A_handle, 0);
+	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(B_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
 
 	starpu_data_unregister(A_handle);
 	starpu_data_unregister(B_handle);

+ 1 - 1
examples/openmp/vector_scal_omp.c

@@ -96,7 +96,7 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_data_handle_t vector_handle;
-	starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
+	starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
 
 	float factor = 1.001;
 

+ 3 - 3
examples/pi/pi.c

@@ -135,13 +135,13 @@ int main(int argc, char **argv)
 
 	/* Any worker may use that array now */
 	starpu_data_handle_t sobol_qrng_direction_handle;
-	starpu_vector_data_register(&sobol_qrng_direction_handle, 0,
+	starpu_vector_data_register(&sobol_qrng_direction_handle, STARPU_MAIN_RAM,
 		(uintptr_t)sobol_qrng_directions, n_dimensions*n_directions, sizeof(unsigned));
 
 	unsigned *cnt_array = malloc(ntasks*sizeof(unsigned));
 	STARPU_ASSERT(cnt_array);
 	starpu_data_handle_t cnt_array_handle;
-	starpu_vector_data_register(&cnt_array_handle, 0, (uintptr_t)cnt_array, ntasks, sizeof(unsigned));
+	starpu_vector_data_register(&cnt_array_handle, STARPU_MAIN_RAM, (uintptr_t)cnt_array, ntasks, sizeof(unsigned));
 
 	/* Use a write-through policy : when the data is modified on an
 	 * accelerator, we know that it will only be modified once and be
@@ -179,7 +179,7 @@ int main(int argc, char **argv)
 	starpu_task_wait_for_all();
 
 	/* Get the cnt_array back in main memory */
-	starpu_data_unpartition(cnt_array_handle, 0);
+	starpu_data_unpartition(cnt_array_handle, STARPU_MAIN_RAM);
 	starpu_data_unregister(cnt_array_handle);
 	starpu_data_unregister(sobol_qrng_direction_handle);
 

+ 1 - 1
examples/pi/pi_redux.c

@@ -334,7 +334,7 @@ int main(int argc, char **argv)
 	 * [-1,1]^2. */
 	unsigned long shot_cnt = 0;
 	starpu_data_handle_t shot_cnt_handle;
-	starpu_variable_data_register(&shot_cnt_handle, 0,
+	starpu_variable_data_register(&shot_cnt_handle, STARPU_MAIN_RAM,
 			(uintptr_t)&shot_cnt, sizeof(shot_cnt));
 
 	starpu_data_set_reduction_methods(shot_cnt_handle,

+ 6 - 6
examples/ppm_downscaler/yuv_downscaler.c

@@ -159,39 +159,39 @@ int main(int argc, char **argv)
 	for (frame = 0; frame < nframes; frame++)
 	{
 		/* register Y layer */
-		starpu_matrix_data_register(&frame_y_handle[frame], 0,
+		starpu_matrix_data_register(&frame_y_handle[frame], STARPU_MAIN_RAM,
 			(uintptr_t)&yuv_in_buffer[frame].y,
 			WIDTH, WIDTH, HEIGHT, sizeof(uint8_t));
 
 		starpu_data_partition(frame_y_handle[frame], &filter_y);
 
-		starpu_matrix_data_register(&new_frame_y_handle[frame], 0,
+		starpu_matrix_data_register(&new_frame_y_handle[frame], STARPU_MAIN_RAM,
 			(uintptr_t)&yuv_out_buffer[frame].y,
 			NEW_WIDTH, NEW_WIDTH, NEW_HEIGHT, sizeof(uint8_t));
 
 		starpu_data_partition(new_frame_y_handle[frame], &filter_y);
 
 		/* register U layer */
-		starpu_matrix_data_register(&frame_u_handle[frame], 0,
+		starpu_matrix_data_register(&frame_u_handle[frame], STARPU_MAIN_RAM,
 			(uintptr_t)&yuv_in_buffer[frame].u,
 			WIDTH/2, WIDTH/2, HEIGHT/2, sizeof(uint8_t));
 
 		starpu_data_partition(frame_u_handle[frame], &filter_uv);
 
-		starpu_matrix_data_register(&new_frame_u_handle[frame], 0,
+		starpu_matrix_data_register(&new_frame_u_handle[frame], STARPU_MAIN_RAM,
 			(uintptr_t)&yuv_out_buffer[frame].u,
 			NEW_WIDTH/2, NEW_WIDTH/2, NEW_HEIGHT/2, sizeof(uint8_t));
 
 		starpu_data_partition(new_frame_u_handle[frame], &filter_uv);
 
 		/* register V layer */
-		starpu_matrix_data_register(&frame_v_handle[frame], 0,
+		starpu_matrix_data_register(&frame_v_handle[frame], STARPU_MAIN_RAM,
 			(uintptr_t)&yuv_in_buffer[frame].v,
 			WIDTH/2, WIDTH/2, HEIGHT/2, sizeof(uint8_t));
 
 		starpu_data_partition(frame_v_handle[frame], &filter_uv);
 
-		starpu_matrix_data_register(&new_frame_v_handle[frame], 0,
+		starpu_matrix_data_register(&new_frame_v_handle[frame], STARPU_MAIN_RAM,
 			(uintptr_t)&yuv_out_buffer[frame].v,
 			NEW_WIDTH/2, NEW_WIDTH/2, NEW_HEIGHT/2, sizeof(uint8_t));
 

+ 3 - 3
examples/reductions/dot_product.c

@@ -370,13 +370,13 @@ int main(int argc, char **argv)
 	unsigned block;
 	for (block = 0; block < _nblocks; block++)
 	{
-		starpu_vector_data_register(&_x_handles[block], 0,
+		starpu_vector_data_register(&_x_handles[block], STARPU_MAIN_RAM,
 			(uintptr_t)&_x[_entries_per_block*block], _entries_per_block, sizeof(float));
-		starpu_vector_data_register(&_y_handles[block], 0,
+		starpu_vector_data_register(&_y_handles[block], STARPU_MAIN_RAM,
 			(uintptr_t)&_y[_entries_per_block*block], _entries_per_block, sizeof(float));
 	}
 
-	starpu_variable_data_register(&_dot_handle, 0, (uintptr_t)&_dot, sizeof(DOT_TYPE));
+	starpu_variable_data_register(&_dot_handle, STARPU_MAIN_RAM, (uintptr_t)&_dot, sizeof(DOT_TYPE));
 
 	/*
 	 *	Compute dot product with StarPU

+ 2 - 2
examples/reductions/minmax_reduction.c

@@ -161,7 +161,7 @@ int main(int argc, char **argv)
 	for (block = 0; block < _nblocks; block++)
 	{
 		uintptr_t block_start = (uintptr_t)&_x[_entries_per_bock*block];
-		starpu_vector_data_register(&_x_handles[block], 0, block_start,
+		starpu_vector_data_register(&_x_handles[block], STARPU_MAIN_RAM, block_start,
 					    _entries_per_bock, sizeof(TYPE));
 	}
 
@@ -171,7 +171,7 @@ int main(int argc, char **argv)
 	/* Initialize current max */
 	_minmax[1] = TYPE_MIN;
 
-	starpu_variable_data_register(&_minmax_handle, 0, (uintptr_t)_minmax, 2*sizeof(TYPE));
+	starpu_variable_data_register(&_minmax_handle, STARPU_MAIN_RAM, (uintptr_t)_minmax, 2*sizeof(TYPE));
 
 	/* Set the methods to define neutral elements and to perform the reduction operation */
 	starpu_data_set_reduction_methods(_minmax_handle, &minmax_redux_codelet, &minmax_init_codelet);

+ 1 - 1
examples/spmd/vector_scal_spmd.c

@@ -114,7 +114,7 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_data_handle_t vector_handle;
-	starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
+	starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
 
 	float factor = 1.001;
 

+ 7 - 7
examples/spmv/dw_block_spmv.c

@@ -47,7 +47,7 @@ void create_data(void)
 	bcsr_matrix = mm_file_to_bcsr(inputfile, c, r);
 
 	/* declare the corresponding block CSR to the runtime */
-	starpu_bcsr_data_register(&sparse_matrix, 0, bcsr_matrix->nnz_blocks, bcsr_matrix->nrows_blocks,
+	starpu_bcsr_data_register(&sparse_matrix, STARPU_MAIN_RAM, bcsr_matrix->nnz_blocks, bcsr_matrix->nrows_blocks,
 	                (uintptr_t)bcsr_matrix->val, bcsr_matrix->colind, bcsr_matrix->rowptr, 
 			0, bcsr_matrix->r, bcsr_matrix->c, sizeof(float));
 
@@ -69,16 +69,16 @@ void create_data(void)
 		vector_out_ptr[ind] = 0.0f;
 	}
 
-	starpu_vector_data_register(&vector_in, 0, (uintptr_t)vector_in_ptr, size, sizeof(float));
-	starpu_vector_data_register(&vector_out, 0, (uintptr_t)vector_out_ptr, size, sizeof(float));
+	starpu_vector_data_register(&vector_in, STARPU_MAIN_RAM, (uintptr_t)vector_in_ptr, size, sizeof(float));
+	starpu_vector_data_register(&vector_out, STARPU_MAIN_RAM, (uintptr_t)vector_out_ptr, size, sizeof(float));
 }
 
 void unregister_data(void)
 {
-	starpu_data_unpartition(sparse_matrix, 0);
+	starpu_data_unpartition(sparse_matrix, STARPU_MAIN_RAM);
 	starpu_data_unregister(sparse_matrix);
 
-	starpu_data_unpartition(vector_in, 0);
+	starpu_data_unpartition(vector_in, STARPU_MAIN_RAM);
 	starpu_data_unregister(vector_in);
 
 	starpu_data_unregister(vector_out);
@@ -98,8 +98,8 @@ void init_problem_callback(void *arg)
 		printf("DONE ...\n");
 		gettimeofday(&end, NULL);
 
-/*		starpu_data_unpartition(sparse_matrix, 0); */
-		starpu_data_unpartition(vector_out, 0);
+/*		starpu_data_unpartition(sparse_matrix, STARPU_MAIN_RAM); */
+		starpu_data_unpartition(vector_out, STARPU_MAIN_RAM);
 
 		sem_post(&sem);
 	}

+ 5 - 5
examples/spmv/spmv.c

@@ -192,9 +192,9 @@ int main(int argc, char **argv)
 	/*
 	 *	Register the CSR matrix and the 2 vectors
 	 */
-	starpu_csr_data_register(&sparse_matrix, 0, nnz, size, (uintptr_t)nzval, colind, rowptr, 0, sizeof(float));
-	starpu_vector_data_register(&vector_in, 0, (uintptr_t)vector_in_ptr, size, sizeof(float));
-	starpu_vector_data_register(&vector_out, 0, (uintptr_t)vector_out_ptr, size, sizeof(float));
+	starpu_csr_data_register(&sparse_matrix, STARPU_MAIN_RAM, nnz, size, (uintptr_t)nzval, colind, rowptr, 0, sizeof(float));
+	starpu_vector_data_register(&vector_in, STARPU_MAIN_RAM, (uintptr_t)vector_in_ptr, size, sizeof(float));
+	starpu_vector_data_register(&vector_out, STARPU_MAIN_RAM, (uintptr_t)vector_out_ptr, size, sizeof(float));
 
 	/*
 	 *	Partition the CSR matrix and the output vector
@@ -239,8 +239,8 @@ int main(int argc, char **argv)
 	/*
 	 *	Unregister the CSR matrix and the output vector
 	 */
-	starpu_data_unpartition(sparse_matrix, 0);
-	starpu_data_unpartition(vector_out, 0);
+	starpu_data_unpartition(sparse_matrix, STARPU_MAIN_RAM);
+	starpu_data_unpartition(vector_out, STARPU_MAIN_RAM);
 
 	/*
 	 *	Unregister data

+ 1 - 1
examples/stencil/stencil-blocks.c

@@ -262,7 +262,7 @@ static void allocate_block_on_node(starpu_data_handle_t *handleptr, TYPE **ptr,
 	memset(*ptr, 0, block_size);
 
 	/* Register it to StarPU */
-	starpu_block_data_register(handleptr, 0, (uintptr_t)*ptr, nx, nx*ny, nx, ny, nz, sizeof(TYPE));
+	starpu_block_data_register(handleptr, STARPU_MAIN_RAM, (uintptr_t)*ptr, nx, nx*ny, nx, ny, nz, sizeof(TYPE));
 }
 
 void display_memory_consumption(int rank)

+ 1 - 1
gcc-plugin/tests/output-pointer.c

@@ -82,7 +82,7 @@ main (int argc, char *argv[])
   expected_register_arguments.pointer = x;
   expected_register_arguments.elements = 42;
   expected_register_arguments.element_size = sizeof x[0];
-  starpu_vector_data_register (&handle, 0, (uintptr_t) x, 42, sizeof x[0]);
+  starpu_vector_data_register (&handle, STARPU_MAIN_RAM, (uintptr_t) x, 42, sizeof x[0]);
 
   struct insert_task_argument expected[] =
     {

+ 2 - 2
gcc-plugin/tests/pointers.c

@@ -85,12 +85,12 @@ main (int argc, char *argv[])
   expected_register_arguments.pointer = x;
   expected_register_arguments.elements = 1;
   expected_register_arguments.element_size = sizeof x[0];
-  starpu_vector_data_register (&handle, 0, (uintptr_t) x, 1, sizeof x[0]);
+  starpu_vector_data_register (&handle, STARPU_MAIN_RAM, (uintptr_t) x, 1, sizeof x[0]);
 
   expected_register_arguments.pointer = y;
   expected_register_arguments.elements = 1;
   expected_register_arguments.element_size = sizeof *y;
-  starpu_vector_data_register (&handle, 0, (uintptr_t) y, 1, sizeof *y);
+  starpu_vector_data_register (&handle, STARPU_MAIN_RAM, (uintptr_t) y, 1, sizeof *y);
 
   struct insert_task_argument expected_pointer_task[] =
     {

+ 1 - 0
include/starpu.h

@@ -45,6 +45,7 @@ typedef UINT_PTR uintptr_t;
 #include <starpu_thread_util.h>
 #include <starpu_util.h>
 #include <starpu_data.h>
+#include <starpu_disk.h>
 #include <starpu_data_interfaces.h>
 #include <starpu_data_filters.h>
 #include <starpu_stdlib.h>

+ 4 - 0
include/starpu_data.h

@@ -90,15 +90,19 @@ int starpu_data_request_allocation(starpu_data_handle_t handle, unsigned node);
 
 int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async);
 
+#define STARPU_MAIN_RAM 0
+
 enum starpu_node_kind
 {
 	STARPU_UNUSED     = 0x00,
 	STARPU_CPU_RAM    = 0x01,
 	STARPU_CUDA_RAM   = 0x02,
 	STARPU_OPENCL_RAM = 0x03,
+	STARPU_DISK_RAM   = 0x04,
 	STARPU_MIC_RAM    = 0x05,
 	STARPU_SCC_RAM    = 0x06,
 	STARPU_SCC_SHM    = 0x07
+
 };
 
 unsigned starpu_worker_get_memory_node(unsigned workerid);

+ 11 - 11
include/starpu_data_interfaces.h

@@ -101,24 +101,24 @@ enum starpu_data_interface_id
 
 struct starpu_data_interface_ops
 {
-	void (*register_data_handle)(starpu_data_handle_t handle,
-				     unsigned home_node, void *data_interface);
-	starpu_ssize_t (*allocate_data_on_node)(void *data_interface, unsigned node);
-	void (*free_data_on_node)(void *data_interface, unsigned node);
+	void		 (*register_data_handle)	(starpu_data_handle_t handle,
+								unsigned home_node, void *data_interface);
+	starpu_ssize_t	 (*allocate_data_on_node)	(void *data_interface, unsigned node);
+	void 		 (*free_data_on_node)		(void *data_interface, unsigned node);
 	const struct starpu_data_copy_methods *copy_methods;
-	void * (*handle_to_pointer)(starpu_data_handle_t handle, unsigned node);
-	size_t (*get_size)(starpu_data_handle_t handle);
-	uint32_t (*footprint)(starpu_data_handle_t handle);
-	int (*compare)(void *data_interface_a, void *data_interface_b);
-	void (*display)(starpu_data_handle_t handle, FILE *f);
+	void * 		 (*handle_to_pointer)		(starpu_data_handle_t handle, unsigned node);
+	size_t 		 (*get_size)			(starpu_data_handle_t handle);
+	uint32_t 	 (*footprint)			(starpu_data_handle_t handle);
+	int 		 (*compare)			(void *data_interface_a, void *data_interface_b);
+	void 		 (*display)			(starpu_data_handle_t handle, FILE *f);
 	enum starpu_data_interface_id interfaceid;
 	size_t interface_size;
 
 	int is_multiformat;
 	struct starpu_multiformat_data_interface_ops* (*get_mf_ops)(void *data_interface);
 
-	int (*pack_data)(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count);
-	int (*unpack_data)(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
+	int (*pack_data) (starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count);
+	int (*unpack_data) (starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
 };
 
 int starpu_data_interface_get_next_id(void);

+ 0 - 0
include/starpu_deprecated_api.h


部分文件因文件數量過多而無法顯示