13 gadi atpakaļ · 41f3054013
--- a/ChangeLog
+++ b/ChangeLog
@@ -59,6 +59,9 @@ Changes:
 
				       of the libOpenCL.so file of the OCL ICD implementation.
			
 
				   * Fix main memory leak on multiple unregister/re-register.
			
 
				   * Improve hwloc detection by configure
			
 
				+  * Cell:
			
 
				+    - It is no longer possible to enable the cell support via the
			
 
				+      gordon driver
			
 
				 
			
 
				 Small changes:
			
 
				   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
			
--- a/configure.ac
+++ b/configure.ac
@@ -775,72 +775,26 @@ if test x$disable_asynchronous_opencl_copy = xyes ; then
 
				    AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and OpenCL devices])
			
 
				 fi
			
 
				 
			
 
				-
			
 
				 ###############################################################################
			
 
				 #                                                                             #
			
 
				-#                                 Cell settings                               #
			
 
				+#                                 Cell                                        #
			
 
				 #                                                                             #
			
 
				 ###############################################################################
			
 
				 
			
 
				-#TODO fix the default dir
			
 
				-AC_ARG_ENABLE(gordon, [AS_HELP_STRING([--enable-gordon],
			
 
				-			[use Cell SPUs])],, enable_gordon=maybe)
			
 
				-
			
 
				-if test x$enable_gordon = xyes -o x$enable_gordon = xmaybe; then
			
 
				-
			
 
				-	AC_ARG_WITH(gordon-dir, [AS_HELP_STRING([--with-gordon-dir=<path>],
			
 
				-			[specify Gordon installation directory (default is /usr/local/)])],
			
 
				-			[
			
 
				-				gordon_dir="$withval"
			
 
				-				enable_gordon=yes
			
 
				-			], gordon_dir=/usr/local/)
			
 
				-
			
 
				-	# do we have a valid Gordon setup ?
			
 
				-	have_valid_gordon=yes
			
 
				-
			
 
				-	# can we use dynamic code loading facilities ?
			
 
				-	AC_CHECK_LIB(elf, elf_memory,, [have_valid_gordon=no])
			
 
				-
			
 
				-	AC_CHECK_LIB(spe2, spe_context_create,,[have_valid_gordon=no])
			
 
				-	AC_CHECK_FUNC(spe_in_mbox_write, [], [have_valid_gordon=no])
			
 
				-
			
 
				-	PKG_PROG_PKG_CONFIG
			
 
				-	if test -d "$gordon_dir"; then
			
 
				-		PKG_CONFIG_PATH="${PKG_CONFIG_PATH}:$gordon_dir"
			
 
				-	fi
			
 
				-	AC_SUBST(PKG_CONFIG_PATH)
			
 
				-	PKG_CHECK_MODULES([GORDON], [libgordon], [], have_valid_gordon=no)
			
 
				-
			
 
				-	CPPFLAGS="${CPPFLAGS} ${GORDON_CFLAGS}"
			
 
				-	LIBS="${LIBS} ${GORDON_LIBS}"
			
 
				-
			
 
				-	# AC_CHECK_FUNC(gordon_init, [gordon], [have_valid_gordon=no])
			
 
				-
			
 
				-	# in case Gordon was explicitely required, but is not available, this is an error
			
 
				-	if test x$enable_gordon = xyes -a x$have_valid_gordon = xno; then
			
 
				-		AC_MSG_ERROR([cannot find Gordon])
			
 
				-	fi
			
 
				-
			
 
				+# warning: Cell driver has been removed from configure, but as the
			
 
				+# source code is still available, we need to define the minimum
			
 
				+# requirements to compile
			
 
				+AC_DEFINE_UNQUOTED(STARPU_MAXGORDONDEVS, [1], [maximum number of GORDON devices])
			
 
				 
			
 
				-	# now we enable Gordon if and only if a proper setup is available
			
 
				-	enable_gordon=$have_valid_gordon
			
 
				-	AC_DEFINE(STARPU_MAXGORDONDEVS, [1], [maximum number of GORDON devices])
			
 
				-fi
			
 
				-
			
 
				-AC_MSG_CHECKING(whether GORDON should be used)
			
 
				-AC_MSG_RESULT($enable_gordon)
			
 
				-AC_SUBST(STARPU_USE_GORDON, $enable_gordon)
			
 
				-AM_CONDITIONAL(STARPU_USE_GORDON, test x$enable_gordon = xyes)
			
 
				-
			
 
				-if test x$enable_gordon = xyes; then
			
 
				-	AC_DEFINE(STARPU_USE_GORDON, [1], [Cell support is enabled])
			
 
				-	GORDON_REQUIRES=gordon
			
 
				-fi
			
 
				-AC_SUBST(GORDON_REQUIRES)
			
 
				+###############################################################################
			
 
				+#                                                                             #
			
 
				+#                                 Drivers                                     #
			
 
				+#                                                                             #
			
 
				+###############################################################################
			
 
				 
			
 
				 AC_MSG_CHECKING(whether blocking drivers should be disabled)
			
 
				 AC_ARG_ENABLE(blocking-drivers, [AS_HELP_STRING([--disable-blocking-drivers], [disable blocking drivers])],
			
 
				-				enable_blocking=$enableval, enable_blocking=$enable_gordon)
			
 
				+				enable_blocking=$enableval, enable_blocking=no)
			
 
				 AC_MSG_RESULT($enable_blocking)
			
 
				 
			
 
				 if test x$enable_blocking = xno; then
			
@@ -1015,8 +969,7 @@ AC_MSG_RESULT($nmaxbuffers)
 
				 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
			
 
				 		[how many buffers can be manipulated per task])
			
 
				 
			
 
				-# We have one memory node shared by all CPU workers, one node per GPU, and
			
 
				-# currently the Cell driver is using the same memory node as the CPU.
			
 
				+# We have one memory node shared by all CPU workers, one node per GPU
			
 
				 maxnodes=1
			
 
				 if test x$enable_cuda = xyes ; then
			
 
				 	# we could have used nmaxcudadev + 1, but this would certainly give an
			
@@ -1406,12 +1359,6 @@ AC_ARG_ENABLE([socl],
 
				 
			
 
				 AC_MSG_CHECKING(for SOCL)
			
 
				 
			
 
				-if test "x$enable_socl" = "xyes" -o "x$enable_socl" = "xmaybe" ; then
			
 
				-    if test "$have_valid_opencl" = "no" ; then
			
 
				-	STARPU_LOOK_FOR_OPENCL()
			
 
				-    fi
			
 
				-fi
			
 
				-
			
 
				 # in case SOCL was explicitely required, but is not available, this is an error
			
 
				 if test "x$enable_socl" = "xyes" -a "$have_valid_opencl" = "no" ; then
			
 
				     AC_MSG_ERROR([SOCL cannot be enabled without OpenCL])
			
@@ -1865,7 +1812,6 @@ AC_MSG_NOTICE([
 
				 	CPUs   enabled: $enable_cpu
			
 
				 	CUDA   enabled: $enable_cuda
			
 
				 	OpenCL enabled: $enable_opencl
			
 
				-	Cell   enabled: $enable_gordon
			
 
				 
			
 
				 	Compile-time limits
			
 
				 	(change these with --enable-maxcpus, --enable-maxcudadev,
			
--- a/doc/chapters/advanced-api.texi
+++ b/doc/chapters/advanced-api.texi
@@ -61,9 +61,6 @@ Compare the data size of two interfaces.
 
				 @item @code{ void (*display)(starpu_data_handle_t handle, FILE *f)}
			
 
				 Dump the sizes of a handle to a file.
			
 
				 
			
 
				-@item @code{ int (*convert_to_gordon)(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss)}
			
 
				-Convert the data size to the spu size format. If no SPUs are used, this field can be seto NULL.
			
 
				-
			
 
				 @item @code{enum starpu_data_interface_id interfaceid}
			
 
				 An identifier that is unique to each interface.
			
 
				 
			
@@ -534,9 +531,6 @@ Actual number of CUDA workers used by StarPU.
 
				 @item @code{unsigned nopenclgpus}
			
 
				 Actual number of OpenCL workers used by StarPU.
			
 
				 
			
 
				-@item @code{unsigned ngordon_spus}
			
 
				-Actual number of Gordon workers used by StarPU.
			
 
				-
			
 
				 @item @code{unsigned workers_bindid[STARPU_NMAXWORKERS]}
			
 
				 Indicates the successive cpu identifier that should be used to bind the
			
 
				 workers. It is either filled according to the user's explicit
			
@@ -699,7 +693,7 @@ assign tasks to the different workers.
 
				 Remove all available tasks from the scheduler (tasks are chained by the means
			
 
				 of the prev and next fields of the starpu_task structure). The mutex associated
			
 
				 to the worker is already taken when this method is called. This is currently
			
 
				-only used by the Gordon driver.
			
 
				+not used.
			
 
				 
			
 
				 @item @code{void (*pre_exec_hook)(struct starpu_task *)} (optional)
			
 
				 This method is called every time a task is starting.
			
--- a/doc/chapters/basic-api.texi
+++ b/doc/chapters/basic-api.texi
@@ -20,7 +20,6 @@
 
				 * Profiling API::
			
 
				 * CUDA extensions::
			
 
				 * OpenCL extensions::
			
 
				-* Cell extensions::
			
 
				 * Miscellaneous helpers::
			
 
				 @end menu
			
 
				 
			
@@ -94,10 +93,6 @@ be specified with the @code{STARPU_NCUDA} environment variable.
 
				 This is the number of OpenCL devices that StarPU can use. This can
			
 
				 also be specified with the @code{STARPU_NOPENCL} environment variable.
			
 
				 
			
 
				-@item @code{int nspus} (default = -1)
			
 
				-This is the number of Cell SPUs that StarPU can use. This can also be
			
 
				-specified with the @code{STARPU_NGORDON} environment variable.
			
 
				-
			
 
				 @item @code{unsigned use_explicit_workers_bindid} (default = 0)
			
 
				 If this flag is set, the @code{workers_bindid} array indicates where the
			
 
				 different workers are bound, otherwise StarPU automatically selects where to
			
@@ -243,7 +238,6 @@ The different values are:
 
				 @item @code{STARPU_CPU_WORKER}
			
 
				 @item @code{STARPU_CUDA_WORKER}
			
 
				 @item @code{STARPU_OPENCL_WORKER}
			
 
				-@item @code{STARPU_GORDON_WORKER}
			
 
				 @end table
			
 
				 @end deftp
			
 
				 
			
@@ -273,10 +267,6 @@ This function returns the number of OpenCL devices controlled by StarPU. The ret
 
				 value should be at most @code{STARPU_MAXOPENCLDEVS}.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun unsigned starpu_spu_worker_get_count (void)
			
 
				-This function returns the number of Cell SPUs controlled by StarPU.
			
 
				-@end deftypefun
			
 
				-
			
 
				 @deftypefun int starpu_worker_get_id (void)
			
 
				 This function returns the identifier of the current worker, i.e the one associated to the calling
			
 
				 thread. The returned value is either -1 if the current context is not a StarPU
			
@@ -311,9 +301,8 @@ This function returns the type of processing unit associated to a
 
				 worker. The worker identifier is a value returned by the
			
 
				 @code{starpu_worker_get_id} function). The returned value
			
 
				 indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
			
 
				-core, @code{STARPU_CUDA_WORKER} for a CUDA device,
			
 
				-@code{STARPU_OPENCL_WORKER} for a OpenCL device, and
			
 
				-@code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
			
 
				+core, @code{STARPU_CUDA_WORKER} for a CUDA device, and
			
 
				+@code{STARPU_OPENCL_WORKER} for an OpenCL device. The value returned for an invalid
			
 
				 identifier is unspecified.
			
 
				 @end deftypefun
			
 
				 
			
@@ -339,7 +328,6 @@ todo
 
				 @item @code{STARPU_CPU_RAM}
			
 
				 @item @code{STARPU_CUDA_RAM}
			
 
				 @item @code{STARPU_OPENCL_RAM}
			
 
				-@item @code{STARPU_SPU_LS}
			
 
				 @end table
			
 
				 @end deftp
			
 
				 
			
@@ -1150,7 +1138,7 @@ Return the size of the elements registered into the matrix designated by @var{in
 
				 Return a pointer to the column array of the matrix designated by
			
 
				 @var{interface}.
			
 
				 @end defmac
			
 
				-@defmac STARPU_COO_GET_COLUMNS_DEV_HANDLE({void *}@var{interface})
			
 
				+@defmac STARPU_COO_GET_COLUMNS_DEV_HANDLE ({void *}@var{interface})
			
 
				 Return a device handle for the column array of the matrix designated by
			
 
				 @var{interface}, to be used on OpenCL. The offset documented below has to be
			
 
				 used in addition to this.
			
@@ -1158,7 +1146,7 @@ used in addition to this.
 
				 @defmac STARPU_COO_GET_ROWS (interface)
			
 
				 Return a pointer to the rows array of the matrix designated by @var{interface}.
			
 
				 @end defmac
			
 
				-@defmac STARPU_COO_GET_ROWS_DEV_HANDLE({void *}@var{interface})
			
 
				+@defmac STARPU_COO_GET_ROWS_DEV_HANDLE ({void *}@var{interface})
			
 
				 Return a device handle for the row array of the matrix designated by
			
 
				 @var{interface}, to be used on OpenCL. The offset documented below has to be
			
 
				 used in addition to this.
			
@@ -1167,12 +1155,12 @@ used in addition to this.
 
				 Return a pointer to the values array of the matrix designated by
			
 
				 @var{interface}.
			
 
				 @end defmac
			
 
				-@defmac STARPU_COO_GET_VALUES_DEV_HANDLE({void *}@var{interface})
			
 
				+@defmac STARPU_COO_GET_VALUES_DEV_HANDLE ({void *}@var{interface})
			
 
				 Return a device handle for the value array of the matrix designated by
			
 
				 @var{interface}, to be used on OpenCL. The offset documented below has to be
			
 
				 used in addition to this.
			
 
				 @end defmac
			
 
				-@defmac STARPU_COO_GET_OFFSET({void *}@var{itnerface})
			
 
				+@defmac STARPU_COO_GET_OFFSET ({void *}@var{interface})
			
 
				 Return the offset in the arrays of the COO matrix designated by @var{interface}.
			
 
				 @end defmac
			
 
				 @defmac STARPU_COO_GET_NX (interface)
			
@@ -1493,18 +1481,6 @@ starpu_codelet} to specify the codelet may be executed on a CUDA
 
				 processing unit.
			
 
				 @end defmac
			
 
				 
			
 
				-@defmac STARPU_SPU
			
 
				-This macro is used when setting the field @code{where} of a @code{struct
			
 
				-starpu_codelet} to specify the codelet may be executed on a SPU
			
 
				-processing unit.
			
 
				-@end defmac
			
 
				-
			
 
				-@defmac STARPU_GORDON
			
 
				-This macro is used when setting the field @code{where} of a @code{struct
			
 
				-starpu_codelet} to specify the codelet may be executed on a Cell
			
 
				-processing unit.
			
 
				-@end defmac
			
 
				-
			
 
				 @defmac STARPU_OPENCL
			
 
				 This macro is used when setting the field @code{where} of a @code{struct
			
 
				 starpu_codelet} to specify the codelet may be executed on a OpenCL
			
@@ -1542,12 +1518,12 @@ e.g. static storage case.
 
				 @item @code{uint32_t where} (optional)
			
 
				 Indicates which types of processing units are able to execute the
			
 
				 codelet. The different values
			
 
				-@code{STARPU_CPU}, @code{STARPU_CUDA}, @code{STARPU_SPU},
			
 
				-@code{STARPU_GORDON}, @code{STARPU_OPENCL} can be combined to specify
			
 
				+@code{STARPU_CPU}, @code{STARPU_CUDA}, 
			
 
				+@code{STARPU_OPENCL} can be combined to specify
			
 
				 on which types of processing units the codelet can be executed.
			
 
				 @code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
			
 
				-implemented for both CPU cores and CUDA devices while @code{STARPU_GORDON}
			
 
				-indicates that it is only available on Cell SPUs. If the field is
			
 
				+implemented for both CPU cores and CUDA devices while @code{STARPU_OPENCL}
			
 
				+indicates that it is only available on OpenCL devices. If the field is
			
 
				 unset, its value will be automatically set based on the availability
			
 
				 of the @code{XXX_funcs} fields defined below.
			
 
				 
			
@@ -1607,17 +1583,6 @@ If the @code{where} field is set, then the @code{opencl_funcs} field
 
				 is ignored if @code{STARPU_OPENCL} does not appear in the @code{where}
			
 
				 field, it must be non-null otherwise.
			
 
				 
			
 
				-@item @code{uint8_t gordon_func} (optional)
			
 
				-This field has been made deprecated. One should use instead the
			
 
				-@code{gordon_funcs} field.
			
 
				-
			
 
				-@item @code{uint8_t gordon_funcs[STARPU_MAXIMPLEMENTATIONS]} (optional)
			
 
				-Is an array of index of the Cell SPU implementations of the codelet within the
			
 
				-Gordon library.
			
 
				-It must be terminated by a NULL value.
			
 
				-See Gordon documentation for more details on how to register a kernel and
			
 
				-retrieve its index.
			
 
				-
			
 
				 @item @code{unsigned nbuffers}
			
 
				 Specifies the number of arguments taken by the codelet. These arguments are
			
 
				 managed by the DSM and are accessed from the @code{void *buffers[]}
			
@@ -1726,17 +1691,17 @@ by the DSM.
 
				 @item @code{void *cl_arg} (optional; default: @code{NULL})
			
 
				 This pointer is passed to the codelet through the second argument
			
 
				 of the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).
			
 
				-In the specific case of the Cell processor, see the @code{cl_arg_size}
			
 
				-argument.
			
 
				-
			
 
				-@item @code{size_t cl_arg_size} (optional, Cell-specific)
			
 
				-In the case of the Cell processor, the @code{cl_arg} pointer is not directly
			
 
				-given to the SPU function. A buffer of size @code{cl_arg_size} is allocated on
			
 
				-the SPU. This buffer is then filled with the @code{cl_arg_size} bytes starting
			
 
				-at address @code{cl_arg}. In this case, the argument given to the SPU codelet
			
 
				-is therefore not the @code{cl_arg} pointer, but the address of the buffer in
			
 
				-local store (LS) instead. This field is ignored for CPU, CUDA and OpenCL
			
 
				-codelets, where the @code{cl_arg} pointer is given as such.
			
 
				+
			
 
				+@item @code{size_t cl_arg_size} (optional)
			
 
				+For some specific drivers, the @code{cl_arg} pointer cannot be directly
			
 
				+given to the driver function. A buffer of size @code{cl_arg_size}
			
 
				+needs to be allocated on the driver. This buffer is then filled with
			
 
				+the @code{cl_arg_size} bytes starting at address @code{cl_arg}. In
			
 
				+this case, the argument given to the codelet is therefore not the
			
 
				+@code{cl_arg} pointer, but the address of the buffer in local store
			
 
				+(LS) instead.
			
 
				+This field is ignored for CPU, CUDA and OpenCL codelets, where the
			
 
				+@code{cl_arg} pointer is given as such.
			
 
				 
			
 
				 @item @code{void (*callback_func)(void *)} (optional) (default: @code{NULL})
			
 
				 This is a function pointer of prototype @code{void (*f)(void *)} which
			
@@ -2089,7 +2054,6 @@ OpenCL types range within STARPU_OPENCL_DEFAULT (GPU number 0), STARPU_OPENCL_DE
 
				 @item @code{STARPU_CPU_DEFAULT}
			
 
				 @item @code{STARPU_CUDA_DEFAULT}
			
 
				 @item @code{STARPU_OPENCL_DEFAULT}
			
 
				-@item @code{STARPU_GORDON_DEFAULT}
			
 
				 @end table
			
 
				 @end deftp
			
 
				 
			
@@ -2742,11 +2706,6 @@ otherwise. The integer pointed to by @var{ret} is set to -EAGAIN if the asynchro
 
				 was successful, or to 0 if event was NULL.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@node Cell extensions
			
 
				-@section Cell extensions
			
 
				-
			
 
				-nothing yet.
			
 
				-
			
 
				 @node Miscellaneous helpers
			
 
				 @section Miscellaneous helpers
			
 
				 
			
--- a/doc/chapters/basic-examples.texi
+++ b/doc/chapters/basic-examples.texi
@@ -161,7 +161,7 @@ struct starpu_codelet cl =
 
				 
			
 
				 A codelet is a structure that represents a computational kernel. Such a codelet
			
 
				 may contain an implementation of the same kernel on different architectures
			
 
				-(e.g. CUDA, Cell's SPU, x86, ...). For compatibility, make sure that the whole
			
 
				+(e.g. CUDA, x86, ...). For compatibility, make sure that the whole
			
 
				 structure is initialized to zero, either by using memset, or by letting the
			
 
				 compiler implicitly do it as examplified above.
			
 
				 
			
--- a/doc/chapters/configuration.texi
+++ b/doc/chapters/configuration.texi
@@ -121,13 +121,6 @@ Search for an OpenCL library under @var{dir}, which should notably
 
				 contain the OpenCL shared libraries---e.g. @file{libOpenCL.so}. This defaults to
			
 
				 @code{/lib} appended to the value given to @code{--with-opencl-dir}.
			
 
				 
			
 
				-@item --enable-gordon
			
 
				-Enable the use of the Gordon runtime for Cell SPUs.
			
 
				-@c TODO: rather default to enabled when detected
			
 
				-
			
 
				-@item --with-gordon-dir=@var{prefix}
			
 
				-Search for the Gordon SDK under @var{prefix}.
			
 
				-
			
 
				 @item --enable-maximplementations=@var{count}
			
 
				 Allow for at most @var{count} codelet implementations for the same
			
 
				 target device.  This information is then available as the
			
@@ -292,9 +285,6 @@ create as many CUDA workers as there are CUDA devices.
 
				 @item @code{STARPU_NOPENCL}
			
 
				 OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
			
 
				 
			
 
				-@item @code{STARPU_NGORDON}
			
 
				-Specify the number of SPUs that StarPU can use.
			
 
				-
			
 
				 @item @code{STARPU_WORKERS_NOBIND}
			
 
				 Setting it to non-zero will prevent StarPU from binding its threads to
			
 
				 CPUs. This is for instance useful when running the testsuite in parallel.
			
@@ -309,7 +299,7 @@ determined by the OS, or provided by the @code{hwloc} library in case it is
 
				 available.
			
 
				 
			
 
				 Note that the first workers correspond to the CUDA workers, then come the
			
 
				-OpenCL and the SPU, and finally the CPU workers. For example if
			
 
				+OpenCL workers, and finally the CPU workers. For example if
			
 
				 we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPU=2}
			
 
				 and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
			
 
				 by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
			
--- a/doc/chapters/introduction.texi
+++ b/doc/chapters/introduction.texi
@@ -83,7 +83,7 @@ The remainder of this section describes the main concepts used in StarPU.
 
				 @cindex codelet
			
 
				 One of the StarPU primary data structures is the @b{codelet}. A codelet describes a
			
 
				 computational kernel that can possibly be implemented on multiple architectures
			
 
				-such as a CPU, a CUDA device or a Cell's SPU.
			
 
				+such as a CPU, a CUDA device or an OpenCL device.
			
 
				 
			
 
				 @c TODO insert illustration f: f_spu, f_cpu, ...
			
 
				 
			
@@ -150,7 +150,7 @@ A @b{worker} execute tasks. There is typically one per CPU computation core and
 
				 one per accelerator (for which a whole CPU core is dedicated).
			
 
				 
			
 
				 A @b{driver} drives a given kind of workers. There are currently CPU, CUDA,
			
 
				-OpenCL and Gordon drivers. They usually start several workers to actually drive
			
 
				+and OpenCL drivers. They usually start several workers to actually drive
			
 
				 them.
			
 
				 
			
 
				 A @b{performance model} is a (dynamic or static) model of the performance of a
			
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -75,27 +75,6 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 
				 
			
 
				 endif
			
 
				 
			
 
				-if STARPU_USE_GORDON
			
 
				-
			
 
				-SPU_CC ?= spu-gcc
			
 
				-SPU_LD ?= spu-ld
			
 
				-
			
 
				-SPULDFLAGS =
			
 
				-SPULIBS = -lblas #-lc -lgloss -lc
			
 
				-
			
 
				-.c.spuo:
			
 
				-	$(MKDIR_P) `dirname $@`
			
 
				-	$(SPU_CC) -c -fpic $< -o $@
			
 
				-
			
 
				-.spuo.spuelf:
			
 
				-	$(MKDIR_P) `dirname $@`
			
 
				-	$(SPU_LD) $(SPULDFLAGS) $< -o $@ $(SPULIBS)
			
 
				-
			
 
				-BUILT_SOURCES +=				\
			
 
				-	gordon/null_kernel_gordon.spuelf
			
 
				-
			
 
				-endif
			
 
				-
			
 
				 if STARPU_HAVE_ICC
			
 
				 .icc.o:
			
 
				 	$(ICC) -x c $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
			
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -227,10 +227,6 @@ endif
 
				 
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas.c
			
 
				 
			
 
				-if STARPU_USE_GORDON
			
 
				-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/gordon/driver_gordon.c
			
 
				-endif
			
 
				-
			
 
				 if STARPU_USE_OPENCL
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl.c
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl_utils.c
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -94,6 +94,8 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
				 	size_t size = SIZE;
			
 
				 
			
 
				 	/* Initialize CUDA context on the device */
			
 
				+	/* We do not need to enable OpenGL interoperability at this point,
			
 
				+	 * since we cleanly shutdown CUDA before returning. */
			
 
				 	cudaSetDevice(dev);
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
@@ -189,6 +191,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				         if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
			
 
				 
			
 
				 	/* Initialize CUDA context on the source */
			
 
				+	/* We do not need to enable OpenGL interoperability at this point,
			
 
				+	 * since we cleanly shutdown CUDA before returning. */
			
 
				 	cudaSetDevice(src);
			
 
				 
			
 
				 	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) {
			
@@ -207,6 +211,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				 	cudaMemset(s_buffer, 0, size);
			
 
				 
			
 
				 	/* Initialize CUDA context on the destination */
			
 
				+	/* We do not need to enable OpenGL interoperability at this point,
			
 
				+	 * since we cleanly shutdown CUDA before returning. */
			
 
				 	cudaSetDevice(dst);
			
 
				 
			
 
				 	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) {
			
--- a/src/datawizard/interfaces/coo_interface.c
+++ b/src/datawizard/interfaces/coo_interface.c
@@ -133,6 +133,27 @@ copy_cuda_to_ram_async(void *src_interface, unsigned src_node,
 
				 				    dst_interface, dst_node,
			
 
				 				    stream, cudaMemcpyDeviceToHost);
			
 
				 }
			
 
				+
			
 
				+static int
			
 
				+copy_cuda_to_cuda(void *src_interface, unsigned src_node,
			
 
				+		  void *dst_interface, unsigned dst_node)
			
 
				+{
			
 
				+	return copy_cuda_async_sync(src_interface, src_node,
			
 
				+				    dst_interface, dst_node,
			
 
				+				    NULL, cudaMemcpyDeviceToDevice);
			
 
				+}
			
 
				+
			
 
				+#ifdef NO_STRIDE
			
 
				+static int
			
 
				+copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
			
 
				+			void *dst_interface, unsigned dst_node,
			
 
				+			cudaStream_t stream)
			
 
				+{
			
 
				+	return copy_cuda_async_sync(src_interface, src_node,
			
 
				+				    dst_interface, dst_node,
			
 
				+				    stream, cudaMemcpyDeviceToDevice);
			
 
				+}
			
 
				+#endif /* !NO_STRIDE */
			
 
				 #endif /* !STARPU_USE_CUDA */
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
@@ -281,9 +302,9 @@ static struct starpu_data_copy_methods coo_copy_data_methods =
 
				 	.cuda_to_ram         = copy_cuda_to_ram,
			
 
				 	.ram_to_cuda_async   = copy_ram_to_cuda_async,
			
 
				 	.cuda_to_ram_async   = copy_cuda_to_ram_async,
			
 
				-	.cuda_to_cuda        = NULL, /* TODO */
			
 
				+	.cuda_to_cuda        = copy_cuda_to_cuda,
			
 
				 #ifdef NO_STRIDE
			
 
				-	.cuda_to_cuda_async  = NULL, /* TODO */
			
 
				+	.cuda_to_cuda_async  = copy_cuda_to_cuda_async,
			
 
				 #endif
			
 
				 #endif /* !STARPU_USE_CUDA */
			
 
				 #ifdef STARPU_USE_OPENCL
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -507,21 +507,64 @@ void starpu_cuda_report_error(const char *func, const char *file, int line, cuda
 
				 	STARPU_ABORT();
			
 
				 }
			
 
				 
			
 
				-int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				+int
			
 
				+starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
			
 
				+			    void *dst_ptr, unsigned dst_node,
			
 
				+			    size_t ssize, cudaStream_t stream,
			
 
				+			    enum cudaMemcpyKind kind)
			
 
				 {
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+	int peer_copy = 0;
			
 
				+	int src_dev = -1, dst_dev = -1;
			
 
				+#endif
			
 
				 	cudaError_t cures = 0;
			
 
				 
			
 
				+	if (kind == cudaMemcpyDeviceToDevice && src_node != dst_node)
			
 
				+	{
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+		peer_copy = 1;
			
 
				+		src_dev = _starpu_memory_node_to_devid(src_node);
			
 
				+		dst_dev = _starpu_memory_node_to_devid(dst_node);
			
 
				+#else
			
 
				+		STARPU_ABORT();
			
 
				+#endif
			
 
				+	}
			
 
				+
			
 
				 	if (stream)
			
 
				 	{
			
 
				-	     _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-	     cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
			
 
				-	     _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+		if (peer_copy)
			
 
				+		{
			
 
				+			cures = cudaMemcpyPeerAsync((char *) dst_ptr, dst_dev,
			
 
				+						    (char *) src_ptr, src_dev,
			
 
				+						    ssize, stream);
			
 
				+		}
			
 
				+		else
			
 
				+#endif
			
 
				+		{
			
 
				+			cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
			
 
				+		}
			
 
				+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				 	}
			
 
				+
			
 
				 	/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
			
 
				 	if (stream == NULL || cures)
			
 
				 	{
			
 
				 		/* do it in a synchronous fashion */
			
 
				-		cures = cudaMemcpy((char *)dst_ptr, (char *)src_ptr, ssize, kind);
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+		if (peer_copy)
			
 
				+		{
			
 
				+			cures = cudaMemcpyPeer((char *) dst_ptr, dst_dev,
			
 
				+					       (char *) src_ptr, src_dev,
			
 
				+					       ssize);
			
 
				+		}
			
 
				+		else
			
 
				+#endif
			
 
				+		{
			
 
				+			cures = cudaMemcpy((char *)dst_ptr, (char *)src_ptr, ssize, kind);
			
 
				+		}
			
 
				+		
			
 
				 
			
 
				 		if (STARPU_UNLIKELY(cures))
			
 
				 			STARPU_CUDA_REPORT_ERROR(cures);
			
--- a/src/sched_policies/fifo_queues.c
+++ b/src/sched_policies/fifo_queues.c
@@ -52,14 +52,68 @@ int _starpu_fifo_empty(struct _starpu_fifo_taskq *fifo)
 
				 	return fifo->ntasks == 0;
			
 
				 }
			
 
				 
			
 
				-/* TODO: revert front/back? */
			
 
				-int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
			
 
				+int
			
 
				+_starpu_fifo_push_sorted_task(struct _starpu_fifo_taskq *fifo_queue,
			
 
				+			      pthread_mutex_t *sched_mutex,
			
 
				+			      pthread_cond_t *sched_cond,
			
 
				+			      struct starpu_task *task)
			
 
				 {
			
 
				+	struct starpu_task_list *list = &fifo_queue->taskq;
			
 
				+
			
 
				 	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
			
 
				 
			
 
				 	_STARPU_TRACE_JOB_PUSH(task, 0);
			
 
				-	/* TODO: if prio, put at back */
			
 
				-	starpu_task_list_push_front(&fifo_queue->taskq, task);
			
 
				+
			
 
				+	if (list->head == NULL)
			
 
				+	{
			
 
				+		list->head = task;
			
 
				+		list->tail = task;
			
 
				+		task->prev = NULL;
			
 
				+		task->next = NULL;
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		struct starpu_task *current = list->head;
			
 
				+		struct starpu_task *prev = NULL;
			
 
				+
			
 
				+		while (current)
			
 
				+		{
			
 
				+			if (current->priority >= task->priority)
			
 
				+				break;
			
 
				+
			
 
				+			prev = current;
			
 
				+			current = current->next;
			
 
				+		}
			
 
				+
			
 
				+		if (prev == NULL)
			
 
				+		{
			
 
				+			/* Insert at the front of the list */
			
 
				+			list->head->prev = task;
			
 
				+			task->prev = NULL;
			
 
				+			task->next = list->head;
			
 
				+			list->head = task;
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			if (current)
			
 
				+			{
			
 
				+				/* Insert between prev and current */
			
 
				+				task->prev = prev;
			
 
				+				prev->next = task;
			
 
				+				task->next = current;
			
 
				+				current->prev = task;
			
 
				+			}
			
 
				+			else
			
 
				+			{
			
 
				+				/* Insert at the tail of the list */
			
 
				+				list->tail->next = task;
			
 
				+				task->next = NULL;
			
 
				+				task->prev = list->tail;
			
 
				+				list->tail = task;
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 	fifo_queue->ntasks++;
			
 
				 	fifo_queue->nprocessed++;
			
 
				 
			
@@ -69,6 +123,33 @@ int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+/* TODO: revert front/back? */
			
 
				+int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
			
 
				+{ /* Enqueue task on fifo_queue, choosing the insertion strategy from task->priority. */
			
 
				+	/* Returns 0 on both paths. */
			
 
				+	if (task->priority > 0) /* positive priority: delegate to the priority-sorted insertion */
			
 
				+	{
			
 
				+		_STARPU_TRACE_JOB_PUSH(task, 1); /* NOTE(review): the sorted-push helper also emits _STARPU_TRACE_JOB_PUSH(task, 0), so this push looks traced twice -- confirm */
			
 
				+		_starpu_fifo_push_sorted_task(fifo_queue, sched_mutex,
			
 
				+					      sched_cond, task); /* return value is not checked */
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		_STARPU_TRACE_JOB_PUSH(task, 0);
			
 
				+	/* Priority <= 0: the queue is modified while holding sched_mutex. */
			
 
				+		_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
			
 
				+		starpu_task_list_push_front(&fifo_queue->taskq, task);
			
 
				+	/* Both counters are incremented for every pushed task. */
			
 
				+		fifo_queue->ntasks++;
			
 
				+		fifo_queue->nprocessed++;
			
 
				+	/* Signal sched_cond before releasing the lock. */
			
 
				+		_STARPU_PTHREAD_COND_SIGNAL(sched_cond);
			
 
				+		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
			
 
				+	}
			
 
				+	/* Success is reported unconditionally. */
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo_queue, int workerid)
			
 
				 {
			
 
				 	struct starpu_task *task;
			
--- a/src/sched_policies/fifo_queues.h
+++ b/src/sched_policies/fifo_queues.h
@@ -44,6 +44,11 @@ void _starpu_destroy_fifo(struct _starpu_fifo_taskq *fifo);
 
				 
			
 
				 int _starpu_fifo_empty(struct _starpu_fifo_taskq *fifo);
			
 
				 
			
 
				+int _starpu_fifo_push_sorted_task(struct _starpu_fifo_taskq *fifo_queue, /* queue whose taskq list receives the task */
			
 
				+				  pthread_mutex_t *sched_mutex, /* locked by the function before the list is touched */
			
 
				+				  pthread_cond_t *sched_cond, /* NOTE(review): presumably signalled after insertion -- that part of the definition is not visible here, confirm */
			
 
				+				  struct starpu_task *task); /* inserted before the first queued task whose priority is >= its own, or at the tail if there is none */
			
 
				+
			
 
				 int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task);
			
 
				 
			
 
				 struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo, int workerid);
			
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -77,24 +77,6 @@ NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_srcdir)/src -I$(top_builddir)/src
 
				 
			
 
				 endif
			
 
				 
			
 
				-if STARPU_USE_GORDON
			
 
				-
			
 
				-SPU_CC ?= spu-gcc
			
 
				-SPU_LD ?= spu-ld
			
 
				-
			
 
				-.c.spuo:
			
 
				-	$(MKDIR_P) `dirname $@`
			
 
				-	$(SPU_CC) -c -fpic $< -o $@
			
 
				-
			
 
				-.spuo.spuelf:
			
 
				-	$(MKDIR_P) `dirname $@`
			
 
				-	$(SPU_LD) $< -o $@
			
 
				-
			
 
				-#BUILT_SOURCES +=
			
 
				-#	microbenchs/null_kernel_gordon.spuelf
			
 
				-
			
 
				-endif
			
 
				-
			
 
				 testbindir = $(libdir)/starpu/tests
			
 
				 
			
 
				 #####################################
			
@@ -120,10 +102,7 @@ if STARPU_COVERAGE_ENABLED
 
				 TESTS	+=	coverage/coverage.sh
			
 
				 endif
			
 
				 
			
 
				-starpu_machine_display_SOURCES	=	../tools/starpu_machine_display.c
			
 
				-
			
 
				 noinst_PROGRAMS =				\
			
 
				-	starpu_machine_display			\
			
 
				 	main/deprecated_func			\
			
 
				 	main/deprecated_buffer			\
			
 
				 	main/driver_api/init_run_deinit         \
			
@@ -385,16 +364,6 @@ datawizard_wt_host_SOURCES =			\
 
				 datawizard_wt_broadcast_SOURCES =		\
			
 
				 	datawizard/wt_broadcast.c
			
 
				 
			
 
				-if STARPU_USE_GORDON
			
 
				-datawizard_sync_and_notify_data_SOURCES +=	\
			
 
				-	datawizard/sync_and_notify_data_gordon_kernels.c
			
 
				-datawizard_sync_and_notify_data_implicit_SOURCES +=	\
			
 
				-	datawizard/sync_and_notify_data_gordon_kernels.c
			
 
				-BUILT_SOURCES += 						\
			
 
				-	datawizard/sync_and_notify_data_gordon_kernels.spuelf	\
			
 
				-	microbenchs/null_kernel_gordon.spuelf
			
 
				-endif
			
 
				-
			
 
				 ###################
			
 
				 # Block interface #
			
 
				 ###################