Andra Hugo 13 gadi atpakaļ
vecāks
revīzija
41f3054013

+ 3 - 0
ChangeLog

@@ -59,6 +59,9 @@ Changes:
       of the libOpenCL.so file of the OCL ICD implementation.
   * Fix main memory leak on multiple unregister/re-register.
   * Improve hwloc detection by configure
+  * Cell:
+    - It is no longer possible to enable the cell support via the
+      gordon driver
 
 Small changes:
   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is

+ 12 - 66
configure.ac

@@ -775,72 +775,26 @@ if test x$disable_asynchronous_opencl_copy = xyes ; then
    AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and OpenCL devices])
 fi
 
-
 ###############################################################################
 #                                                                             #
-#                                 Cell settings                               #
+#                                 Cell                                        #
 #                                                                             #
 ###############################################################################
 
-#TODO fix the default dir
-AC_ARG_ENABLE(gordon, [AS_HELP_STRING([--enable-gordon],
-			[use Cell SPUs])],, enable_gordon=maybe)
-
-if test x$enable_gordon = xyes -o x$enable_gordon = xmaybe; then
-
-	AC_ARG_WITH(gordon-dir, [AS_HELP_STRING([--with-gordon-dir=<path>],
-			[specify Gordon installation directory (default is /usr/local/)])],
-			[
-				gordon_dir="$withval"
-				enable_gordon=yes
-			], gordon_dir=/usr/local/)
-
-	# do we have a valid Gordon setup ?
-	have_valid_gordon=yes
-
-	# can we use dynamic code loading facilities ?
-	AC_CHECK_LIB(elf, elf_memory,, [have_valid_gordon=no])
-
-	AC_CHECK_LIB(spe2, spe_context_create,,[have_valid_gordon=no])
-	AC_CHECK_FUNC(spe_in_mbox_write, [], [have_valid_gordon=no])
-
-	PKG_PROG_PKG_CONFIG
-	if test -d "$gordon_dir"; then
-		PKG_CONFIG_PATH="${PKG_CONFIG_PATH}:$gordon_dir"
-	fi
-	AC_SUBST(PKG_CONFIG_PATH)
-	PKG_CHECK_MODULES([GORDON], [libgordon], [], have_valid_gordon=no)
-
-	CPPFLAGS="${CPPFLAGS} ${GORDON_CFLAGS}"
-	LIBS="${LIBS} ${GORDON_LIBS}"
-
-	# AC_CHECK_FUNC(gordon_init, [gordon], [have_valid_gordon=no])
-
-	# in case Gordon was explicitely required, but is not available, this is an error
-	if test x$enable_gordon = xyes -a x$have_valid_gordon = xno; then
-		AC_MSG_ERROR([cannot find Gordon])
-	fi
-
+# warning: Cell driver has been removed from configure, but as the
+# source code is still available, we need to define the minimum
+# requirements to compile
+AC_DEFINE_UNQUOTED(STARPU_MAXGORDONDEVS, [1], [maximum number of GORDON devices])
 
-	# now we enable Gordon if and only if a proper setup is available
-	enable_gordon=$have_valid_gordon
-	AC_DEFINE(STARPU_MAXGORDONDEVS, [1], [maximum number of GORDON devices])
-fi
-
-AC_MSG_CHECKING(whether GORDON should be used)
-AC_MSG_RESULT($enable_gordon)
-AC_SUBST(STARPU_USE_GORDON, $enable_gordon)
-AM_CONDITIONAL(STARPU_USE_GORDON, test x$enable_gordon = xyes)
-
-if test x$enable_gordon = xyes; then
-	AC_DEFINE(STARPU_USE_GORDON, [1], [Cell support is enabled])
-	GORDON_REQUIRES=gordon
-fi
-AC_SUBST(GORDON_REQUIRES)
+###############################################################################
+#                                                                             #
+#                                 Drivers                                     #
+#                                                                             #
+###############################################################################
 
 AC_MSG_CHECKING(whether blocking drivers should be disabled)
 AC_ARG_ENABLE(blocking-drivers, [AS_HELP_STRING([--disable-blocking-drivers], [disable blocking drivers])],
-				enable_blocking=$enableval, enable_blocking=$enable_gordon)
+				enable_blocking=$enableval, enable_blocking=no)
 AC_MSG_RESULT($enable_blocking)
 
 if test x$enable_blocking = xno; then
@@ -1015,8 +969,7 @@ AC_MSG_RESULT($nmaxbuffers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 		[how many buffers can be manipulated per task])
 
-# We have one memory node shared by all CPU workers, one node per GPU, and
-# currently the Cell driver is using the same memory node as the CPU.
+# We have one memory node shared by all CPU workers, one node per GPU
 maxnodes=1
 if test x$enable_cuda = xyes ; then
 	# we could have used nmaxcudadev + 1, but this would certainly give an
@@ -1406,12 +1359,6 @@ AC_ARG_ENABLE([socl],
 
 AC_MSG_CHECKING(for SOCL)
 
-if test "x$enable_socl" = "xyes" -o "x$enable_socl" = "xmaybe" ; then
-    if test "$have_valid_opencl" = "no" ; then
-	STARPU_LOOK_FOR_OPENCL()
-    fi
-fi
-
 # in case SOCL was explicitely required, but is not available, this is an error
 if test "x$enable_socl" = "xyes" -a "$have_valid_opencl" = "no" ; then
     AC_MSG_ERROR([SOCL cannot be enabled without OpenCL])
@@ -1865,7 +1812,6 @@ AC_MSG_NOTICE([
 	CPUs   enabled: $enable_cpu
 	CUDA   enabled: $enable_cuda
 	OpenCL enabled: $enable_opencl
-	Cell   enabled: $enable_gordon
 
 	Compile-time limits
 	(change these with --enable-maxcpus, --enable-maxcudadev,

+ 1 - 7
doc/chapters/advanced-api.texi

@@ -61,9 +61,6 @@ Compare the data size of two interfaces.
 @item @code{ void (*display)(starpu_data_handle_t handle, FILE *f)}
 Dump the sizes of a handle to a file.
 
-@item @code{ int (*convert_to_gordon)(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss)}
-Convert the data size to the spu size format. If no SPUs are used, this field can be seto NULL.
-
 @item @code{enum starpu_data_interface_id interfaceid}
 An identifier that is unique to each interface.
 
@@ -534,9 +531,6 @@ Actual number of CUDA workers used by StarPU.
 @item @code{unsigned nopenclgpus}
 Actual number of OpenCL workers used by StarPU.
 
-@item @code{unsigned ngordon_spus}
-Actual number of Gordon workers used by StarPU.
-
 @item @code{unsigned workers_bindid[STARPU_NMAXWORKERS]}
 Indicates the successive cpu identifier that should be used to bind the
 workers. It is either filled according to the user's explicit
@@ -699,7 +693,7 @@ assign tasks to the different workers.
 Remove all available tasks from the scheduler (tasks are chained by the means
 of the prev and next fields of the starpu_task structure). The mutex associated
 to the worker is already taken when this method is called. This is currently
-only used by the Gordon driver.
+not used.
 
 @item @code{void (*pre_exec_hook)(struct starpu_task *)} (optional)
 This method is called every time a task is starting.

+ 21 - 62
doc/chapters/basic-api.texi

@@ -20,7 +20,6 @@
 * Profiling API::
 * CUDA extensions::
 * OpenCL extensions::
-* Cell extensions::
 * Miscellaneous helpers::
 @end menu
 
@@ -94,10 +93,6 @@ be specified with the @code{STARPU_NCUDA} environment variable.
 This is the number of OpenCL devices that StarPU can use. This can
 also be specified with the @code{STARPU_NOPENCL} environment variable.
 
-@item @code{int nspus} (default = -1)
-This is the number of Cell SPUs that StarPU can use. This can also be
-specified with the @code{STARPU_NGORDON} environment variable.
-
 @item @code{unsigned use_explicit_workers_bindid} (default = 0)
 If this flag is set, the @code{workers_bindid} array indicates where the
 different workers are bound, otherwise StarPU automatically selects where to
@@ -243,7 +238,6 @@ The different values are:
 @item @code{STARPU_CPU_WORKER}
 @item @code{STARPU_CUDA_WORKER}
 @item @code{STARPU_OPENCL_WORKER}
-@item @code{STARPU_GORDON_WORKER}
 @end table
 @end deftp
 
@@ -273,10 +267,6 @@ This function returns the number of OpenCL devices controlled by StarPU. The ret
 value should be at most @code{STARPU_MAXOPENCLDEVS}.
 @end deftypefun
 
-@deftypefun unsigned starpu_spu_worker_get_count (void)
-This function returns the number of Cell SPUs controlled by StarPU.
-@end deftypefun
-
 @deftypefun int starpu_worker_get_id (void)
 This function returns the identifier of the current worker, i.e the one associated to the calling
 thread. The returned value is either -1 if the current context is not a StarPU
@@ -311,9 +301,8 @@ This function returns the type of processing unit associated to a
 worker. The worker identifier is a value returned by the
 @code{starpu_worker_get_id} function). The returned value
 indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
-core, @code{STARPU_CUDA_WORKER} for a CUDA device,
-@code{STARPU_OPENCL_WORKER} for a OpenCL device, and
-@code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
+core, @code{STARPU_CUDA_WORKER} for a CUDA device, and
+@code{STARPU_OPENCL_WORKER} for a OpenCL device. The value returned for an invalid
 identifier is unspecified.
 @end deftypefun
 
@@ -339,7 +328,6 @@ todo
 @item @code{STARPU_CPU_RAM}
 @item @code{STARPU_CUDA_RAM}
 @item @code{STARPU_OPENCL_RAM}
-@item @code{STARPU_SPU_LS}
 @end table
 @end deftp
 
@@ -1150,7 +1138,7 @@ Return the size of the elements registered into the matrix designated by @var{in
 Return a pointer to the column array of the matrix designated by
 @var{interface}.
 @end defmac
-@defmac STARPU_COO_GET_COLUMNS_DEV_HANDLE({void *}@var{interface})
+@defmac STARPU_COO_GET_COLUMNS_DEV_HANDLE ({void *}@var{interface})
 Return a device handle for the column array of the matrix designated by
 @var{interface}, to be used on OpenCL. The offset documented below has to be
 used in addition to this.
@@ -1158,7 +1146,7 @@ used in addition to this.
 @defmac STARPU_COO_GET_ROWS (interface)
 Return a pointer to the rows array of the matrix designated by @var{interface}.
 @end defmac
-@defmac STARPU_COO_GET_ROWS_DEV_HANDLE({void *}@var{interface})
+@defmac STARPU_COO_GET_ROWS_DEV_HANDLE ({void *}@var{interface})
 Return a device handle for the row array of the matrix designated by
 @var{interface}, to be used on OpenCL. The offset documented below has to be
 used in addition to this.
@@ -1167,12 +1155,12 @@ used in addition to this.
 Return a pointer to the values array of the matrix designated by
 @var{interface}.
 @end defmac
-@defmac STARPU_COO_GET_VALUES_DEV_HANDLE({void *}@var{interface})
+@defmac STARPU_COO_GET_VALUES_DEV_HANDLE ({void *}@var{interface})
 Return a device handle for the value array of the matrix designated by
 @var{interface}, to be used on OpenCL. The offset documented below has to be
 used in addition to this.
 @end defmac
-@defmac STARPU_COO_GET_OFFSET({void *}@var{itnerface})
+@defmac STARPU_COO_GET_OFFSET ({void *}@var{itnerface})
 Return the offset in the arrays of the COO matrix designated by @var{interface}.
 @end defmac
 @defmac STARPU_COO_GET_NX (interface)
@@ -1493,18 +1481,6 @@ starpu_codelet} to specify the codelet may be executed on a CUDA
 processing unit.
 @end defmac
 
-@defmac STARPU_SPU
-This macro is used when setting the field @code{where} of a @code{struct
-starpu_codelet} to specify the codelet may be executed on a SPU
-processing unit.
-@end defmac
-
-@defmac STARPU_GORDON
-This macro is used when setting the field @code{where} of a @code{struct
-starpu_codelet} to specify the codelet may be executed on a Cell
-processing unit.
-@end defmac
-
 @defmac STARPU_OPENCL
 This macro is used when setting the field @code{where} of a @code{struct
 starpu_codelet} to specify the codelet may be executed on a OpenCL
@@ -1542,12 +1518,12 @@ e.g. static storage case.
 @item @code{uint32_t where} (optional)
 Indicates which types of processing units are able to execute the
 codelet. The different values
-@code{STARPU_CPU}, @code{STARPU_CUDA}, @code{STARPU_SPU},
-@code{STARPU_GORDON}, @code{STARPU_OPENCL} can be combined to specify
+@code{STARPU_CPU}, @code{STARPU_CUDA}, 
+@code{STARPU_OPENCL} can be combined to specify
 on which types of processing units the codelet can be executed.
 @code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
-implemented for both CPU cores and CUDA devices while @code{STARPU_GORDON}
-indicates that it is only available on Cell SPUs. If the field is
+implemented for both CPU cores and CUDA devices while @code{STARPU_OPENCL}
+indicates that it is only available on OpenCL devices. If the field is
 unset, its value will be automatically set based on the availability
 of the @code{XXX_funcs} fields defined below.
 
@@ -1607,17 +1583,6 @@ If the @code{where} field is set, then the @code{opencl_funcs} field
 is ignored if @code{STARPU_OPENCL} does not appear in the @code{where}
 field, it must be non-null otherwise.
 
-@item @code{uint8_t gordon_func} (optional)
-This field has been made deprecated. One should use instead the
-@code{gordon_funcs} field.
-
-@item @code{uint8_t gordon_funcs[STARPU_MAXIMPLEMENTATIONS]} (optional)
-Is an array of index of the Cell SPU implementations of the codelet within the
-Gordon library.
-It must be terminated by a NULL value.
-See Gordon documentation for more details on how to register a kernel and
-retrieve its index.
-
 @item @code{unsigned nbuffers}
 Specifies the number of arguments taken by the codelet. These arguments are
 managed by the DSM and are accessed from the @code{void *buffers[]}
@@ -1726,17 +1691,17 @@ by the DSM.
 @item @code{void *cl_arg} (optional; default: @code{NULL})
 This pointer is passed to the codelet through the second argument
 of the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).
-In the specific case of the Cell processor, see the @code{cl_arg_size}
-argument.
-
-@item @code{size_t cl_arg_size} (optional, Cell-specific)
-In the case of the Cell processor, the @code{cl_arg} pointer is not directly
-given to the SPU function. A buffer of size @code{cl_arg_size} is allocated on
-the SPU. This buffer is then filled with the @code{cl_arg_size} bytes starting
-at address @code{cl_arg}. In this case, the argument given to the SPU codelet
-is therefore not the @code{cl_arg} pointer, but the address of the buffer in
-local store (LS) instead. This field is ignored for CPU, CUDA and OpenCL
-codelets, where the @code{cl_arg} pointer is given as such.
+
+@item @code{size_t cl_arg_size} (optional)
+For some specific drivers, the @code{cl_arg} pointer cannot not be directly
+given to the driver function. A buffer of size @code{cl_arg_size}
+needs to be allocated on the driver. This buffer is then filled with
+the @code{cl_arg_size} bytes starting at address @code{cl_arg}. In
+this case, the argument given to the codelet is therefore not the
+@code{cl_arg} pointer, but the address of the buffer in local store
+(LS) instead.
+This field is ignored for CPU, CUDA and OpenCL codelets, where the
+@code{cl_arg} pointer is given as such.
 
 @item @code{void (*callback_func)(void *)} (optional) (default: @code{NULL})
 This is a function pointer of prototype @code{void (*f)(void *)} which
@@ -2089,7 +2054,6 @@ OpenCL types range within STARPU_OPENCL_DEFAULT (GPU number 0), STARPU_OPENCL_DE
 @item @code{STARPU_CPU_DEFAULT}
 @item @code{STARPU_CUDA_DEFAULT}
 @item @code{STARPU_OPENCL_DEFAULT}
-@item @code{STARPU_GORDON_DEFAULT}
 @end table
 @end deftp
 
@@ -2742,11 +2706,6 @@ otherwise. The integer pointed to by @var{ret} is set to -EAGAIN if the asynchro
 was successful, or to 0 if event was NULL.
 @end deftypefun
 
-@node Cell extensions
-@section Cell extensions
-
-nothing yet.
-
 @node Miscellaneous helpers
 @section Miscellaneous helpers
 

+ 1 - 1
doc/chapters/basic-examples.texi

@@ -161,7 +161,7 @@ struct starpu_codelet cl =
 
 A codelet is a structure that represents a computational kernel. Such a codelet
 may contain an implementation of the same kernel on different architectures
-(e.g. CUDA, Cell's SPU, x86, ...). For compatibility, make sure that the whole
+(e.g. CUDA, x86, ...). For compatibility, make sure that the whole
 structure is initialized to zero, either by using memset, or by letting the
 compiler implicitly do it as examplified above.
 

+ 1 - 11
doc/chapters/configuration.texi

@@ -121,13 +121,6 @@ Search for an OpenCL library under @var{dir}, which should notably
 contain the OpenCL shared libraries---e.g. @file{libOpenCL.so}. This defaults to
 @code{/lib} appended to the value given to @code{--with-opencl-dir}.
 
-@item --enable-gordon
-Enable the use of the Gordon runtime for Cell SPUs.
-@c TODO: rather default to enabled when detected
-
-@item --with-gordon-dir=@var{prefix}
-Search for the Gordon SDK under @var{prefix}.
-
 @item --enable-maximplementations=@var{count}
 Allow for at most @var{count} codelet implementations for the same
 target device.  This information is then available as the
@@ -292,9 +285,6 @@ create as many CUDA workers as there are CUDA devices.
 @item @code{STARPU_NOPENCL}
 OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
 
-@item @code{STARPU_NGORDON}
-Specify the number of SPUs that StarPU can use.
-
 @item @code{STARPU_WORKERS_NOBIND}
 Setting it to non-zero will prevent StarPU from binding its threads to
 CPUs. This is for instance useful when running the testsuite in parallel.
@@ -309,7 +299,7 @@ determined by the OS, or provided by the @code{hwloc} library in case it is
 available.
 
 Note that the first workers correspond to the CUDA workers, then come the
-OpenCL and the SPU, and finally the CPU workers. For example if
+OpenCL workers, and finally the CPU workers. For example if
 we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPU=2}
 and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
 by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and

+ 2 - 2
doc/chapters/introduction.texi

@@ -83,7 +83,7 @@ The remainder of this section describes the main concepts used in StarPU.
 @cindex codelet
 One of the StarPU primary data structures is the @b{codelet}. A codelet describes a
 computational kernel that can possibly be implemented on multiple architectures
-such as a CPU, a CUDA device or a Cell's SPU.
+such as a CPU, a CUDA device or an OpenCL device.
 
 @c TODO insert illustration f: f_spu, f_cpu, ...
 
@@ -150,7 +150,7 @@ A @b{worker} execute tasks. There is typically one per CPU computation core and
 one per accelerator (for which a whole CPU core is dedicated).
 
 A @b{driver} drives a given kind of workers. There are currently CPU, CUDA,
-OpenCL and Gordon drivers. They usually start several workers to actually drive
+and OpenCL drivers. They usually start several workers to actually drive
 them.
 
 A @b{performance model} is a (dynamic or static) model of the performance of a

+ 0 - 21
examples/Makefile.am

@@ -75,27 +75,6 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 
 endif
 
-if STARPU_USE_GORDON
-
-SPU_CC ?= spu-gcc
-SPU_LD ?= spu-ld
-
-SPULDFLAGS =
-SPULIBS = -lblas #-lc -lgloss -lc
-
-.c.spuo:
-	$(MKDIR_P) `dirname $@`
-	$(SPU_CC) -c -fpic $< -o $@
-
-.spuo.spuelf:
-	$(MKDIR_P) `dirname $@`
-	$(SPU_LD) $(SPULDFLAGS) $< -o $@ $(SPULIBS)
-
-BUILT_SOURCES +=				\
-	gordon/null_kernel_gordon.spuelf
-
-endif
-
 if STARPU_HAVE_ICC
 .icc.o:
 	$(ICC) -x c $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \

+ 0 - 4
src/Makefile.am

@@ -227,10 +227,6 @@ endif
 
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas.c
 
-if STARPU_USE_GORDON
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/gordon/driver_gordon.c
-endif
-
 if STARPU_USE_OPENCL
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl.c
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl_utils.c

+ 6 - 0
src/core/perfmodel/perfmodel_bus.c

@@ -94,6 +94,8 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	size_t size = SIZE;
 
 	/* Initialize CUDA context on the device */
+	/* We do not need to enable OpenGL interoperability at this point,
+	 * since we cleanly shutdown CUDA before returning. */
 	cudaSetDevice(dev);
 
 	/* hack to avoid third party libs to rebind threads */
@@ -189,6 +191,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
         if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
 
 	/* Initialize CUDA context on the source */
+	/* We do not need to enable OpenGL interoperability at this point,
+	 * since we cleanly shutdown CUDA before returning. */
 	cudaSetDevice(src);
 
 	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) {
@@ -207,6 +211,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	cudaMemset(s_buffer, 0, size);
 
 	/* Initialize CUDA context on the destination */
+	/* We do not need to enable OpenGL interoperability at this point,
+	 * since we cleanly shutdown CUDA before returning. */
 	cudaSetDevice(dst);
 
 	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) {

+ 23 - 2
src/datawizard/interfaces/coo_interface.c

@@ -133,6 +133,27 @@ copy_cuda_to_ram_async(void *src_interface, unsigned src_node,
 				    dst_interface, dst_node,
 				    stream, cudaMemcpyDeviceToHost);
 }
+
+static int
+copy_cuda_to_cuda(void *src_interface, unsigned src_node,
+		  void *dst_interface, unsigned dst_node)
+{
+	return copy_cuda_async_sync(src_interface, src_node,
+				    dst_interface, dst_node,
+				    NULL, cudaMemcpyDeviceToDevice);
+}
+
+#ifdef NO_STRIDE
+static int
+copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
+			void *dst_interface, unsigned dst_node,
+			cudaStream_t stream)
+{
+	return copy_cuda_async_sync(src_interface, src_node,
+				    dst_interface, dst_node,
+				    stream, cudaMemcpyDeviceToDevice);
+}
+#endif /* !NO_STRIDE */
 #endif /* !STARPU_USE_CUDA */
 
 #ifdef STARPU_USE_OPENCL
@@ -281,9 +302,9 @@ static struct starpu_data_copy_methods coo_copy_data_methods =
 	.cuda_to_ram         = copy_cuda_to_ram,
 	.ram_to_cuda_async   = copy_ram_to_cuda_async,
 	.cuda_to_ram_async   = copy_cuda_to_ram_async,
-	.cuda_to_cuda        = NULL, /* TODO */
+	.cuda_to_cuda        = copy_cuda_to_cuda,
 #ifdef NO_STRIDE
-	.cuda_to_cuda_async  = NULL, /* TODO */
+	.cuda_to_cuda_async  = copy_cuda_to_cuda_async,
 #endif
 #endif /* !STARPU_USE_CUDA */
 #ifdef STARPU_USE_OPENCL

+ 48 - 5
src/drivers/cuda/driver_cuda.c

@@ -507,21 +507,64 @@ void starpu_cuda_report_error(const char *func, const char *file, int line, cuda
 	STARPU_ABORT();
 }
 
-int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind)
+int
+starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
+			    void *dst_ptr, unsigned dst_node,
+			    size_t ssize, cudaStream_t stream,
+			    enum cudaMemcpyKind kind)
 {
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	int peer_copy = 0;
+	int src_dev = -1, dst_dev = -1;
+#endif
 	cudaError_t cures = 0;
 
+	if (kind == cudaMemcpyDeviceToDevice && src_node != dst_node)
+	{
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		peer_copy = 1;
+		src_dev = _starpu_memory_node_to_devid(src_node);
+		dst_dev = _starpu_memory_node_to_devid(dst_node);
+#else
+		STARPU_ABORT();
+#endif
+	}
+
 	if (stream)
 	{
-	     _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-	     cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
-	     _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		if (peer_copy)
+		{
+			cures = cudaMemcpyPeerAsync((char *) dst_ptr, dst_dev,
+						    (char *) src_ptr, src_dev,
+						    ssize, stream);
+		}
+		else
+#endif
+		{
+			cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
+		}
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 	}
+
 	/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
 	if (stream == NULL || cures)
 	{
 		/* do it in a synchronous fashion */
-		cures = cudaMemcpy((char *)dst_ptr, (char *)src_ptr, ssize, kind);
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		if (peer_copy)
+		{
+			cures = cudaMemcpyPeer((char *) dst_ptr, dst_dev,
+					       (char *) src_ptr, src_dev,
+					       ssize);
+		}
+		else
+#endif
+		{
+			cures = cudaMemcpy((char *)dst_ptr, (char *)src_ptr, ssize, kind);
+		}
+		
 
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);

+ 85 - 4
src/sched_policies/fifo_queues.c

@@ -52,14 +52,68 @@ int _starpu_fifo_empty(struct _starpu_fifo_taskq *fifo)
 	return fifo->ntasks == 0;
 }
 
-/* TODO: revert front/back? */
-int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
+int
+_starpu_fifo_push_sorted_task(struct _starpu_fifo_taskq *fifo_queue,
+			      pthread_mutex_t *sched_mutex,
+			      pthread_cond_t *sched_cond,
+			      struct starpu_task *task)
 {
+	struct starpu_task_list *list = &fifo_queue->taskq;
+
 	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 
 	_STARPU_TRACE_JOB_PUSH(task, 0);
-	/* TODO: if prio, put at back */
-	starpu_task_list_push_front(&fifo_queue->taskq, task);
+
+	if (list->head == NULL)
+	{
+		list->head = task;
+		list->tail = task;
+		task->prev = NULL;
+		task->next = NULL;
+	}
+	else
+	{
+		struct starpu_task *current = list->head;
+		struct starpu_task *prev = NULL;
+
+		while (current)
+		{
+			if (current->priority >= task->priority)
+				break;
+
+			prev = current;
+			current = current->next;
+		}
+
+		if (prev == NULL)
+		{
+			/* Insert at the front of the list */
+			list->head->prev = task;
+			task->prev = NULL;
+			task->next = list->head;
+			list->head = task;
+		}
+		else
+		{
+			if (current)
+			{
+				/* Insert between prev and current */
+				task->prev = prev;
+				prev->next = task;
+				task->next = current;
+				current->prev = task;
+			}
+			else
+			{
+				/* Insert at the tail of the list */
+				list->tail->next = task;
+				task->next = NULL;
+				task->prev = list->tail;
+				list->tail = task;
+			}
+		}
+	}
+
 	fifo_queue->ntasks++;
 	fifo_queue->nprocessed++;
 
@@ -69,6 +123,33 @@ int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_
 	return 0;
 }
 
+/* TODO: revert front/back? */
+int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
+{
+
+	if (task->priority > 0)
+	{
+		_STARPU_TRACE_JOB_PUSH(task, 1);
+		_starpu_fifo_push_sorted_task(fifo_queue, sched_mutex,
+					      sched_cond, task);
+	}
+	else
+	{
+		_STARPU_TRACE_JOB_PUSH(task, 0);
+
+		_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+		starpu_task_list_push_front(&fifo_queue->taskq, task);
+
+		fifo_queue->ntasks++;
+		fifo_queue->nprocessed++;
+
+		_STARPU_PTHREAD_COND_SIGNAL(sched_cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+	}
+
+	return 0;
+}
+
 struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo_queue, int workerid)
 {
 	struct starpu_task *task;

+ 5 - 0
src/sched_policies/fifo_queues.h

@@ -44,6 +44,11 @@ void _starpu_destroy_fifo(struct _starpu_fifo_taskq *fifo);
 
 int _starpu_fifo_empty(struct _starpu_fifo_taskq *fifo);
 
+int _starpu_fifo_push_sorted_task(struct _starpu_fifo_taskq *fifo_queue,
+				  pthread_mutex_t *sched_mutex,
+				  pthread_cond_t *sched_cond,
+				  struct starpu_task *task);
+
 int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task);
 
 struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo, int workerid);

+ 0 - 31
tests/Makefile.am

@@ -77,24 +77,6 @@ NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_srcdir)/src -I$(top_builddir)/src
 
 endif
 
-if STARPU_USE_GORDON
-
-SPU_CC ?= spu-gcc
-SPU_LD ?= spu-ld
-
-.c.spuo:
-	$(MKDIR_P) `dirname $@`
-	$(SPU_CC) -c -fpic $< -o $@
-
-.spuo.spuelf:
-	$(MKDIR_P) `dirname $@`
-	$(SPU_LD) $< -o $@
-
-#BUILT_SOURCES +=
-#	microbenchs/null_kernel_gordon.spuelf
-
-endif
-
 testbindir = $(libdir)/starpu/tests
 
 #####################################
@@ -120,10 +102,7 @@ if STARPU_COVERAGE_ENABLED
 TESTS	+=	coverage/coverage.sh
 endif
 
-starpu_machine_display_SOURCES	=	../tools/starpu_machine_display.c
-
 noinst_PROGRAMS =				\
-	starpu_machine_display			\
 	main/deprecated_func			\
 	main/deprecated_buffer			\
 	main/driver_api/init_run_deinit         \
@@ -385,16 +364,6 @@ datawizard_wt_host_SOURCES =			\
 datawizard_wt_broadcast_SOURCES =		\
 	datawizard/wt_broadcast.c
 
-if STARPU_USE_GORDON
-datawizard_sync_and_notify_data_SOURCES +=	\
-	datawizard/sync_and_notify_data_gordon_kernels.c
-datawizard_sync_and_notify_data_implicit_SOURCES +=	\
-	datawizard/sync_and_notify_data_gordon_kernels.c
-BUILT_SOURCES += 						\
-	datawizard/sync_and_notify_data_gordon_kernels.spuelf	\
-	microbenchs/null_kernel_gordon.spuelf
-endif
-
 ###################
 # Block interface #
 ###################