13 yıl önce · 41f3054013
--- a/ChangeLog
+++ b/ChangeLog
@@ -59,6 +59,9 @@ Changes:
 
																       of the libOpenCL.so file of the OCL ICD implementation.
															
 
																   * Fix main memory leak on multiple unregister/re-register.
															
 
																   * Improve hwloc detection by configure
															
 
																+  * Cell:
															
 
																+    - It is no longer possible to enable the cell support via the
															
 
																+      gordon driver
															
 
																 Small changes:
															
 
																   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
															
--- a/configure.ac
+++ b/configure.ac
@@ -775,72 +775,26 @@ if test x$disable_asynchronous_opencl_copy = xyes ; then
 
																    AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and OpenCL devices])
															
 
																 fi
															
 
																-
															
 
																 ###############################################################################
															
 
																 #                                                                             #
															
 
																-#                                 Cell settings                               #
															
 
																+#                                 Cell                                        #
															
 
																 #                                                                             #
															
 
																 ###############################################################################
															
 
																-#TODO fix the default dir
															
 
																-AC_ARG_ENABLE(gordon, [AS_HELP_STRING([--enable-gordon],
															
 
																-			[use Cell SPUs])],, enable_gordon=maybe)
															
 
																-
															
 
																-if test x$enable_gordon = xyes -o x$enable_gordon = xmaybe; then
															
 
																-
															
 
																-	AC_ARG_WITH(gordon-dir, [AS_HELP_STRING([--with-gordon-dir=<path>],
															
 
																-			[specify Gordon installation directory (default is /usr/local/)])],
															
 
																-			[
															
 
																-				gordon_dir="$withval"
															
 
																-				enable_gordon=yes
															
 
																-			], gordon_dir=/usr/local/)
															
 
																-
															
 
																-	# do we have a valid Gordon setup ?
															
 
																-	have_valid_gordon=yes
															
 
																-
															
 
																-	# can we use dynamic code loading facilities ?
															
 
																-	AC_CHECK_LIB(elf, elf_memory,, [have_valid_gordon=no])
															
 
																-
															
 
																-	AC_CHECK_LIB(spe2, spe_context_create,,[have_valid_gordon=no])
															
 
																-	AC_CHECK_FUNC(spe_in_mbox_write, [], [have_valid_gordon=no])
															
 
																-
															
 
																-	PKG_PROG_PKG_CONFIG
															
 
																-	if test -d "$gordon_dir"; then
															
 
																-		PKG_CONFIG_PATH="${PKG_CONFIG_PATH}:$gordon_dir"
															
 
																-	fi
															
 
																-	AC_SUBST(PKG_CONFIG_PATH)
															
 
																-	PKG_CHECK_MODULES([GORDON], [libgordon], [], have_valid_gordon=no)
															
 
																-
															
 
																-	CPPFLAGS="${CPPFLAGS} ${GORDON_CFLAGS}"
															
 
																-	LIBS="${LIBS} ${GORDON_LIBS}"
															
 
																-
															
 
																-	# AC_CHECK_FUNC(gordon_init, [gordon], [have_valid_gordon=no])
															
 
																-
															
 
																-	# in case Gordon was explicitely required, but is not available, this is an error
															
 
																-	if test x$enable_gordon = xyes -a x$have_valid_gordon = xno; then
															
 
																-		AC_MSG_ERROR([cannot find Gordon])
															
 
																-	fi
															
 
																-
															
 
																+# warning: Cell driver has been removed from configure, but as the
															
 
																+# source code is still available, we need to define the minimum
															
 
																+# requirements to compile
															
 
																+AC_DEFINE_UNQUOTED(STARPU_MAXGORDONDEVS, [1], [maximum number of GORDON devices])
															
 
																-	# now we enable Gordon if and only if a proper setup is available
															
 
																-	enable_gordon=$have_valid_gordon
															
 
																-	AC_DEFINE(STARPU_MAXGORDONDEVS, [1], [maximum number of GORDON devices])
															
 
																-fi
															
 
																-
															
 
																-AC_MSG_CHECKING(whether GORDON should be used)
															
 
																-AC_MSG_RESULT($enable_gordon)
															
 
																-AC_SUBST(STARPU_USE_GORDON, $enable_gordon)
															
 
																-AM_CONDITIONAL(STARPU_USE_GORDON, test x$enable_gordon = xyes)
															
 
																-
															
 
																-if test x$enable_gordon = xyes; then
															
 
																-	AC_DEFINE(STARPU_USE_GORDON, [1], [Cell support is enabled])
															
 
																-	GORDON_REQUIRES=gordon
															
 
																-fi
															
 
																-AC_SUBST(GORDON_REQUIRES)
															
 
																+###############################################################################
															
 
																+#                                                                             #
															
 
																+#                                 Drivers                                     #
															
 
																+#                                                                             #
															
 
																+###############################################################################
															
 
																 AC_MSG_CHECKING(whether blocking drivers should be disabled)
															
 
																 AC_ARG_ENABLE(blocking-drivers, [AS_HELP_STRING([--disable-blocking-drivers], [disable blocking drivers])],
															
 
																-				enable_blocking=$enableval, enable_blocking=$enable_gordon)
															
 
																+				enable_blocking=$enableval, enable_blocking=no)
															
 
																 AC_MSG_RESULT($enable_blocking)
															
 
																 if test x$enable_blocking = xno; then
															
@@ -1015,8 +969,7 @@ AC_MSG_RESULT($nmaxbuffers)
 
																 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
															
 
																 		[how many buffers can be manipulated per task])
															
 
																-# We have one memory node shared by all CPU workers, one node per GPU, and
															
 
																-# currently the Cell driver is using the same memory node as the CPU.
															
 
																+# We have one memory node shared by all CPU workers, one node per GPU
															
 
																 maxnodes=1
															
 
																 if test x$enable_cuda = xyes ; then
															
 
																 	# we could have used nmaxcudadev + 1, but this would certainly give an
															
@@ -1406,12 +1359,6 @@ AC_ARG_ENABLE([socl],
 
																 AC_MSG_CHECKING(for SOCL)
															
 
																-if test "x$enable_socl" = "xyes" -o "x$enable_socl" = "xmaybe" ; then
															
 
																-    if test "$have_valid_opencl" = "no" ; then
															
 
																-	STARPU_LOOK_FOR_OPENCL()
															
 
																-    fi
															
 
																-fi
															
 
																-
															
 
																 # in case SOCL was explicitely required, but is not available, this is an error
															
 
																 if test "x$enable_socl" = "xyes" -a "$have_valid_opencl" = "no" ; then
															
 
																     AC_MSG_ERROR([SOCL cannot be enabled without OpenCL])
															
@@ -1865,7 +1812,6 @@ AC_MSG_NOTICE([
 
																 	CPUs   enabled: $enable_cpu
															
 
																 	CUDA   enabled: $enable_cuda
															
 
																 	OpenCL enabled: $enable_opencl
															
 
																-	Cell   enabled: $enable_gordon
															
 
																 	Compile-time limits
															
 
																 	(change these with --enable-maxcpus, --enable-maxcudadev,
															
--- a/doc/chapters/advanced-api.texi
+++ b/doc/chapters/advanced-api.texi
@@ -61,9 +61,6 @@ Compare the data size of two interfaces.
 
																 @item @code{ void (*display)(starpu_data_handle_t handle, FILE *f)}
															
 
																 Dump the sizes of a handle to a file.
															
 
																-@item @code{ int (*convert_to_gordon)(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss)}
															
 
																-Convert the data size to the spu size format. If no SPUs are used, this field can be seto NULL.
															
 
																-
															
 
																 @item @code{enum starpu_data_interface_id interfaceid}
															
 
																 An identifier that is unique to each interface.
															
@@ -534,9 +531,6 @@ Actual number of CUDA workers used by StarPU.
 
																 @item @code{unsigned nopenclgpus}
															
 
																 Actual number of OpenCL workers used by StarPU.
															
 
																-@item @code{unsigned ngordon_spus}
															
 
																-Actual number of Gordon workers used by StarPU.
															
 
																-
															
 
																 @item @code{unsigned workers_bindid[STARPU_NMAXWORKERS]}
															
 
																 Indicates the successive cpu identifier that should be used to bind the
															
 
																 workers. It is either filled according to the user's explicit
															
@@ -699,7 +693,7 @@ assign tasks to the different workers.
 
																 Remove all available tasks from the scheduler (tasks are chained by the means
															
 
																 of the prev and next fields of the starpu_task structure). The mutex associated
															
 
																 to the worker is already taken when this method is called. This is currently
															
 
																-only used by the Gordon driver.
															
 
																+not used.
															
 
																 @item @code{void (*pre_exec_hook)(struct starpu_task *)} (optional)
															
 
																 This method is called every time a task is starting.
															
--- a/doc/chapters/basic-api.texi
+++ b/doc/chapters/basic-api.texi
@@ -20,7 +20,6 @@
 
																 * Profiling API::
															
 
																 * CUDA extensions::
															
 
																 * OpenCL extensions::
															
 
																-* Cell extensions::
															
 
																 * Miscellaneous helpers::
															
 
																 @end menu
															
@@ -94,10 +93,6 @@ be specified with the @code{STARPU_NCUDA} environment variable.
 
																 This is the number of OpenCL devices that StarPU can use. This can
															
 
																 also be specified with the @code{STARPU_NOPENCL} environment variable.
															
 
																-@item @code{int nspus} (default = -1)
															
 
																-This is the number of Cell SPUs that StarPU can use. This can also be
															
 
																-specified with the @code{STARPU_NGORDON} environment variable.
															
 
																-
															
 
																 @item @code{unsigned use_explicit_workers_bindid} (default = 0)
															
 
																 If this flag is set, the @code{workers_bindid} array indicates where the
															
 
																 different workers are bound, otherwise StarPU automatically selects where to
															
@@ -243,7 +238,6 @@ The different values are:
 
																 @item @code{STARPU_CPU_WORKER}
															
 
																 @item @code{STARPU_CUDA_WORKER}
															
 
																 @item @code{STARPU_OPENCL_WORKER}
															
 
																-@item @code{STARPU_GORDON_WORKER}
															
 
																 @end table
															
 
																 @end deftp
															
@@ -273,10 +267,6 @@ This function returns the number of OpenCL devices controlled by StarPU. The ret
 
																 value should be at most @code{STARPU_MAXOPENCLDEVS}.
															
 
																 @end deftypefun
															
 
																-@deftypefun unsigned starpu_spu_worker_get_count (void)
															
 
																-This function returns the number of Cell SPUs controlled by StarPU.
															
 
																-@end deftypefun
															
 
																-
															
 
																 @deftypefun int starpu_worker_get_id (void)
															
 
																 This function returns the identifier of the current worker, i.e the one associated to the calling
															
 
																 thread. The returned value is either -1 if the current context is not a StarPU
															
@@ -311,9 +301,8 @@ This function returns the type of processing unit associated to a
 
																 worker. The worker identifier is a value returned by the
															
 
																 @code{starpu_worker_get_id} function). The returned value
															
 
																 indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
															
 
																-core, @code{STARPU_CUDA_WORKER} for a CUDA device,
															
 
																-@code{STARPU_OPENCL_WORKER} for a OpenCL device, and
															
 
																-@code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
															
 
																+core, @code{STARPU_CUDA_WORKER} for a CUDA device, and
															
 
																+@code{STARPU_OPENCL_WORKER} for a OpenCL device. The value returned for an invalid
															
 
																 identifier is unspecified.
															
 
																 @end deftypefun
															
@@ -339,7 +328,6 @@ todo
 
																 @item @code{STARPU_CPU_RAM}
															
 
																 @item @code{STARPU_CUDA_RAM}
															
 
																 @item @code{STARPU_OPENCL_RAM}
															
 
																-@item @code{STARPU_SPU_LS}
															
 
																 @end table
															
 
																 @end deftp
															
@@ -1150,7 +1138,7 @@ Return the size of the elements registered into the matrix designated by @var{in
 
																 Return a pointer to the column array of the matrix designated by
															
 
																 @var{interface}.
															
 
																 @end defmac
															
 
																-@defmac STARPU_COO_GET_COLUMNS_DEV_HANDLE({void *}@var{interface})
															
 
																+@defmac STARPU_COO_GET_COLUMNS_DEV_HANDLE ({void *}@var{interface})
															
 
																 Return a device handle for the column array of the matrix designated by
															
 
																 @var{interface}, to be used on OpenCL. The offset documented below has to be
															
 
																 used in addition to this.
															
@@ -1158,7 +1146,7 @@ used in addition to this.
 
																 @defmac STARPU_COO_GET_ROWS (interface)
															
 
																 Return a pointer to the rows array of the matrix designated by @var{interface}.
															
 
																 @end defmac
															
 
																-@defmac STARPU_COO_GET_ROWS_DEV_HANDLE({void *}@var{interface})
															
 
																+@defmac STARPU_COO_GET_ROWS_DEV_HANDLE ({void *}@var{interface})
															
 
																 Return a device handle for the row array of the matrix designated by
															
 
																 @var{interface}, to be used on OpenCL. The offset documented below has to be
															
 
																 used in addition to this.
															
@@ -1167,12 +1155,12 @@ used in addition to this.
 
																 Return a pointer to the values array of the matrix designated by
															
 
																 @var{interface}.
															
 
																 @end defmac
															
 
																-@defmac STARPU_COO_GET_VALUES_DEV_HANDLE({void *}@var{interface})
															
 
																+@defmac STARPU_COO_GET_VALUES_DEV_HANDLE ({void *}@var{interface})
															
 
																 Return a device handle for the value array of the matrix designated by
															
 
																 @var{interface}, to be used on OpenCL. The offset documented below has to be
															
 
																 used in addition to this.
															
 
																 @end defmac
															
 
																-@defmac STARPU_COO_GET_OFFSET({void *}@var{itnerface})
															
 
																+@defmac STARPU_COO_GET_OFFSET ({void *}@var{itnerface})
															
 
																 Return the offset in the arrays of the COO matrix designated by @var{interface}.
															
 
																 @end defmac
															
 
																 @defmac STARPU_COO_GET_NX (interface)
															
@@ -1493,18 +1481,6 @@ starpu_codelet} to specify the codelet may be executed on a CUDA
 
																 processing unit.
															
 
																 @end defmac
															
 
																-@defmac STARPU_SPU
															
 
																-This macro is used when setting the field @code{where} of a @code{struct
															
 
																-starpu_codelet} to specify the codelet may be executed on a SPU
															
 
																-processing unit.
															
 
																-@end defmac
															
 
																-
															
 
																-@defmac STARPU_GORDON
															
 
																-This macro is used when setting the field @code{where} of a @code{struct
															
 
																-starpu_codelet} to specify the codelet may be executed on a Cell
															
 
																-processing unit.
															
 
																-@end defmac
															
 
																-
															
 
																 @defmac STARPU_OPENCL
															
 
																 This macro is used when setting the field @code{where} of a @code{struct
															
 
																 starpu_codelet} to specify the codelet may be executed on a OpenCL
															
@@ -1542,12 +1518,12 @@ e.g. static storage case.
 
																 @item @code{uint32_t where} (optional)
															
 
																 Indicates which types of processing units are able to execute the
															
 
																 codelet. The different values
															
 
																-@code{STARPU_CPU}, @code{STARPU_CUDA}, @code{STARPU_SPU},
															
 
																-@code{STARPU_GORDON}, @code{STARPU_OPENCL} can be combined to specify
															
 
																+@code{STARPU_CPU}, @code{STARPU_CUDA}, 
															
 
																+@code{STARPU_OPENCL} can be combined to specify
															
 
																 on which types of processing units the codelet can be executed.
															
 
																 @code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
															
 
																-implemented for both CPU cores and CUDA devices while @code{STARPU_GORDON}
															
 
																-indicates that it is only available on Cell SPUs. If the field is
															
 
																+implemented for both CPU cores and CUDA devices while @code{STARPU_OPENCL}
															
 
																+indicates that it is only available on OpenCL devices. If the field is
															
 
																 unset, its value will be automatically set based on the availability
															
 
																 of the @code{XXX_funcs} fields defined below.
															
@@ -1607,17 +1583,6 @@ If the @code{where} field is set, then the @code{opencl_funcs} field
 
																 is ignored if @code{STARPU_OPENCL} does not appear in the @code{where}
															
 
																 field, it must be non-null otherwise.
															
 
																-@item @code{uint8_t gordon_func} (optional)
															
 
																-This field has been made deprecated. One should use instead the
															
 
																-@code{gordon_funcs} field.
															
 
																-
															
 
																-@item @code{uint8_t gordon_funcs[STARPU_MAXIMPLEMENTATIONS]} (optional)
															
 
																-Is an array of index of the Cell SPU implementations of the codelet within the
															
 
																-Gordon library.
															
 
																-It must be terminated by a NULL value.
															
 
																-See Gordon documentation for more details on how to register a kernel and
															
 
																-retrieve its index.
															
 
																-
															
 
																 @item @code{unsigned nbuffers}
															
 
																 Specifies the number of arguments taken by the codelet. These arguments are
															
 
																 managed by the DSM and are accessed from the @code{void *buffers[]}
															
@@ -1726,17 +1691,17 @@ by the DSM.
 
																 @item @code{void *cl_arg} (optional; default: @code{NULL})
															
 
																 This pointer is passed to the codelet through the second argument
															
 
																 of the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).
															
 
																-In the specific case of the Cell processor, see the @code{cl_arg_size}
															
 
																-argument.
															
 
																-
															
 
																-@item @code{size_t cl_arg_size} (optional, Cell-specific)
															
 
																-In the case of the Cell processor, the @code{cl_arg} pointer is not directly
															
 
																-given to the SPU function. A buffer of size @code{cl_arg_size} is allocated on
															
 
																-the SPU. This buffer is then filled with the @code{cl_arg_size} bytes starting
															
 
																-at address @code{cl_arg}. In this case, the argument given to the SPU codelet
															
 
																-is therefore not the @code{cl_arg} pointer, but the address of the buffer in
															
 
																-local store (LS) instead. This field is ignored for CPU, CUDA and OpenCL
															
 
																-codelets, where the @code{cl_arg} pointer is given as such.
															
 
																+
															
 
																+@item @code{size_t cl_arg_size} (optional)
															
 
																+For some specific drivers, the @code{cl_arg} pointer cannot not be directly
															
 
																+given to the driver function. A buffer of size @code{cl_arg_size}
															
 
																+needs to be allocated on the driver. This buffer is then filled with
															
 
																+the @code{cl_arg_size} bytes starting at address @code{cl_arg}. In
															
 
																+this case, the argument given to the codelet is therefore not the
															
 
																+@code{cl_arg} pointer, but the address of the buffer in local store
															
 
																+(LS) instead.
															
 
																+This field is ignored for CPU, CUDA and OpenCL codelets, where the
															
 
																+@code{cl_arg} pointer is given as such.
															
 
																 @item @code{void (*callback_func)(void *)} (optional) (default: @code{NULL})
															
 
																 This is a function pointer of prototype @code{void (*f)(void *)} which
															
@@ -2089,7 +2054,6 @@ OpenCL types range within STARPU_OPENCL_DEFAULT (GPU number 0), STARPU_OPENCL_DE
 
																 @item @code{STARPU_CPU_DEFAULT}
															
 
																 @item @code{STARPU_CUDA_DEFAULT}
															
 
																 @item @code{STARPU_OPENCL_DEFAULT}
															
 
																-@item @code{STARPU_GORDON_DEFAULT}
															
 
																 @end table
															
 
																 @end deftp
															
@@ -2742,11 +2706,6 @@ otherwise. The integer pointed to by @var{ret} is set to -EAGAIN if the asynchro
 
																 was successful, or to 0 if event was NULL.
															
 
																 @end deftypefun
															
 
																-@node Cell extensions
															
 
																-@section Cell extensions
															
 
																-
															
 
																-nothing yet.
															
 
																-
															
 
																 @node Miscellaneous helpers
															
 
																 @section Miscellaneous helpers
															
--- a/doc/chapters/basic-examples.texi
+++ b/doc/chapters/basic-examples.texi
@@ -161,7 +161,7 @@ struct starpu_codelet cl =
 
																 A codelet is a structure that represents a computational kernel. Such a codelet
															
 
																 may contain an implementation of the same kernel on different architectures
															
 
																-(e.g. CUDA, Cell's SPU, x86, ...). For compatibility, make sure that the whole
															
 
																+(e.g. CUDA, x86, ...). For compatibility, make sure that the whole
															
 
																 structure is initialized to zero, either by using memset, or by letting the
															
 
																 compiler implicitly do it as examplified above.
															
--- a/doc/chapters/configuration.texi
+++ b/doc/chapters/configuration.texi
@@ -121,13 +121,6 @@ Search for an OpenCL library under @var{dir}, which should notably
 
																 contain the OpenCL shared libraries---e.g. @file{libOpenCL.so}. This defaults to
															
 
																 @code{/lib} appended to the value given to @code{--with-opencl-dir}.
															
 
																-@item --enable-gordon
															
 
																-Enable the use of the Gordon runtime for Cell SPUs.
															
 
																-@c TODO: rather default to enabled when detected
															
 
																-
															
 
																-@item --with-gordon-dir=@var{prefix}
															
 
																-Search for the Gordon SDK under @var{prefix}.
															
 
																-
															
 
																 @item --enable-maximplementations=@var{count}
															
 
																 Allow for at most @var{count} codelet implementations for the same
															
 
																 target device.  This information is then available as the
															
@@ -292,9 +285,6 @@ create as many CUDA workers as there are CUDA devices.
 
																 @item @code{STARPU_NOPENCL}
															
 
																 OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
															
 
																-@item @code{STARPU_NGORDON}
															
 
																-Specify the number of SPUs that StarPU can use.
															
 
																-
															
 
																 @item @code{STARPU_WORKERS_NOBIND}
															
 
																 Setting it to non-zero will prevent StarPU from binding its threads to
															
 
																 CPUs. This is for instance useful when running the testsuite in parallel.
															
@@ -309,7 +299,7 @@ determined by the OS, or provided by the @code{hwloc} library in case it is
 
																 available.
															
 
																 Note that the first workers correspond to the CUDA workers, then come the
															
 
																-OpenCL and the SPU, and finally the CPU workers. For example if
															
 
																+OpenCL workers, and finally the CPU workers. For example if
															
 
																 we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPU=2}
															
 
																 and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
															
 
																 by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
															
--- a/doc/chapters/introduction.texi
+++ b/doc/chapters/introduction.texi
@@ -83,7 +83,7 @@ The remainder of this section describes the main concepts used in StarPU.
 
																 @cindex codelet
															
 
																 One of the StarPU primary data structures is the @b{codelet}. A codelet describes a
															
 
																 computational kernel that can possibly be implemented on multiple architectures
															
 
																-such as a CPU, a CUDA device or a Cell's SPU.
															
 
																+such as a CPU, a CUDA device or an OpenCL device.
															
 
																 @c TODO insert illustration f: f_spu, f_cpu, ...
															
@@ -150,7 +150,7 @@ A @b{worker} execute tasks. There is typically one per CPU computation core and
 
																 one per accelerator (for which a whole CPU core is dedicated).
															
 
																 A @b{driver} drives a given kind of workers. There are currently CPU, CUDA,
															
 
																-OpenCL and Gordon drivers. They usually start several workers to actually drive
															
 
																+and OpenCL drivers. They usually start several workers to actually drive
															
 
																 them.
															
 
																 A @b{performance model} is a (dynamic or static) model of the performance of a
															
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -75,27 +75,6 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 
																 endif
															
 
																-if STARPU_USE_GORDON
															
 
																-
															
 
																-SPU_CC ?= spu-gcc
															
 
																-SPU_LD ?= spu-ld
															
 
																-
															
 
																-SPULDFLAGS =
															
 
																-SPULIBS = -lblas #-lc -lgloss -lc
															
 
																-
															
 
																-.c.spuo:
															
 
																-	$(MKDIR_P) `dirname $@`
															
 
																-	$(SPU_CC) -c -fpic $< -o $@
															
 
																-
															
 
																-.spuo.spuelf:
															
 
																-	$(MKDIR_P) `dirname $@`
															
 
																-	$(SPU_LD) $(SPULDFLAGS) $< -o $@ $(SPULIBS)
															
 
																-
															
 
																-BUILT_SOURCES +=				\
															
 
																-	gordon/null_kernel_gordon.spuelf
															
 
																-
															
 
																-endif
															
 
																-
															
 
																 if STARPU_HAVE_ICC
															
 
																 .icc.o:
															
 
																 	$(ICC) -x c $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
															
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -227,10 +227,6 @@ endif
 
																 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas.c
															
 
																-if STARPU_USE_GORDON
															
 
																-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/gordon/driver_gordon.c
															
 
																-endif
															
 
																-
															
 
																 if STARPU_USE_OPENCL
															
 
																 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl.c
															
 
																 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl_utils.c
															
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -94,6 +94,8 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
																 	size_t size = SIZE;
															
 
																 	/* Initialize CUDA context on the device */
															
 
																+	/* We do not need to enable OpenGL interoperability at this point,
															
 
																+	 * since we cleanly shutdown CUDA before returning. */
															
 
																 	cudaSetDevice(dev);
															
 
																 	/* hack to avoid third party libs to rebind threads */
															
@@ -189,6 +191,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
																         if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
															
 
																 	/* Initialize CUDA context on the source */
															
 
																+	/* We do not need to enable OpenGL interoperability at this point,
															
 
																+	 * since we cleanly shutdown CUDA before returning. */
															
 
																 	cudaSetDevice(src);
															
 
																 	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) {
															
@@ -207,6 +211,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
																 	cudaMemset(s_buffer, 0, size);
															
 
																 	/* Initialize CUDA context on the destination */
															
 
																+	/* We do not need to enable OpenGL interoperability at this point,
															
 
																+	 * since we cleanly shutdown CUDA before returning. */
															
 
																 	cudaSetDevice(dst);
															
 
																 	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) {
															
--- a/src/datawizard/interfaces/coo_interface.c
+++ b/src/datawizard/interfaces/coo_interface.c
@@ -133,6 +133,27 @@ copy_cuda_to_ram_async(void *src_interface, unsigned src_node,
 
																 				    dst_interface, dst_node,
															
 
																 				    stream, cudaMemcpyDeviceToHost);
															
 
																 }
															
 
																+
															
 
																+static int
															
 
																+copy_cuda_to_cuda(void *src_interface, unsigned src_node,
															
 
																+		  void *dst_interface, unsigned dst_node)
															
 
																+{
															
 
																+	return copy_cuda_async_sync(src_interface, src_node,
															
 
																+				    dst_interface, dst_node,
															
 
																+				    NULL, cudaMemcpyDeviceToDevice);
															
 
																+}
															
 
																+
															
 
																+#ifdef NO_STRIDE
															
 
																+static int
															
 
																+copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
															
 
																+			void *dst_interface, unsigned dst_node,
															
 
																+			cudaStream_t stream)
															
 
																+{
															
 
																+	return copy_cuda_async_sync(src_interface, src_node,
															
 
																+				    dst_interface, dst_node,
															
 
																+				    stream, cudaMemcpyDeviceToDevice);
															
 
																+}
															
 
																+#endif /* !NO_STRIDE */
															
 
																 #endif /* !STARPU_USE_CUDA */
															
 
																 #ifdef STARPU_USE_OPENCL
															
@@ -281,9 +302,9 @@ static struct starpu_data_copy_methods coo_copy_data_methods =
 
																 	.cuda_to_ram         = copy_cuda_to_ram,
															
 
																 	.ram_to_cuda_async   = copy_ram_to_cuda_async,
															
 
																 	.cuda_to_ram_async   = copy_cuda_to_ram_async,
															
 
																-	.cuda_to_cuda        = NULL, /* TODO */
															
 
																+	.cuda_to_cuda        = copy_cuda_to_cuda,
															
 
																 #ifdef NO_STRIDE
															
 
																-	.cuda_to_cuda_async  = NULL, /* TODO */
															
 
																+	.cuda_to_cuda_async  = copy_cuda_to_cuda_async,
															
 
																 #endif
															
 
																 #endif /* !STARPU_USE_CUDA */
															
 
																 #ifdef STARPU_USE_OPENCL
															
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -507,21 +507,64 @@ void starpu_cuda_report_error(const char *func, const char *file, int line, cuda
 
																 	STARPU_ABORT();
															
 
																 }
															
 
																-int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind)
															
 
																+int
															
 
																+starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
															
 
																+			    void *dst_ptr, unsigned dst_node,
															
 
																+			    size_t ssize, cudaStream_t stream,
															
 
																+			    enum cudaMemcpyKind kind)
															
 
																 {
															
 
																+#ifdef HAVE_CUDA_MEMCPY_PEER
															
 
																+	int peer_copy = 0;
															
 
																+	int src_dev = -1, dst_dev = -1;
															
 
																+#endif
															
 
																 	cudaError_t cures = 0;
															
 
																+	if (kind == cudaMemcpyDeviceToDevice && src_node != dst_node)
															
 
																+	{
															
 
																+#ifdef HAVE_CUDA_MEMCPY_PEER
															
 
																+		peer_copy = 1;
															
 
																+		src_dev = _starpu_memory_node_to_devid(src_node);
															
 
																+		dst_dev = _starpu_memory_node_to_devid(dst_node);
															
 
																+#else
															
 
																+		STARPU_ABORT();
															
 
																+#endif
															
 
																+	}
															
 
																+
															
 
																 	if (stream)
															
 
																 	{
															
 
																-	     _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
															
 
																-	     cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
															
 
																-	     _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
															
 
																+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
															
 
																+#ifdef HAVE_CUDA_MEMCPY_PEER
															
 
																+		if (peer_copy)
															
 
																+		{
															
 
																+			cures = cudaMemcpyPeerAsync((char *) dst_ptr, dst_dev,
															
 
																+						    (char *) src_ptr, src_dev,
															
 
																+						    ssize, stream);
															
 
																+		}
															
 
																+		else
															
 
																+#endif
															
 
																+		{
															
 
																+			cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
															
 
																+		}
															
 
																+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
															
 
																 	}
															
 
																+
															
 
																 	/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
															
 
																 	if (stream == NULL || cures)
															
 
																 	{
															
 
																 		/* do it in a synchronous fashion */
															
 
																-		cures = cudaMemcpy((char *)dst_ptr, (char *)src_ptr, ssize, kind);
															
 
																+#ifdef HAVE_CUDA_MEMCPY_PEER
															
 
																+		if (peer_copy)
															
 
																+		{
															
 
																+			cures = cudaMemcpyPeer((char *) dst_ptr, dst_dev,
															
 
																+					       (char *) src_ptr, src_dev,
															
 
																+					       ssize);
															
 
																+		}
															
 
																+		else
															
 
																+#endif
															
 
																+		{
															
 
																+			cures = cudaMemcpy((char *)dst_ptr, (char *)src_ptr, ssize, kind);
															
 
																+		}
															
 
																+		
															
 
																 		if (STARPU_UNLIKELY(cures))
															
 
																 			STARPU_CUDA_REPORT_ERROR(cures);
															
--- a/src/sched_policies/fifo_queues.c
+++ b/src/sched_policies/fifo_queues.c
@@ -52,14 +52,68 @@ int _starpu_fifo_empty(struct _starpu_fifo_taskq *fifo)
 
																 	return fifo->ntasks == 0;
															
 
																 }
															
 
																-/* TODO: revert front/back? */
															
 
																-int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
															
 
																+int
															
 
																+_starpu_fifo_push_sorted_task(struct _starpu_fifo_taskq *fifo_queue,
															
 
																+			      pthread_mutex_t *sched_mutex,
															
 
																+			      pthread_cond_t *sched_cond,
															
 
																+			      struct starpu_task *task)
															
 
																 {
															
 
																+	struct starpu_task_list *list = &fifo_queue->taskq;
															
 
																+
															
 
																 	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																 	_STARPU_TRACE_JOB_PUSH(task, 0);
															
 
																-	/* TODO: if prio, put at back */
															
 
																-	starpu_task_list_push_front(&fifo_queue->taskq, task);
															
 
																+
															
 
																+	if (list->head == NULL)
															
 
																+	{
															
 
																+		list->head = task;
															
 
																+		list->tail = task;
															
 
																+		task->prev = NULL;
															
 
																+		task->next = NULL;
															
 
																+	}
															
 
																+	else
															
 
																+	{
															
 
																+		struct starpu_task *current = list->head;
															
 
																+		struct starpu_task *prev = NULL;
															
 
																+
															
 
																+		while (current)
															
 
																+		{
															
 
																+			if (current->priority >= task->priority)
															
 
																+				break;
															
 
																+
															
 
																+			prev = current;
															
 
																+			current = current->next;
															
 
																+		}
															
 
																+
															
 
																+		if (prev == NULL)
															
 
																+		{
															
 
																+			/* Insert at the front of the list */
															
 
																+			list->head->prev = task;
															
 
																+			task->prev = NULL;
															
 
																+			task->next = list->head;
															
 
																+			list->head = task;
															
 
																+		}
															
 
																+		else
															
 
																+		{
															
 
																+			if (current)
															
 
																+			{
															
 
																+				/* Insert between prev and current */
															
 
																+				task->prev = prev;
															
 
																+				prev->next = task;
															
 
																+				task->next = current;
															
 
																+				current->prev = task;
															
 
																+			}
															
 
																+			else
															
 
																+			{
															
 
																+				/* Insert at the tail of the list */
															
 
																+				list->tail->next = task;
															
 
																+				task->next = NULL;
															
 
																+				task->prev = list->tail;
															
 
																+				list->tail = task;
															
 
																+			}
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																 	fifo_queue->ntasks++;
															
 
																 	fifo_queue->nprocessed++;
															
@@ -69,6 +123,33 @@ int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_
 
																 	return 0;
															
 
																 }
															
 
																+/* TODO: revert front/back? */
															
 
																+int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
															
 
																+{
															
 
																+
															
 
																+	if (task->priority > 0)
															
 
																+	{
															
 
																+		_STARPU_TRACE_JOB_PUSH(task, 1);
															
 
																+		_starpu_fifo_push_sorted_task(fifo_queue, sched_mutex,
															
 
																+					      sched_cond, task);
															
 
																+	}
															
 
																+	else
															
 
																+	{
															
 
																+		_STARPU_TRACE_JOB_PUSH(task, 0);
															
 
																+
															
 
																+		_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																+		starpu_task_list_push_front(&fifo_queue->taskq, task);
															
 
																+
															
 
																+		fifo_queue->ntasks++;
															
 
																+		fifo_queue->nprocessed++;
															
 
																+
															
 
																+		_STARPU_PTHREAD_COND_SIGNAL(sched_cond);
															
 
																+		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																+	}
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																 struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo_queue, int workerid)
															
 
																 {
															
 
																 	struct starpu_task *task;
															
--- a/src/sched_policies/fifo_queues.h
+++ b/src/sched_policies/fifo_queues.h
@@ -44,6 +44,11 @@ void _starpu_destroy_fifo(struct _starpu_fifo_taskq *fifo);
 
																 int _starpu_fifo_empty(struct _starpu_fifo_taskq *fifo);
															
 
																+int _starpu_fifo_push_sorted_task(struct _starpu_fifo_taskq *fifo_queue,
															
 
																+				  pthread_mutex_t *sched_mutex,
															
 
																+				  pthread_cond_t *sched_cond,
															
 
																+				  struct starpu_task *task);
															
 
																+
															
 
																 int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task);
															
 
																 struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo, int workerid);
															
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -77,24 +77,6 @@ NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_srcdir)/src -I$(top_builddir)/src
 
																 endif
															
 
																-if STARPU_USE_GORDON
															
 
																-
															
 
																-SPU_CC ?= spu-gcc
															
 
																-SPU_LD ?= spu-ld
															
 
																-
															
 
																-.c.spuo:
															
 
																-	$(MKDIR_P) `dirname $@`
															
 
																-	$(SPU_CC) -c -fpic $< -o $@
															
 
																-
															
 
																-.spuo.spuelf:
															
 
																-	$(MKDIR_P) `dirname $@`
															
 
																-	$(SPU_LD) $< -o $@
															
 
																-
															
 
																-#BUILT_SOURCES +=
															
 
																-#	microbenchs/null_kernel_gordon.spuelf
															
 
																-
															
 
																-endif
															
 
																-
															
 
																 testbindir = $(libdir)/starpu/tests
															
 
																 #####################################
															
@@ -120,10 +102,7 @@ if STARPU_COVERAGE_ENABLED
 
																 TESTS	+=	coverage/coverage.sh
															
 
																 endif
															
 
																-starpu_machine_display_SOURCES	=	../tools/starpu_machine_display.c
															
 
																-
															
 
																 noinst_PROGRAMS =				\
															
 
																-	starpu_machine_display			\
															
 
																 	main/deprecated_func			\
															
 
																 	main/deprecated_buffer			\
															
 
																 	main/driver_api/init_run_deinit         \
															
@@ -385,16 +364,6 @@ datawizard_wt_host_SOURCES =			\
 
																 datawizard_wt_broadcast_SOURCES =		\
															
 
																 	datawizard/wt_broadcast.c
															
 
																-if STARPU_USE_GORDON
															
 
																-datawizard_sync_and_notify_data_SOURCES +=	\
															
 
																-	datawizard/sync_and_notify_data_gordon_kernels.c
															
 
																-datawizard_sync_and_notify_data_implicit_SOURCES +=	\
															
 
																-	datawizard/sync_and_notify_data_gordon_kernels.c
															
 
																-BUILT_SOURCES += 						\
															
 
																-	datawizard/sync_and_notify_data_gordon_kernels.spuelf	\
															
 
																-	microbenchs/null_kernel_gordon.spuelf
															
 
																-endif
															
 
																-
															
 
																 ###################
															
 
																 # Block interface #
															
 
																 ###################