Browse Source

new merge

Andra Hugo 13 years ago
parent
commit
41f3054013

+ 3 - 0
ChangeLog

@@ -59,6 +59,9 @@ Changes:
       of the libOpenCL.so file of the OCL ICD implementation.
       of the libOpenCL.so file of the OCL ICD implementation.
   * Fix main memory leak on multiple unregister/re-register.
   * Fix main memory leak on multiple unregister/re-register.
   * Improve hwloc detection by configure
   * Improve hwloc detection by configure
+  * Cell:
+    - It is no longer possible to enable the cell support via the
+      gordon driver
 
 
 Small changes:
 Small changes:
   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is

+ 12 - 66
configure.ac

@@ -775,72 +775,26 @@ if test x$disable_asynchronous_opencl_copy = xyes ; then
    AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and OpenCL devices])
    AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and OpenCL devices])
 fi
 fi
 
 
-
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
-#                                 Cell settings                               #
+#                                 Cell                                        #
 #                                                                             #
 #                                                                             #
 ###############################################################################
 ###############################################################################
 
 
-#TODO fix the default dir
-AC_ARG_ENABLE(gordon, [AS_HELP_STRING([--enable-gordon],
-			[use Cell SPUs])],, enable_gordon=maybe)
-
-if test x$enable_gordon = xyes -o x$enable_gordon = xmaybe; then
-
-	AC_ARG_WITH(gordon-dir, [AS_HELP_STRING([--with-gordon-dir=<path>],
-			[specify Gordon installation directory (default is /usr/local/)])],
-			[
-				gordon_dir="$withval"
-				enable_gordon=yes
-			], gordon_dir=/usr/local/)
-
-	# do we have a valid Gordon setup ?
-	have_valid_gordon=yes
-
-	# can we use dynamic code loading facilities ?
-	AC_CHECK_LIB(elf, elf_memory,, [have_valid_gordon=no])
-
-	AC_CHECK_LIB(spe2, spe_context_create,,[have_valid_gordon=no])
-	AC_CHECK_FUNC(spe_in_mbox_write, [], [have_valid_gordon=no])
-
-	PKG_PROG_PKG_CONFIG
-	if test -d "$gordon_dir"; then
-		PKG_CONFIG_PATH="${PKG_CONFIG_PATH}:$gordon_dir"
-	fi
-	AC_SUBST(PKG_CONFIG_PATH)
-	PKG_CHECK_MODULES([GORDON], [libgordon], [], have_valid_gordon=no)
-
-	CPPFLAGS="${CPPFLAGS} ${GORDON_CFLAGS}"
-	LIBS="${LIBS} ${GORDON_LIBS}"
-
-	# AC_CHECK_FUNC(gordon_init, [gordon], [have_valid_gordon=no])
-
-	# in case Gordon was explicitely required, but is not available, this is an error
-	if test x$enable_gordon = xyes -a x$have_valid_gordon = xno; then
-		AC_MSG_ERROR([cannot find Gordon])
-	fi
-
+# warning: Cell driver has been removed from configure, but as the
+# source code is still available, we need to define the minimum
+# requirements to compile
+AC_DEFINE_UNQUOTED(STARPU_MAXGORDONDEVS, [1], [maximum number of GORDON devices])
 
 
-	# now we enable Gordon if and only if a proper setup is available
-	enable_gordon=$have_valid_gordon
-	AC_DEFINE(STARPU_MAXGORDONDEVS, [1], [maximum number of GORDON devices])
-fi
-
-AC_MSG_CHECKING(whether GORDON should be used)
-AC_MSG_RESULT($enable_gordon)
-AC_SUBST(STARPU_USE_GORDON, $enable_gordon)
-AM_CONDITIONAL(STARPU_USE_GORDON, test x$enable_gordon = xyes)
-
-if test x$enable_gordon = xyes; then
-	AC_DEFINE(STARPU_USE_GORDON, [1], [Cell support is enabled])
-	GORDON_REQUIRES=gordon
-fi
-AC_SUBST(GORDON_REQUIRES)
+###############################################################################
+#                                                                             #
+#                                 Drivers                                     #
+#                                                                             #
+###############################################################################
 
 
 AC_MSG_CHECKING(whether blocking drivers should be disabled)
 AC_MSG_CHECKING(whether blocking drivers should be disabled)
 AC_ARG_ENABLE(blocking-drivers, [AS_HELP_STRING([--disable-blocking-drivers], [disable blocking drivers])],
 AC_ARG_ENABLE(blocking-drivers, [AS_HELP_STRING([--disable-blocking-drivers], [disable blocking drivers])],
-				enable_blocking=$enableval, enable_blocking=$enable_gordon)
+				enable_blocking=$enableval, enable_blocking=no)
 AC_MSG_RESULT($enable_blocking)
 AC_MSG_RESULT($enable_blocking)
 
 
 if test x$enable_blocking = xno; then
 if test x$enable_blocking = xno; then
@@ -1015,8 +969,7 @@ AC_MSG_RESULT($nmaxbuffers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 		[how many buffers can be manipulated per task])
 		[how many buffers can be manipulated per task])
 
 
-# We have one memory node shared by all CPU workers, one node per GPU, and
-# currently the Cell driver is using the same memory node as the CPU.
+# We have one memory node shared by all CPU workers, one node per GPU
 maxnodes=1
 maxnodes=1
 if test x$enable_cuda = xyes ; then
 if test x$enable_cuda = xyes ; then
 	# we could have used nmaxcudadev + 1, but this would certainly give an
 	# we could have used nmaxcudadev + 1, but this would certainly give an
@@ -1406,12 +1359,6 @@ AC_ARG_ENABLE([socl],
 
 
 AC_MSG_CHECKING(for SOCL)
 AC_MSG_CHECKING(for SOCL)
 
 
-if test "x$enable_socl" = "xyes" -o "x$enable_socl" = "xmaybe" ; then
-    if test "$have_valid_opencl" = "no" ; then
-	STARPU_LOOK_FOR_OPENCL()
-    fi
-fi
-
 # in case SOCL was explicitely required, but is not available, this is an error
 # in case SOCL was explicitely required, but is not available, this is an error
 if test "x$enable_socl" = "xyes" -a "$have_valid_opencl" = "no" ; then
 if test "x$enable_socl" = "xyes" -a "$have_valid_opencl" = "no" ; then
     AC_MSG_ERROR([SOCL cannot be enabled without OpenCL])
     AC_MSG_ERROR([SOCL cannot be enabled without OpenCL])
@@ -1865,7 +1812,6 @@ AC_MSG_NOTICE([
 	CPUs   enabled: $enable_cpu
 	CPUs   enabled: $enable_cpu
 	CUDA   enabled: $enable_cuda
 	CUDA   enabled: $enable_cuda
 	OpenCL enabled: $enable_opencl
 	OpenCL enabled: $enable_opencl
-	Cell   enabled: $enable_gordon
 
 
 	Compile-time limits
 	Compile-time limits
 	(change these with --enable-maxcpus, --enable-maxcudadev,
 	(change these with --enable-maxcpus, --enable-maxcudadev,

+ 1 - 7
doc/chapters/advanced-api.texi

@@ -61,9 +61,6 @@ Compare the data size of two interfaces.
 @item @code{ void (*display)(starpu_data_handle_t handle, FILE *f)}
 @item @code{ void (*display)(starpu_data_handle_t handle, FILE *f)}
 Dump the sizes of a handle to a file.
 Dump the sizes of a handle to a file.
 
 
-@item @code{ int (*convert_to_gordon)(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss)}
-Convert the data size to the spu size format. If no SPUs are used, this field can be seto NULL.
-
 @item @code{enum starpu_data_interface_id interfaceid}
 @item @code{enum starpu_data_interface_id interfaceid}
 An identifier that is unique to each interface.
 An identifier that is unique to each interface.
 
 
@@ -534,9 +531,6 @@ Actual number of CUDA workers used by StarPU.
 @item @code{unsigned nopenclgpus}
 @item @code{unsigned nopenclgpus}
 Actual number of OpenCL workers used by StarPU.
 Actual number of OpenCL workers used by StarPU.
 
 
-@item @code{unsigned ngordon_spus}
-Actual number of Gordon workers used by StarPU.
-
 @item @code{unsigned workers_bindid[STARPU_NMAXWORKERS]}
 @item @code{unsigned workers_bindid[STARPU_NMAXWORKERS]}
 Indicates the successive cpu identifier that should be used to bind the
 Indicates the successive cpu identifier that should be used to bind the
 workers. It is either filled according to the user's explicit
 workers. It is either filled according to the user's explicit
@@ -699,7 +693,7 @@ assign tasks to the different workers.
 Remove all available tasks from the scheduler (tasks are chained by the means
 Remove all available tasks from the scheduler (tasks are chained by the means
 of the prev and next fields of the starpu_task structure). The mutex associated
 of the prev and next fields of the starpu_task structure). The mutex associated
 to the worker is already taken when this method is called. This is currently
 to the worker is already taken when this method is called. This is currently
-only used by the Gordon driver.
+not used.
 
 
 @item @code{void (*pre_exec_hook)(struct starpu_task *)} (optional)
 @item @code{void (*pre_exec_hook)(struct starpu_task *)} (optional)
 This method is called every time a task is starting.
 This method is called every time a task is starting.

+ 21 - 62
doc/chapters/basic-api.texi

@@ -20,7 +20,6 @@
 * Profiling API::
 * Profiling API::
 * CUDA extensions::
 * CUDA extensions::
 * OpenCL extensions::
 * OpenCL extensions::
-* Cell extensions::
 * Miscellaneous helpers::
 * Miscellaneous helpers::
 @end menu
 @end menu
 
 
@@ -94,10 +93,6 @@ be specified with the @code{STARPU_NCUDA} environment variable.
 This is the number of OpenCL devices that StarPU can use. This can
 This is the number of OpenCL devices that StarPU can use. This can
 also be specified with the @code{STARPU_NOPENCL} environment variable.
 also be specified with the @code{STARPU_NOPENCL} environment variable.
 
 
-@item @code{int nspus} (default = -1)
-This is the number of Cell SPUs that StarPU can use. This can also be
-specified with the @code{STARPU_NGORDON} environment variable.
-
 @item @code{unsigned use_explicit_workers_bindid} (default = 0)
 @item @code{unsigned use_explicit_workers_bindid} (default = 0)
 If this flag is set, the @code{workers_bindid} array indicates where the
 If this flag is set, the @code{workers_bindid} array indicates where the
 different workers are bound, otherwise StarPU automatically selects where to
 different workers are bound, otherwise StarPU automatically selects where to
@@ -243,7 +238,6 @@ The different values are:
 @item @code{STARPU_CPU_WORKER}
 @item @code{STARPU_CPU_WORKER}
 @item @code{STARPU_CUDA_WORKER}
 @item @code{STARPU_CUDA_WORKER}
 @item @code{STARPU_OPENCL_WORKER}
 @item @code{STARPU_OPENCL_WORKER}
-@item @code{STARPU_GORDON_WORKER}
 @end table
 @end table
 @end deftp
 @end deftp
 
 
@@ -273,10 +267,6 @@ This function returns the number of OpenCL devices controlled by StarPU. The ret
 value should be at most @code{STARPU_MAXOPENCLDEVS}.
 value should be at most @code{STARPU_MAXOPENCLDEVS}.
 @end deftypefun
 @end deftypefun
 
 
-@deftypefun unsigned starpu_spu_worker_get_count (void)
-This function returns the number of Cell SPUs controlled by StarPU.
-@end deftypefun
-
 @deftypefun int starpu_worker_get_id (void)
 @deftypefun int starpu_worker_get_id (void)
 This function returns the identifier of the current worker, i.e the one associated to the calling
 This function returns the identifier of the current worker, i.e the one associated to the calling
 thread. The returned value is either -1 if the current context is not a StarPU
 thread. The returned value is either -1 if the current context is not a StarPU
@@ -311,9 +301,8 @@ This function returns the type of processing unit associated to a
 worker. The worker identifier is a value returned by the
 worker. The worker identifier is a value returned by the
 @code{starpu_worker_get_id} function). The returned value
 @code{starpu_worker_get_id} function). The returned value
 indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
 indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
-core, @code{STARPU_CUDA_WORKER} for a CUDA device,
-@code{STARPU_OPENCL_WORKER} for a OpenCL device, and
-@code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
+core, @code{STARPU_CUDA_WORKER} for a CUDA device, and
+@code{STARPU_OPENCL_WORKER} for a OpenCL device. The value returned for an invalid
 identifier is unspecified.
 identifier is unspecified.
 @end deftypefun
 @end deftypefun
 
 
@@ -339,7 +328,6 @@ todo
 @item @code{STARPU_CPU_RAM}
 @item @code{STARPU_CPU_RAM}
 @item @code{STARPU_CUDA_RAM}
 @item @code{STARPU_CUDA_RAM}
 @item @code{STARPU_OPENCL_RAM}
 @item @code{STARPU_OPENCL_RAM}
-@item @code{STARPU_SPU_LS}
 @end table
 @end table
 @end deftp
 @end deftp
 
 
@@ -1150,7 +1138,7 @@ Return the size of the elements registered into the matrix designated by @var{in
 Return a pointer to the column array of the matrix designated by
 Return a pointer to the column array of the matrix designated by
 @var{interface}.
 @var{interface}.
 @end defmac
 @end defmac
-@defmac STARPU_COO_GET_COLUMNS_DEV_HANDLE({void *}@var{interface})
+@defmac STARPU_COO_GET_COLUMNS_DEV_HANDLE ({void *}@var{interface})
 Return a device handle for the column array of the matrix designated by
 Return a device handle for the column array of the matrix designated by
 @var{interface}, to be used on OpenCL. The offset documented below has to be
 @var{interface}, to be used on OpenCL. The offset documented below has to be
 used in addition to this.
 used in addition to this.
@@ -1158,7 +1146,7 @@ used in addition to this.
 @defmac STARPU_COO_GET_ROWS (interface)
 @defmac STARPU_COO_GET_ROWS (interface)
 Return a pointer to the rows array of the matrix designated by @var{interface}.
 Return a pointer to the rows array of the matrix designated by @var{interface}.
 @end defmac
 @end defmac
-@defmac STARPU_COO_GET_ROWS_DEV_HANDLE({void *}@var{interface})
+@defmac STARPU_COO_GET_ROWS_DEV_HANDLE ({void *}@var{interface})
 Return a device handle for the row array of the matrix designated by
 Return a device handle for the row array of the matrix designated by
 @var{interface}, to be used on OpenCL. The offset documented below has to be
 @var{interface}, to be used on OpenCL. The offset documented below has to be
 used in addition to this.
 used in addition to this.
@@ -1167,12 +1155,12 @@ used in addition to this.
 Return a pointer to the values array of the matrix designated by
 Return a pointer to the values array of the matrix designated by
 @var{interface}.
 @var{interface}.
 @end defmac
 @end defmac
-@defmac STARPU_COO_GET_VALUES_DEV_HANDLE({void *}@var{interface})
+@defmac STARPU_COO_GET_VALUES_DEV_HANDLE ({void *}@var{interface})
 Return a device handle for the value array of the matrix designated by
 Return a device handle for the value array of the matrix designated by
 @var{interface}, to be used on OpenCL. The offset documented below has to be
 @var{interface}, to be used on OpenCL. The offset documented below has to be
 used in addition to this.
 used in addition to this.
 @end defmac
 @end defmac
-@defmac STARPU_COO_GET_OFFSET({void *}@var{itnerface})
+@defmac STARPU_COO_GET_OFFSET ({void *}@var{itnerface})
 Return the offset in the arrays of the COO matrix designated by @var{interface}.
 Return the offset in the arrays of the COO matrix designated by @var{interface}.
 @end defmac
 @end defmac
 @defmac STARPU_COO_GET_NX (interface)
 @defmac STARPU_COO_GET_NX (interface)
@@ -1493,18 +1481,6 @@ starpu_codelet} to specify the codelet may be executed on a CUDA
 processing unit.
 processing unit.
 @end defmac
 @end defmac
 
 
-@defmac STARPU_SPU
-This macro is used when setting the field @code{where} of a @code{struct
-starpu_codelet} to specify the codelet may be executed on a SPU
-processing unit.
-@end defmac
-
-@defmac STARPU_GORDON
-This macro is used when setting the field @code{where} of a @code{struct
-starpu_codelet} to specify the codelet may be executed on a Cell
-processing unit.
-@end defmac
-
 @defmac STARPU_OPENCL
 @defmac STARPU_OPENCL
 This macro is used when setting the field @code{where} of a @code{struct
 This macro is used when setting the field @code{where} of a @code{struct
 starpu_codelet} to specify the codelet may be executed on a OpenCL
 starpu_codelet} to specify the codelet may be executed on a OpenCL
@@ -1542,12 +1518,12 @@ e.g. static storage case.
 @item @code{uint32_t where} (optional)
 @item @code{uint32_t where} (optional)
 Indicates which types of processing units are able to execute the
 Indicates which types of processing units are able to execute the
 codelet. The different values
 codelet. The different values
-@code{STARPU_CPU}, @code{STARPU_CUDA}, @code{STARPU_SPU},
-@code{STARPU_GORDON}, @code{STARPU_OPENCL} can be combined to specify
+@code{STARPU_CPU}, @code{STARPU_CUDA}, 
+@code{STARPU_OPENCL} can be combined to specify
 on which types of processing units the codelet can be executed.
 on which types of processing units the codelet can be executed.
 @code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
 @code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
-implemented for both CPU cores and CUDA devices while @code{STARPU_GORDON}
-indicates that it is only available on Cell SPUs. If the field is
+implemented for both CPU cores and CUDA devices while @code{STARPU_OPENCL}
+indicates that it is only available on OpenCL devices. If the field is
 unset, its value will be automatically set based on the availability
 unset, its value will be automatically set based on the availability
 of the @code{XXX_funcs} fields defined below.
 of the @code{XXX_funcs} fields defined below.
 
 
@@ -1607,17 +1583,6 @@ If the @code{where} field is set, then the @code{opencl_funcs} field
 is ignored if @code{STARPU_OPENCL} does not appear in the @code{where}
 is ignored if @code{STARPU_OPENCL} does not appear in the @code{where}
 field, it must be non-null otherwise.
 field, it must be non-null otherwise.
 
 
-@item @code{uint8_t gordon_func} (optional)
-This field has been made deprecated. One should use instead the
-@code{gordon_funcs} field.
-
-@item @code{uint8_t gordon_funcs[STARPU_MAXIMPLEMENTATIONS]} (optional)
-Is an array of index of the Cell SPU implementations of the codelet within the
-Gordon library.
-It must be terminated by a NULL value.
-See Gordon documentation for more details on how to register a kernel and
-retrieve its index.
-
 @item @code{unsigned nbuffers}
 @item @code{unsigned nbuffers}
 Specifies the number of arguments taken by the codelet. These arguments are
 Specifies the number of arguments taken by the codelet. These arguments are
 managed by the DSM and are accessed from the @code{void *buffers[]}
 managed by the DSM and are accessed from the @code{void *buffers[]}
@@ -1726,17 +1691,17 @@ by the DSM.
 @item @code{void *cl_arg} (optional; default: @code{NULL})
 @item @code{void *cl_arg} (optional; default: @code{NULL})
 This pointer is passed to the codelet through the second argument
 This pointer is passed to the codelet through the second argument
 of the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).
 of the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).
-In the specific case of the Cell processor, see the @code{cl_arg_size}
-argument.
-
-@item @code{size_t cl_arg_size} (optional, Cell-specific)
-In the case of the Cell processor, the @code{cl_arg} pointer is not directly
-given to the SPU function. A buffer of size @code{cl_arg_size} is allocated on
-the SPU. This buffer is then filled with the @code{cl_arg_size} bytes starting
-at address @code{cl_arg}. In this case, the argument given to the SPU codelet
-is therefore not the @code{cl_arg} pointer, but the address of the buffer in
-local store (LS) instead. This field is ignored for CPU, CUDA and OpenCL
-codelets, where the @code{cl_arg} pointer is given as such.
+
+@item @code{size_t cl_arg_size} (optional)
+For some specific drivers, the @code{cl_arg} pointer cannot not be directly
+given to the driver function. A buffer of size @code{cl_arg_size}
+needs to be allocated on the driver. This buffer is then filled with
+the @code{cl_arg_size} bytes starting at address @code{cl_arg}. In
+this case, the argument given to the codelet is therefore not the
+@code{cl_arg} pointer, but the address of the buffer in local store
+(LS) instead.
+This field is ignored for CPU, CUDA and OpenCL codelets, where the
+@code{cl_arg} pointer is given as such.
 
 
 @item @code{void (*callback_func)(void *)} (optional) (default: @code{NULL})
 @item @code{void (*callback_func)(void *)} (optional) (default: @code{NULL})
 This is a function pointer of prototype @code{void (*f)(void *)} which
 This is a function pointer of prototype @code{void (*f)(void *)} which
@@ -2089,7 +2054,6 @@ OpenCL types range within STARPU_OPENCL_DEFAULT (GPU number 0), STARPU_OPENCL_DE
 @item @code{STARPU_CPU_DEFAULT}
 @item @code{STARPU_CPU_DEFAULT}
 @item @code{STARPU_CUDA_DEFAULT}
 @item @code{STARPU_CUDA_DEFAULT}
 @item @code{STARPU_OPENCL_DEFAULT}
 @item @code{STARPU_OPENCL_DEFAULT}
-@item @code{STARPU_GORDON_DEFAULT}
 @end table
 @end table
 @end deftp
 @end deftp
 
 
@@ -2742,11 +2706,6 @@ otherwise. The integer pointed to by @var{ret} is set to -EAGAIN if the asynchro
 was successful, or to 0 if event was NULL.
 was successful, or to 0 if event was NULL.
 @end deftypefun
 @end deftypefun
 
 
-@node Cell extensions
-@section Cell extensions
-
-nothing yet.
-
 @node Miscellaneous helpers
 @node Miscellaneous helpers
 @section Miscellaneous helpers
 @section Miscellaneous helpers
 
 

+ 1 - 1
doc/chapters/basic-examples.texi

@@ -161,7 +161,7 @@ struct starpu_codelet cl =
 
 
 A codelet is a structure that represents a computational kernel. Such a codelet
 A codelet is a structure that represents a computational kernel. Such a codelet
 may contain an implementation of the same kernel on different architectures
 may contain an implementation of the same kernel on different architectures
-(e.g. CUDA, Cell's SPU, x86, ...). For compatibility, make sure that the whole
+(e.g. CUDA, x86, ...). For compatibility, make sure that the whole
 structure is initialized to zero, either by using memset, or by letting the
 structure is initialized to zero, either by using memset, or by letting the
 compiler implicitly do it as examplified above.
 compiler implicitly do it as examplified above.
 
 

+ 1 - 11
doc/chapters/configuration.texi

@@ -121,13 +121,6 @@ Search for an OpenCL library under @var{dir}, which should notably
 contain the OpenCL shared libraries---e.g. @file{libOpenCL.so}. This defaults to
 contain the OpenCL shared libraries---e.g. @file{libOpenCL.so}. This defaults to
 @code{/lib} appended to the value given to @code{--with-opencl-dir}.
 @code{/lib} appended to the value given to @code{--with-opencl-dir}.
 
 
-@item --enable-gordon
-Enable the use of the Gordon runtime for Cell SPUs.
-@c TODO: rather default to enabled when detected
-
-@item --with-gordon-dir=@var{prefix}
-Search for the Gordon SDK under @var{prefix}.
-
 @item --enable-maximplementations=@var{count}
 @item --enable-maximplementations=@var{count}
 Allow for at most @var{count} codelet implementations for the same
 Allow for at most @var{count} codelet implementations for the same
 target device.  This information is then available as the
 target device.  This information is then available as the
@@ -292,9 +285,6 @@ create as many CUDA workers as there are CUDA devices.
 @item @code{STARPU_NOPENCL}
 @item @code{STARPU_NOPENCL}
 OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
 OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
 
 
-@item @code{STARPU_NGORDON}
-Specify the number of SPUs that StarPU can use.
-
 @item @code{STARPU_WORKERS_NOBIND}
 @item @code{STARPU_WORKERS_NOBIND}
 Setting it to non-zero will prevent StarPU from binding its threads to
 Setting it to non-zero will prevent StarPU from binding its threads to
 CPUs. This is for instance useful when running the testsuite in parallel.
 CPUs. This is for instance useful when running the testsuite in parallel.
@@ -309,7 +299,7 @@ determined by the OS, or provided by the @code{hwloc} library in case it is
 available.
 available.
 
 
 Note that the first workers correspond to the CUDA workers, then come the
 Note that the first workers correspond to the CUDA workers, then come the
-OpenCL and the SPU, and finally the CPU workers. For example if
+OpenCL workers, and finally the CPU workers. For example if
 we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPU=2}
 we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPU=2}
 and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
 and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
 by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
 by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and

+ 2 - 2
doc/chapters/introduction.texi

@@ -83,7 +83,7 @@ The remainder of this section describes the main concepts used in StarPU.
 @cindex codelet
 @cindex codelet
 One of the StarPU primary data structures is the @b{codelet}. A codelet describes a
 One of the StarPU primary data structures is the @b{codelet}. A codelet describes a
 computational kernel that can possibly be implemented on multiple architectures
 computational kernel that can possibly be implemented on multiple architectures
-such as a CPU, a CUDA device or a Cell's SPU.
+such as a CPU, a CUDA device or an OpenCL device.
 
 
 @c TODO insert illustration f: f_spu, f_cpu, ...
 @c TODO insert illustration f: f_spu, f_cpu, ...
 
 
@@ -150,7 +150,7 @@ A @b{worker} execute tasks. There is typically one per CPU computation core and
 one per accelerator (for which a whole CPU core is dedicated).
 one per accelerator (for which a whole CPU core is dedicated).
 
 
 A @b{driver} drives a given kind of workers. There are currently CPU, CUDA,
 A @b{driver} drives a given kind of workers. There are currently CPU, CUDA,
-OpenCL and Gordon drivers. They usually start several workers to actually drive
+and OpenCL drivers. They usually start several workers to actually drive
 them.
 them.
 
 
 A @b{performance model} is a (dynamic or static) model of the performance of a
 A @b{performance model} is a (dynamic or static) model of the performance of a

+ 0 - 21
examples/Makefile.am

@@ -75,27 +75,6 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 
 
 endif
 endif
 
 
-if STARPU_USE_GORDON
-
-SPU_CC ?= spu-gcc
-SPU_LD ?= spu-ld
-
-SPULDFLAGS =
-SPULIBS = -lblas #-lc -lgloss -lc
-
-.c.spuo:
-	$(MKDIR_P) `dirname $@`
-	$(SPU_CC) -c -fpic $< -o $@
-
-.spuo.spuelf:
-	$(MKDIR_P) `dirname $@`
-	$(SPU_LD) $(SPULDFLAGS) $< -o $@ $(SPULIBS)
-
-BUILT_SOURCES +=				\
-	gordon/null_kernel_gordon.spuelf
-
-endif
-
 if STARPU_HAVE_ICC
 if STARPU_HAVE_ICC
 .icc.o:
 .icc.o:
 	$(ICC) -x c $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
 	$(ICC) -x c $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \

+ 0 - 4
src/Makefile.am

@@ -227,10 +227,6 @@ endif
 
 
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas.c
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas.c
 
 
-if STARPU_USE_GORDON
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/gordon/driver_gordon.c
-endif
-
 if STARPU_USE_OPENCL
 if STARPU_USE_OPENCL
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl.c
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl.c
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl_utils.c
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl_utils.c

+ 6 - 0
src/core/perfmodel/perfmodel_bus.c

@@ -94,6 +94,8 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	size_t size = SIZE;
 	size_t size = SIZE;
 
 
 	/* Initialize CUDA context on the device */
 	/* Initialize CUDA context on the device */
+	/* We do not need to enable OpenGL interoperability at this point,
+	 * since we cleanly shutdown CUDA before returning. */
 	cudaSetDevice(dev);
 	cudaSetDevice(dev);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
@@ -189,6 +191,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
         if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
         if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
 
 
 	/* Initialize CUDA context on the source */
 	/* Initialize CUDA context on the source */
+	/* We do not need to enable OpenGL interoperability at this point,
+	 * since we cleanly shutdown CUDA before returning. */
 	cudaSetDevice(src);
 	cudaSetDevice(src);
 
 
 	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) {
 	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) {
@@ -207,6 +211,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	cudaMemset(s_buffer, 0, size);
 	cudaMemset(s_buffer, 0, size);
 
 
 	/* Initialize CUDA context on the destination */
 	/* Initialize CUDA context on the destination */
+	/* We do not need to enable OpenGL interoperability at this point,
+	 * since we cleanly shutdown CUDA before returning. */
 	cudaSetDevice(dst);
 	cudaSetDevice(dst);
 
 
 	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) {
 	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) {

+ 23 - 2
src/datawizard/interfaces/coo_interface.c

@@ -133,6 +133,27 @@ copy_cuda_to_ram_async(void *src_interface, unsigned src_node,
 				    dst_interface, dst_node,
 				    dst_interface, dst_node,
 				    stream, cudaMemcpyDeviceToHost);
 				    stream, cudaMemcpyDeviceToHost);
 }
 }
+
+static int
+copy_cuda_to_cuda(void *src_interface, unsigned src_node,
+		  void *dst_interface, unsigned dst_node)
+{
+	return copy_cuda_async_sync(src_interface, src_node,
+				    dst_interface, dst_node,
+				    NULL, cudaMemcpyDeviceToDevice);
+}
+
+#ifdef NO_STRIDE
+static int
+copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
+			void *dst_interface, unsigned dst_node,
+			cudaStream_t stream)
+{
+	return copy_cuda_async_sync(src_interface, src_node,
+				    dst_interface, dst_node,
+				    stream, cudaMemcpyDeviceToDevice);
+}
+#endif /* !NO_STRIDE */
 #endif /* !STARPU_USE_CUDA */
 #endif /* !STARPU_USE_CUDA */
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
@@ -281,9 +302,9 @@ static struct starpu_data_copy_methods coo_copy_data_methods =
 	.cuda_to_ram         = copy_cuda_to_ram,
 	.cuda_to_ram         = copy_cuda_to_ram,
 	.ram_to_cuda_async   = copy_ram_to_cuda_async,
 	.ram_to_cuda_async   = copy_ram_to_cuda_async,
 	.cuda_to_ram_async   = copy_cuda_to_ram_async,
 	.cuda_to_ram_async   = copy_cuda_to_ram_async,
-	.cuda_to_cuda        = NULL, /* TODO */
+	.cuda_to_cuda        = copy_cuda_to_cuda,
 #ifdef NO_STRIDE
 #ifdef NO_STRIDE
-	.cuda_to_cuda_async  = NULL, /* TODO */
+	.cuda_to_cuda_async  = copy_cuda_to_cuda_async,
 #endif
 #endif
 #endif /* !STARPU_USE_CUDA */
 #endif /* !STARPU_USE_CUDA */
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL

+ 48 - 5
src/drivers/cuda/driver_cuda.c

@@ -507,21 +507,64 @@ void starpu_cuda_report_error(const char *func, const char *file, int line, cuda
 	STARPU_ABORT();
 	STARPU_ABORT();
 }
 }
 
 
-int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind)
+int
+starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
+			    void *dst_ptr, unsigned dst_node,
+			    size_t ssize, cudaStream_t stream,
+			    enum cudaMemcpyKind kind)
 {
 {
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	int peer_copy = 0;
+	int src_dev = -1, dst_dev = -1;
+#endif
 	cudaError_t cures = 0;
 	cudaError_t cures = 0;
 
 
+	if (kind == cudaMemcpyDeviceToDevice && src_node != dst_node)
+	{
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		peer_copy = 1;
+		src_dev = _starpu_memory_node_to_devid(src_node);
+		dst_dev = _starpu_memory_node_to_devid(dst_node);
+#else
+		STARPU_ABORT();
+#endif
+	}
+
 	if (stream)
 	if (stream)
 	{
 	{
-	     _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-	     cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
-	     _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		if (peer_copy)
+		{
+			cures = cudaMemcpyPeerAsync((char *) dst_ptr, dst_dev,
+						    (char *) src_ptr, src_dev,
+						    ssize, stream);
+		}
+		else
+#endif
+		{
+			cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
+		}
+		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 	}
 	}
+
 	/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
 	/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
 	if (stream == NULL || cures)
 	if (stream == NULL || cures)
 	{
 	{
 		/* do it in a synchronous fashion */
 		/* do it in a synchronous fashion */
-		cures = cudaMemcpy((char *)dst_ptr, (char *)src_ptr, ssize, kind);
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		if (peer_copy)
+		{
+			cures = cudaMemcpyPeer((char *) dst_ptr, dst_dev,
+					       (char *) src_ptr, src_dev,
+					       ssize);
+		}
+		else
+#endif
+		{
+			cures = cudaMemcpy((char *)dst_ptr, (char *)src_ptr, ssize, kind);
+		}
+		
 
 
 		if (STARPU_UNLIKELY(cures))
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 			STARPU_CUDA_REPORT_ERROR(cures);

+ 85 - 4
src/sched_policies/fifo_queues.c

@@ -52,14 +52,68 @@ int _starpu_fifo_empty(struct _starpu_fifo_taskq *fifo)
 	return fifo->ntasks == 0;
 	return fifo->ntasks == 0;
 }
 }
 
 
-/* TODO: revert front/back? */
-int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
+int
+_starpu_fifo_push_sorted_task(struct _starpu_fifo_taskq *fifo_queue,
+			      pthread_mutex_t *sched_mutex,
+			      pthread_cond_t *sched_cond,
+			      struct starpu_task *task)
 {
 {
+	struct starpu_task_list *list = &fifo_queue->taskq;
+
 	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 
 
 	_STARPU_TRACE_JOB_PUSH(task, 0);
 	_STARPU_TRACE_JOB_PUSH(task, 0);
-	/* TODO: if prio, put at back */
-	starpu_task_list_push_front(&fifo_queue->taskq, task);
+
+	if (list->head == NULL)
+	{
+		list->head = task;
+		list->tail = task;
+		task->prev = NULL;
+		task->next = NULL;
+	}
+	else
+	{
+		struct starpu_task *current = list->head;
+		struct starpu_task *prev = NULL;
+
+		while (current)
+		{
+			if (current->priority >= task->priority)
+				break;
+
+			prev = current;
+			current = current->next;
+		}
+
+		if (prev == NULL)
+		{
+			/* Insert at the front of the list */
+			list->head->prev = task;
+			task->prev = NULL;
+			task->next = list->head;
+			list->head = task;
+		}
+		else
+		{
+			if (current)
+			{
+				/* Insert between prev and current */
+				task->prev = prev;
+				prev->next = task;
+				task->next = current;
+				current->prev = task;
+			}
+			else
+			{
+				/* Insert at the tail of the list */
+				list->tail->next = task;
+				task->next = NULL;
+				task->prev = list->tail;
+				list->tail = task;
+			}
+		}
+	}
+
 	fifo_queue->ntasks++;
 	fifo_queue->ntasks++;
 	fifo_queue->nprocessed++;
 	fifo_queue->nprocessed++;
 
 
@@ -69,6 +123,33 @@ int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_
 	return 0;
 	return 0;
 }
 }
 
 
+/* TODO: revert front/back? */
+int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
+{
+
+	if (task->priority > 0)
+	{
+		_STARPU_TRACE_JOB_PUSH(task, 1);
+		_starpu_fifo_push_sorted_task(fifo_queue, sched_mutex,
+					      sched_cond, task);
+	}
+	else
+	{
+		_STARPU_TRACE_JOB_PUSH(task, 0);
+
+		_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+		starpu_task_list_push_front(&fifo_queue->taskq, task);
+
+		fifo_queue->ntasks++;
+		fifo_queue->nprocessed++;
+
+		_STARPU_PTHREAD_COND_SIGNAL(sched_cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+	}
+
+	return 0;
+}
+
 struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo_queue, int workerid)
 struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo_queue, int workerid)
 {
 {
 	struct starpu_task *task;
 	struct starpu_task *task;

+ 5 - 0
src/sched_policies/fifo_queues.h

@@ -44,6 +44,11 @@ void _starpu_destroy_fifo(struct _starpu_fifo_taskq *fifo);
 
 
 int _starpu_fifo_empty(struct _starpu_fifo_taskq *fifo);
 int _starpu_fifo_empty(struct _starpu_fifo_taskq *fifo);
 
 
+int _starpu_fifo_push_sorted_task(struct _starpu_fifo_taskq *fifo_queue,
+				  pthread_mutex_t *sched_mutex,
+				  pthread_cond_t *sched_cond,
+				  struct starpu_task *task);
+
 int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task);
 int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task);
 
 
 struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo, int workerid);
 struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo, int workerid);

+ 0 - 31
tests/Makefile.am

@@ -77,24 +77,6 @@ NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_srcdir)/src -I$(top_builddir)/src
 
 
 endif
 endif
 
 
-if STARPU_USE_GORDON
-
-SPU_CC ?= spu-gcc
-SPU_LD ?= spu-ld
-
-.c.spuo:
-	$(MKDIR_P) `dirname $@`
-	$(SPU_CC) -c -fpic $< -o $@
-
-.spuo.spuelf:
-	$(MKDIR_P) `dirname $@`
-	$(SPU_LD) $< -o $@
-
-#BUILT_SOURCES +=
-#	microbenchs/null_kernel_gordon.spuelf
-
-endif
-
 testbindir = $(libdir)/starpu/tests
 testbindir = $(libdir)/starpu/tests
 
 
 #####################################
 #####################################
@@ -120,10 +102,7 @@ if STARPU_COVERAGE_ENABLED
 TESTS	+=	coverage/coverage.sh
 TESTS	+=	coverage/coverage.sh
 endif
 endif
 
 
-starpu_machine_display_SOURCES	=	../tools/starpu_machine_display.c
-
 noinst_PROGRAMS =				\
 noinst_PROGRAMS =				\
-	starpu_machine_display			\
 	main/deprecated_func			\
 	main/deprecated_func			\
 	main/deprecated_buffer			\
 	main/deprecated_buffer			\
 	main/driver_api/init_run_deinit         \
 	main/driver_api/init_run_deinit         \
@@ -385,16 +364,6 @@ datawizard_wt_host_SOURCES =			\
 datawizard_wt_broadcast_SOURCES =		\
 datawizard_wt_broadcast_SOURCES =		\
 	datawizard/wt_broadcast.c
 	datawizard/wt_broadcast.c
 
 
-if STARPU_USE_GORDON
-datawizard_sync_and_notify_data_SOURCES +=	\
-	datawizard/sync_and_notify_data_gordon_kernels.c
-datawizard_sync_and_notify_data_implicit_SOURCES +=	\
-	datawizard/sync_and_notify_data_gordon_kernels.c
-BUILT_SOURCES += 						\
-	datawizard/sync_and_notify_data_gordon_kernels.spuelf	\
-	microbenchs/null_kernel_gordon.spuelf
-endif
-
 ###################
 ###################
 # Block interface #
 # Block interface #
 ###################
 ###################