
merge trunk -r 6400:6500

Nathalie Furmento, 12 years ago
commit 22a0e63018
67 changed files with 1550 additions and 538 deletions
  1. ChangeLog (+6 -0)
  2. configure.ac (+22 -8)
  3. doc/chapters/advanced-examples.texi (+20 -12)
  4. doc/chapters/basic-api.texi (+72 -5)
  5. doc/chapters/configuration.texi (+0 -4)
  6. doc/chapters/mpi-support.texi (+11 -8)
  7. doc/chapters/perf-feedback.texi (+1 -1)
  8. examples/Makefile.am (+15 -0)
  9. examples/common/blas.c (+2 -2)
  10. examples/common/blas.h (+2 -2)
  11. examples/filters/fvector.c (+1 -0)
  12. examples/gl_interop/gl_interop.c (+130 -0)
  13. examples/interface/complex_interface.c (+57 -29)
  14. examples/matvecmult/matvecmult.c (+7 -7)
  15. examples/scheduler/dummy_sched.c (+4 -15)
  16. gcc-plugin/src/Makefile.am (+2 -1)
  17. gcc-plugin/src/c-expr.y (+11 -7)
  18. gcc-plugin/src/starpu.c (+17 -4)
  19. gcc-plugin/tests/register.c (+6 -1)
  20. include/starpu.h (+48 -10)
  21. include/starpu_cuda.h (+6 -2)
  22. include/starpu_task.h (+4 -5)
  23. src/core/dependencies/tags.c (+10 -0)
  24. src/core/perfmodel/perfmodel_bus.c (+7 -7)
  25. src/core/task.c (+27 -1)
  26. src/core/workers.c (+112 -3)
  27. src/core/workers.h (+4 -1)
  28. src/datawizard/copy_driver.c (+1 -2)
  29. src/datawizard/interfaces/block_interface.c (+2 -22)
  30. src/datawizard/interfaces/csr_interface.c (+15 -83)
  31. src/datawizard/interfaces/matrix_interface.c (+2 -4)
  32. src/datawizard/interfaces/variable_interface.c (+16 -57)
  33. src/datawizard/interfaces/vector_interface.c (+15 -53)
  34. src/datawizard/memalloc.c (+2 -4)
  35. src/drivers/cpu/driver_cpu.c (+0 -1)
  36. src/drivers/cuda/driver_cuda.c (+81 -10)
  37. src/drivers/gordon/driver_gordon.c (+1 -1)
  38. src/drivers/opencl/driver_opencl.c (+39 -2)
  39. src/drivers/opencl/driver_opencl_utils.c (+0 -2)
  40. tests/Makefile.am (+4 -1)
  41. tests/datawizard/gpu_register.c (+3 -3)
  42. tests/datawizard/interfaces/bcsr/bcsr_interface.c (+5 -6)
  43. tests/datawizard/interfaces/block/block_interface.c (+4 -6)
  44. tests/datawizard/interfaces/csr/csr_interface.c (+5 -6)
  45. tests/datawizard/interfaces/matrix/matrix_interface.c (+4 -6)
  46. tests/datawizard/interfaces/multiformat/advanced/multiformat_cuda_opencl.c (+5 -6)
  47. tests/datawizard/interfaces/multiformat/advanced/multiformat_data_release.c (+5 -6)
  48. tests/datawizard/interfaces/multiformat/advanced/multiformat_handle_conversion.c (+5 -6)
  49. tests/datawizard/interfaces/multiformat/multiformat_interface.c (+4 -6)
  50. tests/datawizard/interfaces/variable/variable_interface.c (+5 -6)
  51. tests/datawizard/interfaces/vector/test_vector_interface.c (+4 -6)
  52. tests/datawizard/interfaces/void/void_interface.c (+4 -6)
  53. tests/datawizard/write_only_tmp_buffer.c (+2 -1)
  54. tests/errorcheck/invalid_tasks.c (+4 -2)
  55. tests/errorcheck/starpu_init_noworker.c (+7 -13)
  56. tests/experiments/latency/cuda_latency.c (+1 -1)
  57. tests/main/starpu_task_wait_for_all.c (+7 -17)
  58. tests/main/subgraph_repeat_regenerate_tag.c (+173 -0)
  59. tests/main/subgraph_repeat_tag.c (+158 -0)
  60. tests/main/submit.c (+115 -0)
  61. tests/microbenchs/async_tasks_overhead.c (+6 -16)
  62. tests/microbenchs/matrix_as_vector.c (+224 -0)
  63. tests/parallel_tasks/explicit_combined_worker.c (+5 -13)
  64. tests/parallel_tasks/parallel_kernels.c (+4 -12)
  65. tests/parallel_tasks/parallel_kernels_spmd.c (+4 -12)
  66. tools/starpu_perfmodel_plot.c (+2 -2)
  67. tools/starpu_workers_activity.in (+3 -3)

+ 6 - 0
ChangeLog

@@ -14,6 +14,12 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
+StarPU 1.1.0 (svn revision xxxx)
+==============================================
+
+New features:
+  * OpenGL interoperability support.
+
 StarPU 1.0.0 (svn revision 6306)
 ==============================================
 The extensions-again release

+ 22 - 8
configure.ac

@@ -16,7 +16,7 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AC_INIT([StarPU],1.0.0, [starpu-devel@lists.gforge.inria.fr], starpu)
+AC_INIT([StarPU],1.1.0, [starpu-devel@lists.gforge.inria.fr], starpu)
 AC_CONFIG_SRCDIR(include/starpu.h)
 AC_CONFIG_AUX_DIR([build-aux])
 
@@ -429,14 +429,30 @@ if test x$enable_cuda = xyes -o x$enable_cuda = xmaybe; then
     if test "$have_valid_cuda" = "yes" ; then
         SAVED_CPPFLAGS="${CPPFLAGS}"
         CPPFLAGS="${CPPFLAGS} ${STARPU_CUDA_CPPFLAGS}"
+	SAVED_LDFLAGS="${LDFLAGS}"
+	LDFLAGS="${LDFLAGS} ${STARPU_CUDA_LDFLAGS} -lcuda"
 	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
 		[[#include <cuda.h>]],
 		[[]]
-		),
-	    [have_valid_cuda="yes"],
-	    [have_valid_cuda="no"]
-	])
+		)],
+	    [
+	      AC_RUN_IFELSE([AC_LANG_PROGRAM(
+	        [[#include <cuda.h>]],
+		[[]]
+		)],
+		[have_valid_cuda="yes"],
+		[
+	          AC_MSG_RESULT([CUDA found and can be compiled, but compiled application can not be run, is the CUDA path missing in LD_LIBRARY_PATH?])
+	          have_valid_cuda="no"
+		])
+	    ],
+	    [
+	    AC_MSG_ERROR([CUDA found, but cuda.h could not be compiled])
+	    have_valid_cuda="no"
+	    ]
+	)
         CPPFLAGS="${SAVED_CPPFLAGS}"
+	LDFLAGS="${SAVED_LDFLAGS}"
     fi
     AC_MSG_RESULT($have_valid_cuda)
 
@@ -873,9 +889,6 @@ if test x$use_fxt = xyes; then
 	else
 	    PKG_CHECK_MODULES([FXT],  [fxt])
 	fi
-
-	# if we use monotonic clocks, FxT uses -lrt
-	AC_CHECK_LIB(rt, clock_gettime,,AC_MSG_ERROR([cannot find clock_gettime]))
 fi
 
 AC_MSG_CHECKING(whether performance debugging should be enabled)
@@ -1345,6 +1358,7 @@ fi
 AC_MSG_CHECKING(whether OpenGL rendering is enabled)
 AC_SUBST(STARPU_OPENGL_RENDER, $enable_opengl_render)
 AC_MSG_RESULT($enable_opengl_render)
+AM_CONDITIONAL([HAVE_OPENGL], [test "x$enable_opengl_render" = xyes])
 
 AC_PATH_XTRA
 if test "x$no_x" != "xyes"; then

+ 20 - 12
doc/chapters/advanced-examples.texi

@@ -652,8 +652,8 @@ and dtq_handle can now be used in @code{STARPU_REDUX} mode for the dot products
 with partitioned vectors:
 
 @smallexample
-int dots(starpu_data_handle v1, starpu_data_handle v2,
-         starpu_data_handle s, unsigned nblocks)
+int dots(starpu_data_handle_t v1, starpu_data_handle_t v2,
+         starpu_data_handle_t s, unsigned nblocks)
 @{
     starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
     for (b = 0; b < nblocks; b++)
@@ -913,8 +913,22 @@ Graphical-oriented applications need to draw the result of their computations,
 typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
 interoperability permit to let CUDA directly work on the OpenGL buffers, making
 them thus immediately ready for drawing, by mapping OpenGL buffer, textures or
-renderbuffer objects into CUDA. To achieve this with StarPU, it simply needs to
-be given the CUDA pointer at registration, for instance:
+renderbuffer objects into CUDA.  CUDA however imposes some technical
+constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
+to be the one that runs CUDA computations for that GPU.
+
+To achieve this with StarPU, pass the @code{--disable-cuda-memcpy-peer} option
+to @code{./configure} (TODO: make it dynamic), enable the interoperability mode
+through the @code{cuda_opengl_interoperability} field of the
+@code{starpu_conf} structure, and run the driver loop from the application
+itself: use the @code{not_launched_drivers} field of @code{starpu_conf} to
+prevent StarPU from running the loop in a separate thread, and call
+@code{starpu_run_driver} to run it. The @code{gl_interop} example shows how
+these pieces fit together in a simple case, where rendering is done in task
+callbacks. TODO: provide a glutIdleFunc alternative.
+
+Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be given
+the CUDA pointer at registration, for instance:
 
 @cartouche
 @smallexample
@@ -922,21 +936,15 @@ for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
         if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
                 break;
 
-cudaSetDevice(starpu_worker_get_devid(workerid));
 cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
 starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid), output, num_bytes / sizeof(float4), sizeof(float4));
 
 starpu_insert_task(&cl, STARPU_RW, handle, 0);
-
-starpu_data_unregister(handle);
-
-cudaSetDevice(starpu_worker_get_devid(workerid));
-cudaGraphicsUnmapResources(1, &resource, 0);
-
-/* Now display it */
 @end smallexample
 @end cartouche
 
+and display it e.g. in the callback function.
+
 @node More examples
 @section More examples
 

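The procedure described in the chapter above boils down to a handful of calls. The following is a minimal sketch, assuming CUDA device 0 drives the display; the function name run_with_gl_interop is made up, error handling and the GLUT/OpenGL calls are omitted, and the complete version is the gl_interop example added further down in this commit.

#include <starpu.h>

int run_with_gl_interop(void)
{
	struct starpu_conf conf;
	int gl_devices[] = { 0 };	/* CUDA device 0 is the one driving the display */
	struct starpu_driver drivers[] = {
		{ .type = STARPU_CUDA_WORKER, .id.cuda_id = 0 }
	};

	starpu_conf_init(&conf);
	conf.cuda_opengl_interoperability = gl_devices;
	conf.n_cuda_opengl_interoperability = 1;
	conf.not_launched_drivers = drivers;	/* StarPU will not spawn a thread for this driver */
	conf.n_not_launched_drivers = 1;

	if (starpu_init(&conf) != 0)
		return 77;

	/* ... submit tasks here; rendering happens in their callbacks ... */

	/* Run the CUDA driver loop in this (the GLUT) thread; it is expected to
	 * return once starpu_set_end_of_submissions() has been called and all
	 * submitted tasks have completed. */
	starpu_run_driver(&drivers[0]);

	starpu_shutdown();
	return 0;
}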
+ 72 - 5
doc/chapters/basic-api.texi

@@ -36,9 +36,27 @@ Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}
 indicates that no worker was available (so that StarPU was not initialized).
 @end deftypefun
 
+@deftp {Data Type} {struct starpu_driver}
+@table @asis
+@item @code{enum starpu_archtype type}
+The type of the driver. Only STARPU_CUDA_WORKER and STARPU_OPENCL_WORKER are
+currently supported.
+@item @code{union id}
+@deftp {Data type} {anonymous union}
+@table @asis
+@item @code{unsigned cuda_id}
+Should only be used if type is STARPU_CUDA_WORKER.
+@item @code{cl_device_id opencl_id}
+Should only be used if type is STARPU_OPENCL_WORKER.
+@end table
+@end deftp
+@end table
+@end deftp
+
+
 @deftp {Data Type} {struct starpu_conf}
 This structure is passed to the @code{starpu_init} function in order
-to configure StarPU.
+to configure StarPU. It has to be initialized with @code{starpu_conf_init}.
 When the default value is used, StarPU automatically selects the number of
 processing units and takes the default scheduling policy. The environment
 variables overwrite the equivalent parameters.
@@ -122,6 +140,22 @@ CPUs and accelerators. This can also be specified with the
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
+
+@item @code{int *cuda_opengl_interoperability} (default = NULL)
+This can be set to an array of CUDA device identifiers for which
+@code{cudaGLSetGLDevice} should be called instead of @code{cudaSetDevice}. Its
+size is specified by the @code{n_cuda_opengl_interoperability} field below.
+
+@item @code{unsigned n_cuda_opengl_interoperability} (default = 0)
+This has to be set to the size of the array pointed to by the
+@code{cuda_opengl_interoperability} field.
+
+@item @code{struct starpu_driver *not_launched_drivers}
+The drivers that should not be launched by StarPU.
+
+@item @code{unsigned n_not_launched_drivers}
+The number of StarPU drivers that should not be launched by StarPU.
+
 @end table
 @end deftp
 
@@ -1414,9 +1448,10 @@ codelets, where the @code{cl_arg} pointer is given as such.
 @item @code{void (*callback_func)(void *)} (optional) (default: @code{NULL})
 This is a function pointer of prototype @code{void (*f)(void *)} which
 specifies a possible callback. If this pointer is non-null, the callback
-function is executed @emph{on the host} after the execution of the task. The
-callback is passed the value contained in the @code{callback_arg} field. No
-callback is executed if the field is set to @code{NULL}.
+function is executed @emph{on the host} after the execution of the task. Tasks
+which depend on it might already be executing. The callback is passed the
+value contained in the @code{callback_arg} field. No callback is executed if the
+field is set to @code{NULL}.
 
 @item @code{void *callback_arg} (optional) (default: @code{NULL})
 This is the pointer passed to the callback function. This field is ignored if
@@ -1582,6 +1617,10 @@ function for instance.
 In case of success, this function returns 0, a return value of @code{-ENODEV}
 means that there is no worker able to process this task (e.g. there is no GPU
 available and this task is only implemented for CUDA devices).
+
+@code{starpu_task_submit} can be called from anywhere, including codelet
+functions and callbacks, provided that the @code{synchronous} field of the
+@code{starpu_task} structure is left to 0.
 @end deftypefun
 
 @deftypefun int starpu_task_wait_for_all (void)
@@ -1686,6 +1725,14 @@ This function is similar to @code{starpu_tag_wait} except that it blocks until
 terminated.
 @end deftypefun
 
+@deftypefun void starpu_tag_restart (starpu_tag_t @var{id})
+This function can be used to clear the "already notified" status
+of a tag which is not associated with a task. Before the restart, calling
+@code{starpu_tag_notify_from_apps} again does not notify the successors
+again. After the restart, the next call to @code{starpu_tag_notify_from_apps}
+will notify the successors.
+@end deftypefun
+
 @deftypefun void starpu_tag_remove (starpu_tag_t @var{id})
 This function releases the resources associated to tag @var{id}. It can be
 called once the corresponding task has been executed and when there is
@@ -1697,7 +1744,10 @@ This function explicitly unlocks tag @var{id}. It may be useful in the
 case of applications which execute part of their computation outside StarPU
 tasks (e.g. third-party libraries).  It is also provided as a
 convenient tool for the programmer, for instance to entirely construct the task
-DAG before actually giving StarPU the opportunity to execute the tasks.
+DAG before actually giving StarPU the opportunity to execute the tasks. When
+called several times on the same tag, notification will be done only on first
+call, thus implementing "OR" dependencies, until the tag is restarted using
+@code{starpu_tag_restart}.
 @end deftypefun
 
 @node Implicit Data Dependencies
@@ -2106,6 +2156,23 @@ Calls starpu_cuda_report_error, passing the current function, file and line
 position.
 @end defmac
 
+@deftypefun int starpu_cuda_copy_async_sync ({void *}@var{src_ptr}, unsigned @var{src_node}, {void *}@var{dst_ptr}, unsigned @var{dst_node}, size_t @var{ssize}, cudaStream_t @var{stream}, {enum cudaMemcpyKind} @var{kind})
+Copy @var{ssize} bytes from the pointer @var{src_ptr} on
+@var{src_node} to the pointer @var{dst_ptr} on @var{dst_node}.
+The function first tries to copy the data asynchronously (unless
+@var{stream} is @code{NULL}). If the asynchronous copy fails or if
+@var{stream} is @code{NULL}, it copies the data synchronously.
+The function returns @code{-EAGAIN} if the asynchronous copy was
+successful. It returns 0 if the synchronous copy was successful, and
+fails otherwise.
+@end deftypefun
+
+@deftypefun void starpu_cuda_set_device (int @var{devid})
+Calls @code{cudaSetDevice(devid)} or @code{cudaGLSetGLDevice(devid)}, according to
+whether @code{devid} is listed in the @code{cuda_opengl_interoperability} field of
+the @code{starpu_conf} structure.
+@end deftypefun
+
 @deftypefun void starpu_helper_cublas_init (void)
 This function initializes CUBLAS on every CUDA device.
 The CUBLAS library must be initialized prior to any CUBLAS call. Calling

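The tag behaviour documented above (starpu_tag_notify_from_apps notifying successors only on its first call, until starpu_tag_restart clears the state) can be illustrated with a small sketch. This is only an illustration: the tag values (0x42, 1, 2) and the empty CPU codelet are made up, it relies on the existing starpu_tag_declare_deps call, and it assumes at least one CPU worker is available.

#include <starpu.h>

#define READY_TAG ((starpu_tag_t) 0x42)

static void noop(void *buffers[], void *arg) { (void) buffers; (void) arg; }

static struct starpu_codelet cl =
{
	.where = STARPU_CPU,
	.cpu_funcs = { noop, NULL },
	.nbuffers = 0
};

/* Submit a task whose own tag depends on READY_TAG being notified. */
static void submit_step(starpu_tag_t step_tag)
{
	struct starpu_task *task = starpu_task_create();
	task->cl = &cl;
	task->use_tag = 1;
	task->tag_id = step_tag;
	starpu_tag_declare_deps(step_tag, 1, READY_TAG);
	starpu_task_submit(task);
}

int main(void)
{
	if (starpu_init(NULL) != 0)
		return 77;

	submit_step(1);
	starpu_tag_notify_from_apps(READY_TAG);	/* unlocks step 1 */
	starpu_tag_wait(1);

	/* A second notification would be ignored without this restart. */
	starpu_tag_restart(READY_TAG);

	submit_step(2);
	starpu_tag_notify_from_apps(READY_TAG);	/* unlocks step 2 */
	starpu_tag_wait(2);

	starpu_shutdown();
	return 0;
}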
+ 0 - 4
doc/chapters/configuration.texi

@@ -219,10 +219,6 @@ By default, it is disabled.
 * Misc::                        Miscellaneous and debug
 @end menu
 
-Note: the values given in @code{starpu_conf} structure passed when
-calling @code{starpu_init} will override the values of the environment
-variables.
-
 @node Workers
 @subsection Configuring workers
 

+ 11 - 8
doc/chapters/mpi-support.texi

@@ -140,17 +140,20 @@ communicator @var{comm}. On completion, @var{tag} is unlocked.
 @end deftypefun
 
 @deftypefun int starpu_mpi_isend_array_detached_unlock_tag (unsigned @var{array_size}, starpu_data_handle_t *@var{data_handle}, int *@var{dest}, int *@var{mpi_tag}, MPI_Comm *@var{comm}, starpu_tag_t @var{tag})
-Posts @var{array_size} standard-mode, non blocking send of the data of
-data @var{data_handle[x]} to the node @var{dest[x]} using the message
-tag @code{mpi_tag[x]} within the communicator @var{comm[x]}. On
-completion of the all the requests, @var{tag} is unlocked.
+Posts @var{array_size} standard-mode, non-blocking sends. The n-th post
+sends the n-th data of the array @var{data_handle} to the n-th node of
+the array @var{dest} using the n-th message tag of the array
+@code{mpi_tag} within the n-th communicator of the array @var{comm}.
+On completion of all the requests, @var{tag} is unlocked.
 @end deftypefun
 
 @deftypefun int starpu_mpi_irecv_array_detached_unlock_tag (unsigned @var{array_size}, starpu_data_handle_t *@var{data_handle}, int *@var{source}, int *@var{mpi_tag}, MPI_Comm *@var{comm}, starpu_tag_t @var{tag})
-Posts @var{array_size} nonblocking receive in @var{data_handle[x]} from the
-node @var{source[x]} using the message tag @code{mpi_tag[x]} within the
-communicator @var{comm[x]}. On completion of the all the requests,
-@var{tag} is unlocked.
+Posts @var{array_size} non-blocking receives. The n-th post receives into the
+n-th data of the array @var{data_handle} from the n-th
+node of the array @var{source} using the n-th message tag of the array
+@code{mpi_tag} within the n-th communicator of the array @var{comm}.
+On completion of all the requests, @var{tag} is unlocked.
 @end deftypefun
 
 @page

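A short sketch of the array send variant documented above, assuming two handles that are already registered; the destination ranks, MPI message tags and StarPU tag value are made-up illustrations.

#include <mpi.h>
#include <starpu.h>
#include <starpu_mpi.h>

/* Send two handles to two different nodes; the StarPU tag 'done' is
 * unlocked once both detached requests have completed. */
void send_both(starpu_data_handle_t h0, starpu_data_handle_t h1)
{
	starpu_data_handle_t handles[2] = { h0, h1 };
	int dests[2]      = { 1, 2 };		/* destination ranks (assumed) */
	int mpi_tags[2]   = { 10, 11 };		/* per-message MPI tags (assumed) */
	MPI_Comm comms[2] = { MPI_COMM_WORLD, MPI_COMM_WORLD };
	starpu_tag_t done = (starpu_tag_t) 0x100;

	starpu_mpi_isend_array_detached_unlock_tag(2, handles, dests, mpi_tags, comms, done);
	starpu_tag_wait(done);			/* both sends have completed here */
}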
+ 1 - 1
doc/chapters/perf-feedback.texi

@@ -64,7 +64,7 @@ function.
 It it worth noting that the application may directly access this structure from
 the callback executed at the end of the task. The @code{starpu_task} structure
 associated to the callback currently being executed is indeed accessible with
-the @code{starpu_get_current_task()} function.
+the @code{starpu_task_get_current()} function.
 
 @node Codelet feedback
 @subsection Per-codelet feedback

+ 15 - 0
examples/Makefile.am

@@ -829,6 +829,21 @@ pi_pi_redux_LDADD =				\
 	$(STARPU_CURAND_LDFLAGS)
 endif
 
+###########################
+# OpenGL interoperability #
+###########################
+
+if HAVE_OPENGL
+examplebin_PROGRAMS +=				\
+	gl_interop/gl_interop
+
+gl_interop_gl_interop_SOURCES =			\
+	gl_interop/gl_interop.c
+
+gl_interop_gl_interop_LDADD =			\
+	$(STARPU_OPENGL_RENDER_LDFLAGS)
+endif
+
 showcheck:
 	-cat $(TEST_LOGS) /dev/null
 	for i in $(SUBDIRS) ; do \

+ 2 - 2
examples/common/blas.c

@@ -30,7 +30,7 @@
 #ifdef STARPU_ATLAS
 
 inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
-			float alpha, float *A, int lda, float *B, int ldb, 
+			float alpha, const float *A, int lda, const float *B, int ldb, 
 			float beta, float *C, int ldc)
 {
 	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
@@ -241,7 +241,7 @@ void DSWAP(const int n, double *x, const int incx, double *y, const int incy)
 #elif defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL)
 
 inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
-			float alpha, float *A, int lda, float *B, int ldb, 
+			float alpha, const float *A, int lda, const float *B, int ldb, 
 			float beta, float *C, int ldc)
 {
 	sgemm_(transa, transb, &M, &N, &K, &alpha,

+ 2 - 2
examples/common/blas.h

@@ -24,8 +24,8 @@
 #include <cblas.h>
 #endif
 
-void SGEMM(char *transa, char *transb, int M, int N, int K, float alpha, float *A, int lda, 
-		float *B, int ldb, float beta, float *C, int ldc);
+void SGEMM(char *transa, char *transb, int M, int N, int K, float alpha, const float *A, int lda, 
+		const float *B, int ldb, float beta, float *C, int ldc);
 void DGEMM(char *transa, char *transb, int M, int N, int K, double alpha, double *A, int lda, 
 		double *B, int ldb, double beta, double *C, int ldc);
 void SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,

+ 1 - 0
examples/filters/fvector.c

@@ -101,6 +101,7 @@ int main(int argc, char **argv)
 	return 0;
 
 enodev:
+	FPRINTF(stderr, "WARNING: No one can execute this task\n");
 	starpu_shutdown();
 	return 77;
 }

+ 130 - 0
examples/gl_interop/gl_interop.c

@@ -0,0 +1,130 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example demonstrates how to use StarPU combined with OpenGL rendering,
+ * which needs:
+ *
+ * - initializing GLUT first,
+ * - enabling it at initialization,
+ * - running the corresponding CUDA worker in the GLUT thread (here, the main
+ *   thread).
+ */
+
+#include <starpu.h>
+#include <unistd.h>
+#include <GL/glut.h>
+
+void dummy(void *buffers[], void *cl_arg)
+{
+	float *v = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
+
+	printf("Codelet running\n");
+	cudaMemset(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float));
+	printf("Codelet done\n");
+}
+
+struct starpu_codelet cl = {
+	.where = STARPU_CUDA,
+	.cuda_funcs = { dummy, NULL },
+	.nbuffers = 1,
+	.modes = { STARPU_W },
+};
+
+void foo(void) {
+}
+
+void display(float i) {
+	glClear(GL_COLOR_BUFFER_BIT);
+	glColor3f(1, 1, 1);
+	glBegin(GL_LINES);
+	glVertex2f(-i, -i);
+	glVertex2f(i, i);
+	glEnd();
+	glFinish();
+	glutPostRedisplay();
+	glutMainLoopEvent();
+}
+
+void callback_func(void *foo) {
+	printf("Callback running, rendering\n");
+	float i = 1.;
+	while (i > 0) {
+		usleep(100000);
+		display(i);
+		i -= 0.1;
+	}
+	printf("rendering done\n");
+
+	/* Tell it was already the last submitted task */
+	starpu_set_end_of_submissions();
+}
+
+int main(int argc, char **argv)
+{
+#if !(defined(STARPU_USE_CUDA) && defined(STARPU_OPENGL_RENDER))
+	return 77;
+#else
+	struct starpu_conf conf;
+	int cuda_device = 0;
+	int cuda_devices[] = { cuda_device };
+	struct starpu_driver drivers[] = {
+		{ .type = STARPU_CUDA_WORKER, .id.cuda_id = cuda_device }
+	};
+	int ret;
+	struct starpu_task *task;
+	starpu_data_handle_t handle;
+
+	glutInit(&argc, argv);
+	glutInitDisplayMode (GLUT_SINGLE | GLUT_RGB);
+	glutInitWindowPosition(0, 0);
+	glutInitWindowSize(300,200);
+	glutCreateWindow("StarPU OpenGL interoperability test");
+	glClearColor (0.5, 0.5, 0.5, 0.0);
+
+	/* Enable OpenGL interoperability */
+	starpu_conf_init(&conf);
+	conf.ncuda = 1;
+	conf.ncpus = 0;
+	conf.nopencl = 0;
+	conf.cuda_opengl_interoperability = cuda_devices;
+	conf.n_cuda_opengl_interoperability = sizeof(cuda_devices) / sizeof(*cuda_devices);
+	conf.not_launched_drivers = drivers;
+	conf.n_not_launched_drivers = sizeof(drivers) / sizeof(*drivers);
+	ret = starpu_init(&conf);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_vector_data_register(&handle, -1, 0, 10, sizeof(float));
+
+	/* Submit just one dumb task */
+	task = starpu_task_create();
+	task->cl = &cl;
+	task->handles[0] = handle;
+	task->callback_func = callback_func;
+	task->callback_arg = NULL;
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* And run the driver, which will run the task */
+	printf("running the driver\n");
+	starpu_run_driver(&drivers[0]);
+	printf("finished running the driver\n");
+
+	starpu_shutdown();
+
+	return 0;
+#endif
+}

+ 57 - 29
examples/interface/complex_interface.c

@@ -165,44 +165,52 @@ static uint32_t complex_footprint(starpu_data_handle_t handle)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, enum cudaMemcpyKind kind)
+static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, enum cudaMemcpyKind kind, cudaStream_t stream)
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
 
-	cudaError_t cures;
+	cudaStream_t sstream = stream;
+	int ret;
 
-	cures = cudaMemcpy((void *)dst_complex->real, (void *)src_complex->real, src_complex->nx*sizeof(src_complex->real[0]), kind);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	ret = starpu_cuda_copy_async_sync((void *)src_complex->real, src_node, (void *)dst_complex->real, dst_node,
+					  src_complex->nx*sizeof(src_complex->real[0]), sstream, kind);
+	if (ret == 0) sstream = NULL;
 
-	cures = cudaMemcpy((char *)dst_complex->imaginary, (char *)src_complex->imaginary, src_complex->nx*sizeof(src_complex->imaginary[0]), kind);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	return 0;
+	ret = starpu_cuda_copy_async_sync((char *)src_complex->imaginary, src_node, (char *)dst_complex->imaginary, dst_node,
+					  src_complex->nx*sizeof(src_complex->imaginary[0]), sstream, kind);
+	return ret;
 }
 
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
+     return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, NULL);
+}
+
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
 }
 
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
+	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, NULL);
 }
-#endif
 
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
+}
+#endif
 
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node,
-                              void *dst_interface, unsigned dst_node)
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event)
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
-
+	cl_event *event = (cl_event *)_event;
 	cl_int err;
+	int ret;
 
 	err = starpu_opencl_copy_ram_to_opencl(src_complex->real,
 					       src_node,
@@ -210,10 +218,12 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node,
 					       dst_node,
 					       src_complex->nx * sizeof(src_complex->real[0]),
 					       0,
-					       NULL,
-					       NULL);
+					       event,
+					       &ret);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
+	if (ret == 0)
+		event = NULL;
 
 	err = starpu_opencl_copy_ram_to_opencl(src_complex->imaginary,
 					       src_node,
@@ -221,31 +231,39 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node,
 					       dst_node,
 					       src_complex->nx * sizeof(src_complex->imaginary[0]),
 					       0,
-					       NULL,
-					       NULL);
+					       event,
+					       &ret);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	return 0;
+	return ret;
 }
 
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node,
-			      void *dst_interface, unsigned dst_node)
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
+}
+
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event)
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
-
+	cl_event *event = (cl_event *)_event;
 	cl_int err;
+	int ret;
+
 	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->real,
 					       src_node,
 					       dst_complex->real,
 					       dst_node,
 					       src_complex->nx * sizeof(src_complex->real[0]),
 					       0,
-					       NULL,
-					       NULL);
+					       event,
+					       &ret);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
+	if (ret == 0)
+		event = NULL;
 
 	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->imaginary,
 					       src_node,
@@ -253,23 +271,33 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node,
 					       dst_node,
 					       src_complex->nx * sizeof(src_complex->imaginary[0]),
 					       0,
-					       NULL,
-					       NULL);
+					       event,
+					       &ret);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	return 0;
+	return ret;
+}
+
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
 }
 #endif
+
 static struct starpu_data_copy_methods complex_copy_methods =
 {
 #ifdef STARPU_USE_CUDA
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
+	.ram_to_cuda_async = copy_ram_to_cuda_async,
+	.cuda_to_ram_async = copy_cuda_to_ram_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
 	.opencl_to_ram = copy_opencl_to_ram,
+	.ram_to_opencl_async = copy_ram_to_opencl_async,
+	.opencl_to_ram_async = copy_opencl_to_ram_async,
 #endif
 };
 

+ 7 - 7
examples/matvecmult/matvecmult.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -126,12 +126,12 @@ int main(int argc, char **argv)
 {
 	struct starpu_codelet cl = {};
 
-	struct starpu_conf conf =
-	{
-		.ncpus = 0,
-		.ncuda = 0,
-                .nopencl = 1,
-	};
+	struct starpu_conf conf;
+	
+	starpu_conf_init(&conf);
+	conf.ncpus = 0;
+	conf.ncuda = 0;
+	conf.nopencl = 1;
 
         /* int width=1100; */
         /* int height=244021; */

+ 4 - 15
examples/scheduler/dummy_sched.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010-2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -127,20 +127,6 @@ static struct starpu_sched_policy dummy_sched_policy =
 	.policy_description = "dummy scheduling strategy"
 };
 
-static struct starpu_conf conf =
-{
-	.sched_policy_name = NULL,
-	.sched_policy = &dummy_sched_policy,
-	.ncpus = -1,
-	.ncuda = -1,
-        .nopencl = -1,
-	.nspus = -1,
-	.use_explicit_workers_bindid = 0,
-	.use_explicit_workers_cuda_gpuid = 0,
-	.use_explicit_workers_opencl_gpuid = 0,
-	.calibrate = 0
-};
-
 static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
 {
 }
@@ -160,7 +146,10 @@ int main(int argc, char **argv)
 {
 	int ntasks = NTASKS;
 	int ret;
+	struct starpu_conf conf;
 
+	starpu_conf_init(&conf);
+	conf.sched_policy = &dummy_sched_policy;
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV)
 		return 77;

+ 2 - 1
gcc-plugin/src/Makefile.am

@@ -25,7 +25,8 @@ AM_YFLAGS = -y
 
 AM_CPPFLAGS =						\
   -I$(top_srcdir)/include				\
-  -I$(GCC_PLUGIN_INCLUDE_DIR) -Wall -DYYERROR_VERBOSE=1
+  -I$(GCC_PLUGIN_INCLUDE_DIR) -Wall -DYYERROR_VERBOSE=1 \
+  $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
 
 AM_LDFLAGS = -module --tag="$(GCC_FOR_PLUGIN_LIBTOOL_TAG)"
 

+ 11 - 7
gcc-plugin/src/c-expr.y

@@ -109,7 +109,8 @@
   TK (CPP_MULT)					\
   TK (CPP_DIV)					\
   TK (CPP_DOT)					\
-  TK (CPP_DEREF)
+  TK (CPP_DEREF)				\
+  TK (CPP_STRING)
 
 #ifndef __cplusplus
 
@@ -165,6 +166,7 @@
 %token YCPP_DIV "/"
 %token YCPP_DOT "."
 %token YCPP_DEREF "->"
+%token YCPP_STRING "string"
 
 %% /* Grammar rules.  */
 
@@ -184,15 +186,15 @@ sequence: expression {
       }
 ;
 
-expression: identifier | binary_expression | unary_expression;
+expression: binary_expression
+;
 
 /* XXX: `ensure_bound' below leads to errors raised even for non-significant
    arguments---e.g., junk after pragma.  */
 identifier: YCPP_NAME  { $$ = ensure_bound (loc, $1); }
 ;
 
-binary_expression: multiplicative_expression
-     | additive_expression
+binary_expression: additive_expression
 ;
 
 multiplicative_expression: multiplicative_expression YCPP_MULT cast_expression {
@@ -217,9 +219,7 @@ cast_expression: unary_expression
 		 /* XXX: No support for '(' TYPE-NAME ')' UNARY-EXPRESSION.  */
 ;
 
-unary_expression:
-       primary_expression
-     | postfix_expression
+unary_expression: postfix_expression
      | YCPP_AND cast_expression {
        $$ = build_addr (ensure_bound (loc, $2), current_function_decl);
      }
@@ -250,10 +250,14 @@ postfix_expression:
 
 primary_expression: identifier
      | constant
+     | string_literal
      | YCPP_OPEN_PAREN expression YCPP_CLOSE_PAREN { $$ = $2; }
 ;
 
 constant: YCPP_NUMBER { $$ = $1; }
 ;
 
+string_literal: YCPP_STRING { $$ = $1; }
+;
+
 %%

+ 17 - 4
gcc-plugin/src/starpu.c

@@ -18,6 +18,9 @@
 #define _GNU_SOURCE 1
 
 #include <starpu-gcc-config.h>
+/* We must include starpu.h here, otherwise gcc will complain about a poisoned
+   malloc in xmmintrin.h. */
+#include <starpu.h>  /* for `STARPU_CPU' & co.  */
 
 /* #define ENABLE_TREE_CHECKING 1 */
 
@@ -57,7 +60,6 @@
    disclaimer.  */
 #define STARPU_DONT_INCLUDE_CUDA_HEADERS
 
-#include <starpu.h>  /* for `STARPU_CPU' & co.  */
 
 
 /* GCC 4.7 requires compilation with `g++', and C++ lacks a number of GNU C
@@ -94,6 +96,11 @@
 extern int yyparse (location_t, const char *, tree *);
 extern int yydebug;
 
+/* This declaration is from `c-tree.h', but that header doesn't get
+   installed.  */
+
+extern tree xref_tag (enum tree_code, tree);
+
 
 #ifdef __cplusplus
 extern "C" {
@@ -124,7 +131,7 @@ static const char heap_allocated_orig_type_attribute_name[] =
   ".heap_allocated_original_type";
 
 /* Names of data structures defined in <starpu.h>.  */
-static const char codelet_struct_name[] = "starpu_codelet_gcc";
+static const char codelet_struct_tag[] = "starpu_codelet";
 
 /* Cached function declarations.  */
 static tree unpack_fn, data_lookup_fn;
@@ -1940,8 +1947,14 @@ codelet_type (void)
       /* Lookup the `struct starpu_codelet' struct type.  This should succeed since
 	 we push <starpu.h> early on.  */
 
-      type_decl = lookup_name (get_identifier (codelet_struct_name));
-      gcc_assert (type_decl != NULL_TREE && TREE_CODE (type_decl) == TYPE_DECL);
+      type_decl = xref_tag (RECORD_TYPE, get_identifier (codelet_struct_tag));
+      gcc_assert (type_decl != NULL_TREE
+		  && TREE_CODE (type_decl) == RECORD_TYPE);
+
+      /* `build_decl' expects a TYPE_DECL, so give it what it wants.  */
+
+      type_decl = TYPE_STUB_DECL (type_decl);
+      gcc_assert (type_decl != NULL && TREE_CODE (type_decl) == TYPE_DECL);
     }
 
   return TREE_TYPE (type_decl);

+ 6 - 1
gcc-plugin/tests/register.c

@@ -172,7 +172,12 @@ main (int argc, char *argv[])
   expected_register_arguments.element_size = sizeof m3d[0];
 #pragma starpu register m3d
 
-  assert (data_register_calls == 17);
+  expected_register_arguments.pointer = "hello";
+  expected_register_arguments.elements = sizeof "hello";
+  expected_register_arguments.element_size = 1;
+#pragma starpu register "hello"
+
+  assert (data_register_calls == 18);
 
   free (y);
 

+ 48 - 10
include/starpu.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -51,8 +51,45 @@ extern "C"
 {
 #endif
 
+#if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
+#include <starpu_opencl.h>
+#endif
+
+enum starpu_archtype
+{
+	STARPU_CPU_WORKER,    /* CPU core */
+	STARPU_CUDA_WORKER,   /* NVIDIA CUDA device */
+	STARPU_OPENCL_WORKER, /* OpenCL device */
+	STARPU_GORDON_WORKER  /* Cell SPU */
+};
+
+struct starpu_driver
+{
+	enum starpu_archtype type;
+	union
+	{
+		unsigned cuda_id;
+#if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
+		cl_device_id opencl_id;
+#endif
+		/*
+		 * TODO: handle CPUs:
+		 * 1) Add a member to this union.
+		 * 2) Edit _starpu_launch_drivers() to make sure the driver is
+		 *    not always launched.
+		 * 3) Edit starpu_run_driver() so that it can handle another
+		 *    kind of architecture.
+		 * 4) Write _starpu_run_foobar() in the corresponding driver.
+		 * 5) Test the whole thing :)
+		 */
+	} id;
+};
+
 struct starpu_conf
 {
+	/* Will be initialized by starpu_conf_init */
+	int magic;
+
 	/* which scheduling policy should be used ? (NULL for default) */
 	const char *sched_policy_name;
 	struct starpu_sched_policy *sched_policy;
@@ -83,6 +120,14 @@ struct starpu_conf
 
         /* indicate if the asynchronous copies should be disabled */
 	int disable_asynchronous_copy;
+
+	/* Enable CUDA/OpenGL interoperation on these CUDA devices */
+	int *cuda_opengl_interoperability;
+	unsigned n_cuda_opengl_interoperability;
+
+	/* A driver that the application will run in one of its own threads. */
+	struct starpu_driver *not_launched_drivers;
+	unsigned n_not_launched_drivers;
 };
 
 /* Initialize a starpu_conf structure with default values. */
@@ -119,15 +164,6 @@ int starpu_combined_worker_get_id(void);
 int starpu_combined_worker_get_size(void);
 int starpu_combined_worker_get_rank(void);
 
-enum starpu_archtype
-{
-	STARPU_CPU_WORKER, /* CPU core */
-	STARPU_CUDA_WORKER, /* NVIDIA CUDA device */
-	STARPU_OPENCL_WORKER, /* OpenCL CUDA device */
-	STARPU_GORDON_WORKER, /* Cell SPU */
-	STARPU_ALL
-};
-
 /* This function returns the type of worker associated to an identifier (as
  * returned by the starpu_worker_get_id function). The returned value indicates
  * the architecture of the worker: STARPU_CPU_WORKER for a CPU core,
@@ -166,6 +202,8 @@ void starpu_worker_get_name(int id, char *dst, size_t maxlen);
 int starpu_worker_get_devid(int id);
 void starpu_profiling_init();
 	void starpu_display_stats();
+int starpu_run_driver(struct starpu_driver *);
+void starpu_set_end_of_submissions(void);
 #ifdef __cplusplus
 }
 #endif

+ 6 - 2
include/starpu_cuda.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -44,6 +44,10 @@ cudaStream_t starpu_cuda_get_local_stream(void);
 
 const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid);
 
+int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind);
+
+void starpu_cuda_set_device(int devid);
+
 #ifdef __cplusplus
 }
 #endif

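A sketch of how the new helper declared above is typically used from the CUDA copy method of a data interface, mirroring the complex_interface changes earlier in this commit; struct my_interface and its fields are hypothetical.

#ifdef STARPU_USE_CUDA
#include <starpu.h>
#include <starpu_cuda.h>

struct my_interface		/* hypothetical interface with one contiguous buffer */
{
	void *ptr;
	size_t nx;
	size_t elemsize;
};

static int my_copy_ram_to_cuda_async(void *src_interface, unsigned src_node,
				      void *dst_interface, unsigned dst_node,
				      cudaStream_t stream)
{
	struct my_interface *src = src_interface;
	struct my_interface *dst = dst_interface;

	/* Returns -EAGAIN if the copy was queued asynchronously on the stream,
	 * or 0 if it fell back to a synchronous cudaMemcpy. */
	return starpu_cuda_copy_async_sync(src->ptr, src_node, dst->ptr, dst_node,
					   src->nx * src->elemsize, stream,
					   cudaMemcpyHostToDevice);
}
#endif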
+ 4 - 5
include/starpu_task.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
@@ -116,10 +116,6 @@ struct starpu_codelet
 	const char *name;
 };
 
-#ifdef STARPU_GCC_PLUGIN
-typedef struct starpu_codelet starpu_codelet_gcc;
-#endif /* STARPU_GCC_PLUGIN */
-
 struct starpu_task
 {
 	struct starpu_codelet *cl;
@@ -278,6 +274,9 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id);
 /* The application can feed a tag explicitely */
 void starpu_tag_notify_from_apps(starpu_tag_t id);
 
+/* To reuse a tag not associated with a task */
+void starpu_tag_restart(starpu_tag_t id);
+
 /* To release resources, tags should be freed after use */
 void starpu_tag_remove(starpu_tag_t id);
 

+ 10 - 0
src/core/dependencies/tags.c

@@ -213,6 +213,16 @@ void _starpu_notify_tag_dependencies(struct _starpu_tag *tag)
 	_starpu_spin_unlock(&tag->lock);
 }
 
+void starpu_tag_restart(starpu_tag_t id)
+{
+	struct _starpu_tag *tag = gettag_struct(id);
+
+	_starpu_spin_lock(&tag->lock);
+	STARPU_ASSERT_MSG(tag->state == STARPU_DONE, "Only completed tags can be restarted");
+	tag->state = STARPU_BLOCKED;
+	_starpu_spin_unlock(&tag->lock);
+}
+
 void starpu_tag_notify_from_apps(starpu_tag_t id)
 {
 	struct _starpu_tag *tag = gettag_struct(id);

+ 7 - 7
src/core/perfmodel/perfmodel_bus.c

@@ -92,8 +92,8 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	_starpu_bind_thread_on_cpu(config, cpu);
 	size_t size = SIZE;
 
-	/* Initiliaze CUDA context on the device */
-	cudaSetDevice(dev);
+	/* Initialize CUDA context on the device */
+	starpu_cuda_set_device(dev);
 
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
@@ -185,8 +185,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
         if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
 
-	/* Initiliaze CUDA context on the source */
-	cudaSetDevice(src);
+	/* Initialize CUDA context on the source */
+	starpu_cuda_set_device(src);
 
 	/* Allocate a buffer on the device */
 	unsigned char *s_buffer;
@@ -194,8 +194,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	STARPU_ASSERT(s_buffer);
 	cudaMemset(s_buffer, 0, size);
 
-	/* Initiliaze CUDA context on the destination */
-	cudaSetDevice(dst);
+	/* Initialize CUDA context on the destination */
+	starpu_cuda_set_device(dst);
 
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
@@ -222,7 +222,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
 	/* Free buffers */
 	cudaFree(d_buffer);
-	cudaSetDevice(src);
+	starpu_cuda_set_device(src);
 	cudaFree(s_buffer);
 
 	cudaThreadExit();

+ 27 - 1
src/core/task.c

@@ -633,10 +633,15 @@ int starpu_task_wait_for_no_ready(void)
 
 void _starpu_decrement_nsubmitted_tasks(void)
 {
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
 
-	if (--nsubmitted == 0)
+	if (--nsubmitted == 0) {
+		if (!config->submitting)
+			config->running = 0;
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
+	}
 
 	_STARPU_TRACE_UPDATE_TASK_CNT(nsubmitted);
 
@@ -644,6 +649,27 @@ void _starpu_decrement_nsubmitted_tasks(void)
 
 }
 
+void
+starpu_set_end_of_submissions(void)
+{
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+
+	config->submitting = 0;
+	if (nsubmitted == 0) {
+		config->running = 0;
+		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
+	}
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+}
+
+void _starpu_check_nsubmitted_tasks(void)
+{
+
+}
+
 static void _starpu_increment_nsubmitted_tasks(void)
 {
 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);

+ 112 - 3
src/core/workers.c

@@ -209,15 +209,55 @@ static void _starpu_init_worker_queue(struct _starpu_worker *workerarg)
 	_starpu_memory_node_register_condition(cond, mutex, memory_node);
 }
 
+/*
+ * Returns 0 if the given driver is one of the drivers that must be launched by
+ * the application itself, and not by StarPU, 1 otherwise.
+ */
+static unsigned _starpu_may_launch_driver(struct starpu_conf *conf,
+					  struct starpu_driver *d)
+{
+	if (conf->n_not_launched_drivers == 0 ||
+	    conf->not_launched_drivers == NULL)
+		return 1;
+
+	/* Is <d> in conf->not_launched_drivers ? */
+	unsigned i;
+	for (i = 0; i < conf->n_not_launched_drivers; i++)
+	{
+		if (d->type != conf->not_launched_drivers[i].type)
+			continue;
+
+		switch (d->type)
+		{
+		case STARPU_CUDA_WORKER:
+			if (d->id.cuda_id == conf->not_launched_drivers[i].id.cuda_id)
+				return 0;
+			break;
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_WORKER:
+			if (d->id.opencl_id == conf->not_launched_drivers[i].id.opencl_id)
+				return 0;
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+		}
+	}
+
+	return 1;
+}
+
 static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 {
 	config->running = 1;
+	config->submitting = 1;
 
 	pthread_key_create(&worker_key, NULL);
 
 	unsigned nworkers = config->topology.nworkers;
 
 	/* Launch workers asynchronously (except for SPUs) */
+	unsigned cuda = 0;
 	unsigned worker;
 	for (worker = 0; worker < nworkers; worker++)
 	{
@@ -256,6 +296,8 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
 		_starpu_init_worker_queue(workerarg);
 
+		struct starpu_driver driver;
+		driver.type = workerarg->arch;
 		switch (workerarg->arch)
 		{
 #ifdef STARPU_USE_CPU
@@ -270,13 +312,20 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 			case STARPU_CUDA_WORKER:
 				workerarg->set = NULL;
 				workerarg->worker_is_initialized = 0;
-				pthread_create(&workerarg->worker_thread,
-						NULL, _starpu_cuda_worker, workerarg);
-
+				driver.id.cuda_id = cuda;
+				if (_starpu_may_launch_driver(config->conf, &driver))
+				{
+					pthread_create(&workerarg->worker_thread,
+						       NULL, _starpu_cuda_worker, workerarg);
+				}
+				cuda++;
 				break;
 #endif
 #ifdef STARPU_USE_OPENCL
 			case STARPU_OPENCL_WORKER:
+				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
+				if (!_starpu_may_launch_driver(config->conf, &driver))
+					break;
 				workerarg->set = NULL;
 				workerarg->worker_is_initialized = 0;
 				pthread_create(&workerarg->worker_thread,
@@ -318,20 +367,45 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 		}
 	}
 
+	cuda = 0;
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		struct _starpu_worker *workerarg = &config->workers[worker];
+		struct starpu_driver driver;
+		driver.type = workerarg->arch;
 
 		switch (workerarg->arch)
 		{
 			case STARPU_CPU_WORKER:
+				_STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+				while (!workerarg->worker_is_initialized)
+					_STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
+				_STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+				break;
 			case STARPU_CUDA_WORKER:
+				driver.id.cuda_id = cuda;
+				if (!_starpu_may_launch_driver(config->conf, &driver))
+				{
+					cuda++;
+					break;
+				}
+				_STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+				while (!workerarg->worker_is_initialized)
+					_STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
+				_STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+				cuda++;
+				break;
+#ifdef STARPU_USE_OPENCL
 			case STARPU_OPENCL_WORKER:
+				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
+				if (!_starpu_may_launch_driver(config->conf, &driver))
+					break;
 				_STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
 				while (!workerarg->worker_is_initialized)
 					_STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
 				_STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
 				break;
+#endif
 #ifdef STARPU_USE_GORDON
 			case STARPU_GORDON_WORKER:
 				/* the initialization of Gordon worker is
@@ -361,6 +435,8 @@ int starpu_conf_init(struct starpu_conf *conf)
 	if (!conf)
 		return -EINVAL;
 
+	memset(conf, 0, sizeof(*conf));
+	conf->magic = 42;
 	conf->sched_policy_name = getenv("STARPU_SCHED");
 	conf->sched_policy = NULL;
 
@@ -497,6 +573,10 @@ int starpu_init(struct starpu_conf *user_conf)
 	}
 	else
 	{
+	     if (user_conf->magic != 42) {
+		fprintf(stderr, "starpu_conf structure needs to be initialized with starpu_conf_init\n");
+		return -EINVAL;
+	     }
 	     config.conf = user_conf;
 	     config.default_conf = 0;
 	}
@@ -1001,3 +1081,32 @@ struct _starpu_sched_ctx* _starpu_get_initial_sched_ctx(void)
 	return &config.sched_ctxs[0];
 }
 
+#ifdef STARPU_USE_CUDA
+extern int _starpu_run_cuda(struct starpu_driver *);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern int _starpu_run_opencl(struct starpu_driver *);
+#endif
+
+int
+starpu_run_driver(struct starpu_driver *d)
+{
+	if (!d)
+		return -EINVAL;
+
+	switch (d->type)
+	{
+#ifdef STARPU_USE_CUDA
+	case STARPU_CUDA_WORKER:
+		return _starpu_run_cuda(d);
+#endif
+#ifdef STARPU_USE_OPENCL
+	case STARPU_OPENCL_WORKER:
+		return _starpu_run_opencl(d);
+#endif
+	case STARPU_CPU_WORKER:    /* Not supported yet */
+	case STARPU_GORDON_WORKER: /* Not supported yet */
+	default:
+		return -EINVAL;
+	}
+}

+ 4 - 1
src/core/workers.h

@@ -69,7 +69,7 @@ struct _starpu_worker
 	int current_rank; /* current rank in case the worker is used in a parallel fashion */
 	int worker_size; /* size of the worker in case we use a combined worker */
         pthread_cond_t ready_cond; /* indicate when the worker is ready */
-	unsigned memory_node; /* which memory node is associated that worker to ? */
+	unsigned memory_node; /* which memory node is the worker associated with ? */
 	pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
 	pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitely submitted to that queue */
@@ -173,6 +173,9 @@ struct _starpu_machine_config
 	/* this flag is set until the runtime is stopped */
 	unsigned running;
 
+	/* this flag is set until the application is finished submitting tasks */
+	unsigned submitting;
+
 	/* all the sched ctx of the current instance of starpu */
 	struct _starpu_sched_ctx sched_ctxs[STARPU_NMAX_SCHED_CTXS];
 };

+ 1 - 2
src/datawizard/copy_driver.c

@@ -117,8 +117,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
 	{
 		int node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
-		cures = cudaSetDevice(_starpu_memory_node_to_devid(node));
-		STARPU_ASSERT(cures == cudaSuccess);
+		starpu_cuda_set_device(_starpu_memory_node_to_devid(node));
 	}
 #endif
 

+ 2 - 22
src/datawizard/interfaces/block_interface.c

@@ -423,10 +423,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 		/* Is that a single contiguous buffer ? */
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
-                        cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
-                                           nx*ny*nz*elemsize, kind);
-                        if (STARPU_UNLIKELY(cures))
-                                STARPU_CUDA_REPORT_ERROR(cures);
+			starpu_cuda_copy_async_sync((void *)src_block->ptr, src_node, (void *)dst_block->ptr, dst_node, nx*ny*nz*elemsize, NULL, kind);
                 }
 		else
 		{
@@ -482,24 +479,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 		/* Is that a single contiguous buffer ? */
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
-			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
-					nx*ny*nz*elemsize, kind, stream);
-			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-			if (STARPU_UNLIKELY(cures))
-			{
-				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
-					nx*ny*nz*elemsize, kind);
-				if (STARPU_UNLIKELY(cures))
-					STARPU_CUDA_REPORT_ERROR(cures);
-
-				ret = 0;
-			}
-			else
-			{
-				ret = -EAGAIN;
-			}
-
+			ret = starpu_cuda_copy_async_sync((void *)src_block->ptr, src_node, (void *)dst_block->ptr, dst_node, nx*ny*nz*elemsize, stream, kind);
 		}
 		else
 		{

+ 15 - 83
src/datawizard/interfaces/csr_interface.c

@@ -416,7 +416,7 @@ static void free_csr_buffer_on_node(void *data_interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
+static int copy_cuda_async_sync(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, cudaStream_t stream)
 {
 	struct starpu_csr_interface *src_csr = src_interface;
 	struct starpu_csr_interface *dst_csr = dst_interface;
@@ -425,87 +425,19 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	uint32_t nrow = src_csr->nrow;
 	size_t elemsize = src_csr->elemsize;
 
-	cudaError_t cures;
+	cudaStream_t sstream = stream;
+	int ret;
 
-	cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	ret = starpu_cuda_copy_async_sync((void *)src_csr->nzval, src_node, (void *)dst_csr->nzval, dst_node, nnz*elemsize, sstream, kind);
+	if (ret == 0) sstream = NULL;
 
-	cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	ret = starpu_cuda_copy_async_sync((void *)src_csr->colind, src_node, (void *)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), sstream, kind);
+	if (ret == 0) sstream = NULL;
 
-	cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	ret = starpu_cuda_copy_async_sync((void *)src_csr->rowptr, src_node, (void *)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), sstream, kind);
 
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-
-	return 0;
-}
-
-static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, cudaStream_t stream)
-{
-	struct starpu_csr_interface *src_csr = src_interface;
-	struct starpu_csr_interface *dst_csr = dst_interface;
-
-	uint32_t nnz = src_csr->nnz;
-	uint32_t nrow = src_csr->nrow;
-	size_t elemsize = src_csr->elemsize;
-
-	cudaError_t cures;
-
-	int synchronous_fallback = 0;
-
-	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-	cures = cudaMemcpyAsync((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind, stream);
-	if (cures)
-	{
-		synchronous_fallback = 1;
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind);
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-
-	if (!synchronous_fallback)
-	{
-		cures = cudaMemcpyAsync((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind, stream);
-	}
-
-	if (synchronous_fallback || cures != cudaSuccess)
-	{
-		synchronous_fallback = 1;
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind);
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-
-	if (!synchronous_fallback)
-	{
-		cures = cudaMemcpyAsync((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind, stream);
-	}
-
-	if (synchronous_fallback || cures != cudaSuccess)
-	{
-		synchronous_fallback = 1;
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-
-	if (synchronous_fallback)
-	{
-		_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-		return 0;
-	}
-	else
-	{
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		return -EAGAIN;
-	}
+	return ret;
 }
 
 static int copy_cuda_peer(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
@@ -620,36 +552,36 @@ static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, uns
 
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
+	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, NULL);
 }
 
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
+	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, NULL);
 }
 
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
 	if (src_node == dst_node)
-		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+		return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, NULL);
 	else
 		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node);
 }
 
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
 {
-	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
+	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
 }
 
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
 {
-	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
+	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
 }
 
 static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
 {
 	if (src_node == dst_node)
-		return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, stream);
+		return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, stream);
 	else
 		return copy_cuda_peer_async(src_interface, src_node, dst_interface, dst_node, stream);
 }
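
This CSR change (and the matching variable and vector interface changes below) funnels all CUDA transfers through the new starpu_cuda_copy_async_sync() helper added in src/drivers/cuda/driver_cuda.c further down: it returns -EAGAIN when the copy was queued asynchronously on the given stream, and 0 when it was performed synchronously, either because no stream was passed or because the asynchronous copy failed and fell back to cudaMemcpy(). A minimal sketch of how a custom data interface could use it; struct my_interface and its fields are illustrative names, not part of this change:

	#include <starpu.h>
	#include <starpu_cuda.h>

	/* Hypothetical interface holding a single contiguous buffer. */
	struct my_interface
	{
		uintptr_t ptr;
		size_t size;
	};

	static int my_copy_ram_to_cuda_async(void *src_iface, unsigned src_node,
					      void *dst_iface, unsigned dst_node,
					      cudaStream_t stream)
	{
		struct my_interface *src = src_iface;
		struct my_interface *dst = dst_iface;

		/* -EAGAIN: copy queued on 'stream'; 0: done synchronously. */
		return starpu_cuda_copy_async_sync((void *)src->ptr, src_node,
						   (void *)dst->ptr, dst_node,
						   src->size, stream,
						   cudaMemcpyHostToDevice);
	}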

+ 2 - 4
src/datawizard/interfaces/matrix_interface.c

@@ -456,16 +456,14 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 	/* That code is not even working!! */
 	struct cudaExtent extent = make_cudaExtent(128, 128, 128);
 
-	cures = cudaSetDevice(src_dev);
-	STARPU_ASSERT(cures == cudaSuccess);
+	starpu_cuda_set_device(src_dev);
 
 	struct cudaPitchedPtr mem_device1;
 	cures = cudaMalloc3D(&mem_device1, extent);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaSetDevice(dst_dev);
-	STARPU_ASSERT(cures == cudaSuccess);
+	starpu_cuda_set_device(dst_dev);
 
 	struct cudaPitchedPtr mem_device2;
 	cures = cudaMalloc3D(&mem_device2, extent);

+ 16 - 57
src/datawizard/interfaces/variable_interface.c

@@ -286,42 +286,32 @@ static void free_variable_buffer_on_node(void *data_interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
+static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream, enum cudaMemcpyKind kind)
 {
 	struct starpu_variable_interface *src_variable = src_interface;
 	struct starpu_variable_interface *dst_variable = dst_interface;
+	int ret;
 
-	cudaError_t cures;
-	cures = cudaMemcpy((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind);
-
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
+	ret = starpu_cuda_copy_async_sync((void *)src_variable->ptr, src_node, (void *)dst_variable->ptr, dst_node, src_variable->elemsize, stream, kind);
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
-
-	return 0;
+	return ret;
 }
 
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
+	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyDeviceToHost);
 }
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
+	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyHostToDevice);
 }
 
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
 	if (src_node == dst_node)
 	{
-		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+		return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyDeviceToDevice);
 	}
 	else
 	{
@@ -347,51 +337,21 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRI
 	}
 }
 
-static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
-					cudaStream_t stream, enum cudaMemcpyKind kind)
-{
-	struct starpu_variable_interface *src_variable = src_interface;
-	struct starpu_variable_interface *dst_variable = dst_interface;
-
-	cudaError_t cures;
-	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-	cures = cudaMemcpyAsync((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind, stream);
-	_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-	if (cures)
-	{
-		/* do it in a synchronous fashion */
-		cures = cudaMemcpy((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind);
-
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-
-		return 0;
-	}
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
-
-	return -EAGAIN;
-}
-
-
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
 {
-	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
+	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 }
 
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
 {
-	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
+	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }
 
-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					void *dst_interface, unsigned dst_node, cudaStream_t stream)
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
 {
 	if (src_node == dst_node)
 	{
-		return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
+		return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
 	}
 	else
 	{
@@ -429,7 +389,6 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					v
 	}
 }
 
-
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL

+ 15 - 53
src/datawizard/interfaces/vector_interface.c

@@ -331,21 +331,15 @@ static void free_vector_buffer_on_node(void *data_interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
+static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream, enum cudaMemcpyKind kind)
 {
 	struct starpu_vector_interface *src_vector = src_interface;
 	struct starpu_vector_interface *dst_vector = dst_interface;
+	int ret;
 
-	cudaError_t cures;
-
-	cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
+	ret = starpu_cuda_copy_async_sync((void *)src_vector->ptr, src_node, (void *)dst_vector->ptr, dst_node, src_vector->nx*src_vector->elemsize, stream, kind);
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
-
-	return 0;
+	return ret;
 }
 
 #ifdef HAVE_CUDA_MEMCPY_PEER
@@ -385,24 +379,21 @@ static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
 }
 #endif
 
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
+	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyDeviceToHost);
 }
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
+	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyHostToDevice);
 }
 
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
 	if (src_node == dst_node)
 	{
-		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+		return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyDeviceToDevice);
 	}
 	else
 	{
@@ -416,38 +407,11 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRI
 	}
 }
 
-static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
-					cudaStream_t stream, enum cudaMemcpyKind kind)
-{
-	struct starpu_vector_interface *src_vector = src_interface;
-	struct starpu_vector_interface *dst_vector = dst_interface;
-
-	cudaError_t cures;
-
-	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, stream);
-	_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-	if (cures)
-	{
-		/* do it in a synchronous fashion */
-		cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind);
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-
-		return 0;
-	}
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
-
-	return -EAGAIN;
-}
-
 static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
 {
 	if (src_node == dst_node)
 	{
-		return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
+		return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
 	}
 	else
 	{
@@ -461,16 +425,14 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void
 	}
 }
 
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
 {
-	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
+	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 }
 
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
 {
-	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
+	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }
 
 #endif // STARPU_USE_CUDA

+ 2 - 4
src/datawizard/memalloc.c

@@ -249,8 +249,7 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, uint32_t node)
 			 * proper CUDA device in case it is needed. This avoids
 			 * having to set it again in the free method of each
 			 * interface. */
-			cudaError_t err = cudaSetDevice(_starpu_memory_node_to_devid(node));
-			STARPU_ASSERT(err == cudaSuccess);
+			starpu_cuda_set_device(_starpu_memory_node_to_devid(node));
 		}
 #endif
 
@@ -792,8 +791,7 @@ static ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _s
 			 * proper CUDA device in case it is needed. This avoids
 			 * having to set it again in the malloc method of each
 			 * interface. */
-			cudaError_t err = cudaSetDevice(_starpu_memory_node_to_devid(dst_node));
-			STARPU_ASSERT(err == cudaSuccess);
+			starpu_cuda_set_device(_starpu_memory_node_to_devid(dst_node));
 		}
 #endif
 

+ 0 - 1
src/drivers/cpu/driver_cpu.c

@@ -262,6 +262,5 @@ void *_starpu_cpu_worker(void *arg)
 
 	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CPU_KEY);
 
-	pthread_exit(NULL);
 	return NULL;
 }

+ 81 - 10
src/drivers/cuda/driver_cuda.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
  *
@@ -27,7 +27,7 @@
 #include <drivers/driver_common/driver_common.h>
 #include "driver_cuda.h"
 #include <core/sched_policy.h>
-#include <core/sched_ctx.h>
+#include <cuda_gl_interop.h>
 
 /* the number of CUDA devices */
 static int ncudagpus;
@@ -110,14 +110,38 @@ const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid
 	return &props[devid];
 }
 
-static void init_context(int devid)
+void starpu_cuda_set_device(int devid)
 {
 	cudaError_t cures;
-	int workerid = starpu_worker_get_id();
+	struct starpu_conf *conf = _starpu_get_machine_config()->conf;
+	unsigned i;
+
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	if (conf->n_cuda_opengl_interoperability) {
+		fprintf(stderr, "OpenGL interoperability was requested, but StarPU was built with multithread GPU control support, please reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
+		STARPU_ASSERT(0);
+	}
+#else
+	for (i = 0; i < conf->n_cuda_opengl_interoperability; i++)
+		if (conf->cuda_opengl_interoperability[i] == devid) {
+			cures = cudaGLSetGLDevice(devid);
+			goto done;
+		}
+#endif
 
 	cures = cudaSetDevice(devid);
+
+done:
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
+}
+
+static void init_context(int devid)
+{
+	cudaError_t cures;
+	int workerid = starpu_worker_get_id();
+
+	starpu_cuda_set_device(devid);
 
 	/* force CUDA to initialize the context for real */
 	cures = cudaFree(0);
@@ -233,9 +257,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	/* We make sure we do manipulate the proper device */
-	cures = cudaSetDevice(args->devid);
-	if (STARPU_UNLIKELY(cures != cudaSuccess))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	starpu_cuda_set_device(args->devid);
 #endif
 
 	starpu_cuda_func_t func = _starpu_task_get_cuda_nth_implementation(cl, j->nimpl);
@@ -400,10 +422,7 @@ void *_starpu_cuda_worker(void *arg)
 
 	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CUDA_KEY);
 
-	pthread_exit(NULL);
-
 	return NULL;
-
 }
 
 void starpu_cublas_report_error(const char *func, const char *file, int line, cublasStatus status)
@@ -446,3 +465,55 @@ void starpu_cuda_report_error(const char *func, const char *file, int line, cuda
 	printf("oops in %s (%s:%u)... %d: %s \n", func, file, line, status, errormsg);
 	STARPU_ASSERT(0);
 }
+
+int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind)
+{
+	cudaError_t cures = 0;
+
+	if (stream)
+	{
+	     _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+	     cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
+	     _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+	}
+	/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
+	if (stream == NULL || cures)
+	{
+		/* do it in a synchronous fashion */
+		cures = cudaMemcpy((char *)dst_ptr, (char *)src_ptr, ssize, kind);
+
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+
+		return 0;
+	}
+
+	return -EAGAIN;
+}
+
+int _starpu_run_cuda(struct starpu_driver *d)
+{
+	STARPU_ASSERT(d && d->type == STARPU_CUDA_WORKER);
+
+	int workers[d->id.cuda_id + 1];
+	int nworkers;
+	nworkers = starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER, workers, d->id.cuda_id+1);
+	if (nworkers >= 0 && (unsigned) nworkers < d->id.cuda_id)
+		return -ENODEV;
+	
+	_STARPU_DEBUG("Running cuda %d from the application\n", d->id.cuda_id);
+
+	struct _starpu_worker *workerarg = _starpu_get_worker_struct(workers[d->id.cuda_id]);
+
+	workerarg->set = NULL;
+	workerarg->worker_is_initialized = 0;
+
+	/* Let's go ! */
+	_starpu_cuda_worker(workerarg);
+
+	/* XXX: Should we wait for the driver to be ready, as it is done when
+	 * launching it the usual way ? Cf. the end of _starpu_launch_drivers()
+	 */
+
+	return 0;
+}
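
starpu_cuda_set_device() above is also where CUDA/OpenGL interoperability is handled: a device listed in conf->cuda_opengl_interoperability is selected with cudaGLSetGLDevice() rather than cudaSetDevice(), and the request is refused when the memcpy-peer (multithreaded GPU control) support is compiled in. A short sketch of how an application could ask for this before starpu_init(); the field types are assumed from the code above, the device number is illustrative, and the application is expected to have created its OpenGL context beforehand:

	struct starpu_conf conf;
	unsigned gl_devices[] = { 0 };	/* CUDA device(s) shared with OpenGL */
	int ret;

	starpu_conf_init(&conf);
	conf.cuda_opengl_interoperability = gl_devices;
	conf.n_cuda_opengl_interoperability = 1;

	ret = starpu_init(&conf);
	if (ret == -ENODEV)
		return 77;	/* no usable device */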

+ 1 - 1
src/drivers/gordon/driver_gordon.c

@@ -488,5 +488,5 @@ void *_starpu_gordon_worker(void *arg)
 	gordon_deinit();
 	_STARPU_DEBUG("gordon was deinited\n");
 
-	pthread_exit((void *)0x42);
+	return NULL;
 }

+ 39 - 2
src/drivers/opencl/driver_opencl.c

@@ -510,8 +510,6 @@ void *_starpu_opencl_worker(void *arg)
 
         _starpu_opencl_deinit_context(devid);
 
-	pthread_exit(NULL);
-
 	return NULL;
 }
 
@@ -595,3 +593,42 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 
 	return EXIT_SUCCESS;
 }
+
+int _starpu_run_opencl(struct starpu_driver *d)
+{
+	STARPU_ASSERT(d && d->type == STARPU_OPENCL_WORKER);
+
+	int nworkers;
+	int workers[STARPU_MAXOPENCLDEVS];
+	nworkers = starpu_worker_get_ids_by_type(STARPU_OPENCL_WORKER, workers, STARPU_MAXOPENCLDEVS);
+	if (nworkers == 0)
+		return -ENODEV;
+
+	int i;
+	for (i = 0; i < nworkers; i++)
+	{
+		cl_device_id device;
+		int devid = starpu_worker_get_devid(workers[i]);
+		starpu_opencl_get_device(devid, &device);
+		if (device == d->id.opencl_id)
+			break;
+	}
+
+	if (i == nworkers)
+		return -ENODEV;
+
+	struct _starpu_worker *workerarg = _starpu_get_worker_struct(i);
+	_STARPU_DEBUG("Running OpenCL %d from the application\n", workerarg->devid);
+
+	workerarg->set = NULL;
+	workerarg->worker_is_initialized = 0;
+
+	/* Let's go ! */
+	_starpu_opencl_worker(workerarg);
+
+	/* XXX: Should we wait for the driver to be ready, as it is done when
+	 * launching it the usual way ? Cf. the end of _starpu_launch_drivers()
+	 */
+
+	return 0;
+}
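
_starpu_run_cuda() above and _starpu_run_opencl() here are the internal entry points that let an application thread drive a CUDA or OpenCL worker itself, identified through a struct starpu_driver descriptor. A sketch of such a descriptor, based only on the fields these functions read; the public call that eventually dispatches to them is not shown in these hunks, so that part is left out:

	struct starpu_driver d;

	d.type = STARPU_CUDA_WORKER;	/* or STARPU_OPENCL_WORKER */
	d.id.cuda_id = 0;		/* first CUDA device; for OpenCL, d.id.opencl_id holds a cl_device_id */
	/* ... hand 'd' to the public driver-running entry point of this series ... */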

+ 0 - 2
src/drivers/opencl/driver_opencl_utils.c

@@ -103,11 +103,9 @@ cl_int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, str
 {
         cl_int err;
 	cl_device_id device;
-        cl_context context;
         cl_program program;
 
         starpu_opencl_get_device(devid, &device);
-        starpu_opencl_get_context(devid, &context);
         starpu_opencl_get_queue(devid, queue);
 
         program = opencl_programs->programs[devid];

+ 4 - 1
tests/Makefile.am

@@ -130,6 +130,7 @@ noinst_PROGRAMS =				\
 	main/wait_all_regenerable_tasks		\
 	main/subgraph_repeat			\
 	main/subgraph_repeat_regenerate		\
+	main/subgraph_repeat_regenerate_tag	\
 	main/empty_task				\
 	main/empty_task_sync_point		\
 	main/empty_task_sync_point_tasks	\
@@ -141,7 +142,8 @@ noinst_PROGRAMS =				\
 	main/declare_deps_after_submission_synchronous	\
 	main/get_current_task			\
 	main/starpu_init			\
-	main/starpu_worker_exists               \
+	main/starpu_worker_exists		\
+	main/submit				\
 	datawizard/acquire_cb			\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_release		\
@@ -207,6 +209,7 @@ noinst_PROGRAMS =				\
 	microbenchs/prefetch_data_on_node 	\
 	microbenchs/redundant_buffer		\
 	microbenchs/local_pingpong		\
+	microbenchs/matrix_as_vector		\
 	overlap/overlap				\
 	parallel_tasks/explicit_combined_worker	\
 	parallel_tasks/parallel_kernels		\

+ 3 - 3
tests/datawizard/gpu_register.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011-2012  Université de Bordeaux 1
  * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -95,7 +95,7 @@ test_cuda(void)
 	size = 10 * n;
 
 	devid = starpu_worker_get_devid(chosen);
-	cudaSetDevice(devid);
+	starpu_cuda_set_device(devid);
 	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
 
 	foo = calloc(size, sizeof(*foo));
@@ -133,7 +133,7 @@ test_cuda(void)
 	starpu_data_unpartition(handle, starpu_worker_get_memory_node(chosen));
 	starpu_data_unregister(handle);
 
-	cudaSetDevice(devid);
+	starpu_cuda_set_device(devid);
 	cures = cudaMemcpy(foo, foo_gpu, size * sizeof(*foo_gpu), cudaMemcpyDeviceToHost);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 5 - 6
tests/datawizard/interfaces/bcsr/bcsr_interface.c

@@ -171,12 +171,11 @@ int
 main(void)
 {
 	data_interface_test_summary *summary;
-	struct starpu_conf conf =
-	{
-		.ncpus   = -1,
-		.ncuda   = 2,
-		.nopencl = 1
-	};
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+
+	conf.ncuda = 2;
+	conf.nopencl = 1;
 
 	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
 		return STARPU_TEST_SKIPPED;
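
The test updates here and below all follow the same pattern: instead of a designated-initializer struct starpu_conf, which silently zeroes every field it does not mention, they start from starpu_conf_init() and override only the fields the test cares about. A minimal standalone sketch of that pattern; returning 77 mirrors the test suite's STARPU_TEST_SKIPPED and its numeric value is an assumption here:

	#include <starpu.h>

	int main(void)
	{
		struct starpu_conf conf;
		int ret;

		starpu_conf_init(&conf);	/* defaults, plus STARPU_* environment variables */
		conf.ncuda = 2;			/* override only what this test needs */
		conf.nopencl = 1;

		ret = starpu_init(&conf);
		if (ret == -ENODEV)
			return 77;		/* assumed value of STARPU_TEST_SKIPPED */

		starpu_shutdown();
		return 0;
	}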

+ 4 - 6
tests/datawizard/interfaces/block/block_interface.c

@@ -133,12 +133,10 @@ int
 main(void)
 {
 	data_interface_test_summary *summary;
-	struct starpu_conf conf =
-	{
-		.ncpus   = -1,
-		.ncuda   = 2,
-		.nopencl = 1
-	};
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.ncuda = 2;
+	conf.nopencl = 1;
 
 	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
 		goto enodev;

+ 5 - 6
tests/datawizard/interfaces/csr/csr_interface.c

@@ -141,12 +141,11 @@ int
 main(void)
 {
 	data_interface_test_summary *summary;
-	struct starpu_conf conf =
-	{
-		.ncpus   = -1,
-		.ncuda   = 2,
-		.nopencl = 1
-	};
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+
+	conf.ncuda = 2;
+	conf.nopencl = 1;
 
 	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
 		goto enodev;

+ 4 - 6
tests/datawizard/interfaces/matrix/matrix_interface.c

@@ -116,12 +116,10 @@ int
 main(void)
 {
 	data_interface_test_summary *summary;
-	struct starpu_conf conf =
-	{
-		.ncpus   = -1,
-		.ncuda   = 2,
-		.nopencl = 1
-	};
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.ncuda = 2;
+	conf.nopencl = 1;
 
 	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
 		goto enodev;

+ 5 - 6
tests/datawizard/interfaces/multiformat/advanced/multiformat_cuda_opencl.c

@@ -105,12 +105,11 @@ main(void)
 {
 #if defined(STARPU_USE_CUDA) && defined(STARPU_USE_OPENCL)
 	int ret;
-	struct starpu_conf conf =
-	{
-		.ncpus   = -1,
-		.ncuda   = 1,
-		.nopencl = 1
-	};
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+
+	conf.ncuda = 1;
+	conf.nopencl = 1;
 
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV)

+ 5 - 6
tests/datawizard/interfaces/multiformat/advanced/multiformat_data_release.c

@@ -123,12 +123,11 @@ main(void)
 {
 #ifdef STARPU_USE_CPU
 	int ret;
-	struct starpu_conf conf =
-	{
-		.ncpus = -1,
-		.ncuda = 1,
-		.nopencl = 1
-	};
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+
+	conf.ncuda = 1;
+	conf.nopencl = 1;
 	memset(&global_stats, 0, sizeof(global_stats));
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV || starpu_cpu_worker_get_count() == 0) return STARPU_TEST_SKIPPED;

+ 5 - 6
tests/datawizard/interfaces/multiformat/advanced/multiformat_handle_conversion.c

@@ -203,12 +203,11 @@ main(void)
 {
 #ifdef STARPU_USE_CPU
 	int ret;
-	struct starpu_conf conf =
-	{
-		.ncpus   = -1,
-		.ncuda   = 2,
-		.nopencl = 1
-	};
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+
+	conf.ncuda = 2;
+	conf.nopencl = 1;
 
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;

+ 4 - 6
tests/datawizard/interfaces/multiformat/multiformat_interface.c

@@ -136,12 +136,10 @@ main(void)
 #ifdef STARPU_USE_CPU
 	int ret;
 	data_interface_test_summary *summary;
-	struct starpu_conf conf =
-	{
-		.ncpus   = -1,
-		.ncuda   = 2,
-		.nopencl = 1
-	};
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.ncuda = 2;
+	conf.nopencl = 1;
 
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV || starpu_cpu_worker_get_count() == 0)

+ 5 - 6
tests/datawizard/interfaces/variable/variable_interface.c

@@ -93,12 +93,11 @@ main(void)
 	int ret;
 	data_interface_test_summary *summary;
 
-	struct starpu_conf conf =
-	{
-		.ncpus = -1,
-		.ncuda = 2,
-		.nopencl = 1
-	};
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+
+	conf.ncuda = 2;
+	conf.nopencl = 1;
 
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV || starpu_cpu_worker_get_count() == 0)

+ 4 - 6
tests/datawizard/interfaces/vector/test_vector_interface.c

@@ -103,12 +103,10 @@ int
 main(void)
 {
 	data_interface_test_summary *summary;
-	struct starpu_conf conf =
-	{
-		.ncpus   = -1,
-		.ncuda   = 2,
-		.nopencl = 1
-	};
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.ncuda = 2;
+	conf.nopencl = 1;
 
 	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
 		goto enodev;

+ 4 - 6
tests/datawizard/interfaces/void/void_interface.c

@@ -59,12 +59,10 @@ int
 main(void)
 {
 	data_interface_test_summary *summary;
-	struct starpu_conf conf =
-	{
-		.ncpus   = -1,
-		.ncuda   = 2,
-		.nopencl = 1
-	};
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.ncuda = 2;
+	conf.nopencl = 1;
 
 	if (starpu_init(&conf) == -ENODEV || starpu_cpu_worker_get_count() == 0)
 		goto enodev;

+ 2 - 1
tests/datawizard/write_only_tmp_buffer.c

@@ -42,6 +42,7 @@ static void opencl_codelet_null(void *descr[], __attribute__ ((unused)) void *_a
 
         starpu_opencl_get_queue(devid, &queue);
         clEnqueueWriteBuffer(queue, buf, CL_TRUE, 0, sizeof(char), &ptr, 0, NULL, NULL);
+	clFinish(queue);
 }
 #endif
 
@@ -72,7 +73,7 @@ static void display_var(void *descr[], __attribute__ ((unused)) void *_args)
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	if (*buf != 42)
 	{
-		FPRINTF(stderr, "Value = %c (should be %c)\n", *buf, 42);
+		FPRINTF(stderr, "Value = '%c' (should be '%c')\n", *buf, 42);
 		exit(-1);
 	}
 }

+ 4 - 2
tests/errorcheck/invalid_tasks.c

@@ -48,8 +48,10 @@ int main(int argc, char **argv)
 	unsetenv("STARPU_NOPENCL");
 	unsetenv("STARPU_NCPUS");
 	struct starpu_conf conf;
-	memset(&conf, 0, sizeof(conf));
+	starpu_conf_init(&conf);
 	conf.ncpus = 1;
+	conf.nopencl = 0;
+	conf.ncuda = 0;
 
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
@@ -58,7 +60,7 @@ int main(int argc, char **argv)
 	struct starpu_task *task = starpu_task_create();
 	task->cl = &gpu_only_cl;
 
-	/* Only a CUDA device could execute that task ! */
+	/* Only a GPU device could execute that task ! */
 	ret = starpu_task_submit(task);
 	STARPU_ASSERT(ret == -ENODEV);
 

+ 7 - 13
tests/errorcheck/starpu_init_noworker.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -43,18 +43,12 @@ int main(int argc, char **argv)
 	unset_env_variables();
 
 	/* We try to initialize StarPU without any worker */
-	struct starpu_conf conf =
-	{
-		.sched_policy_name = NULL, /* default */
-		.ncpus = 0,
-		.ncuda = 0,
-                .nopencl = 0,
-		.nspus = 0,
-		.use_explicit_workers_bindid = 0,
-		.use_explicit_workers_cuda_gpuid = 0,
-		.use_explicit_workers_opencl_gpuid = 0,
-		.calibrate = 0
-	};
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.ncpus = 0;
+	conf.ncuda = 0;
+	conf.nopencl = 0;
+	conf.nspus = 0;
 
 	/* starpu_init should return -ENODEV */
 	ret = starpu_init(&conf);

+ 1 - 1
tests/experiments/latency/cuda_latency.c

@@ -113,7 +113,7 @@ void *launch_gpu_thread(void *arg)
 	unsigned *idptr = arg;
 	unsigned id = *idptr;
 
-	cudaSetDevice(id);
+	starpu_cuda_set_device(id);
 	cudaFree(0);
 
 	cudaMalloc(&gpu_buffer[id], buffer_size);

+ 7 - 17
tests/main/starpu_task_wait_for_all.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -69,26 +69,13 @@ static int inject_one_task(void)
 	return ret;
 }
 
-static struct starpu_conf conf =
-{
-	.sched_policy_name = NULL,
-	.ncpus = -1,
-	.ncuda = -1,
-        .nopencl = -1,
-	.nspus = -1,
-	.use_explicit_workers_bindid = 0,
-	.use_explicit_workers_cuda_gpuid = 0,
-	.use_explicit_workers_opencl_gpuid = 0,
-	.calibrate = 0
-};
-
 static void usage(char **argv)
 {
 	FPRINTF(stderr, "%s [-i ntasks] [-p sched_policy] [-h]\n", argv[0]);
 	exit(-1);
 }
 
-static void parse_args(int argc, char **argv)
+static void parse_args(int argc, char **argv, struct starpu_conf *conf)
 {
 	int c;
 	while ((c = getopt(argc, argv, "i:p:h")) != -1)
@@ -98,7 +85,7 @@ static void parse_args(int argc, char **argv)
 			ntasks = atoi(optarg);
 			break;
 		case 'p':
-			conf.sched_policy_name = optarg;
+			conf->sched_policy_name = optarg;
 			break;
 		case 'h':
 			usage(argv);
@@ -113,8 +100,11 @@ int main(int argc, char **argv)
 	struct timeval start;
 	struct timeval end;
 	int ret;
+	struct starpu_conf conf;
+
+	starpu_conf_init(&conf);
 
-	parse_args(argc, argv);
+	parse_args(argc, argv, &conf);
 
 #ifdef STARPU_SLOW_MACHINE
 	ntasks /= 10;

+ 173 - 0
tests/main/subgraph_repeat_regenerate_tag.c

@@ -0,0 +1,173 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <sys/time.h>
+#include <starpu.h>
+#include <pthread.h>
+
+#include "../helper.h"
+
+static unsigned niter = 16384;
+
+#define TAG_START 0
+#define TAG_A 1
+#define TAG_B 2
+#define TAG_C 3
+#define TAG_D 4
+
+/*
+ *
+ *		    /-->B--\
+ *		    |      |
+ *	     -----> A      D---\--->
+ *		^   |      |   |
+ *		|   \-->C--/   |
+ *		|              |
+ *		\--------------/
+ *
+ *	- {B, C} depend on A
+ *	- D depends on {B, C}
+ *	- A, B, C and D are resubmitted at the end of the loop (or not)
+ */
+
+static struct starpu_task taskA, taskB, taskC, taskD;
+
+static unsigned loop_cnt = 0;
+static unsigned check_cnt = 0;
+static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+	(void) STARPU_ATOMIC_ADD(&check_cnt, 1);
+}
+
+static struct starpu_codelet dummy_codelet =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {dummy_func, NULL},
+	.cuda_funcs = {dummy_func, NULL},
+	.opencl_funcs = {dummy_func, NULL},
+	.model = NULL,
+	.nbuffers = 0
+};
+
+static void callback_task_D(void *arg __attribute__((unused)))
+{
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	loop_cnt++;
+
+	if (loop_cnt == niter)
+	{
+		/* We are done */
+		taskD.regenerate = 0;
+		_STARPU_PTHREAD_COND_SIGNAL(&cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+	}
+	else
+	{
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		/* Let's go for another iteration */
+		starpu_tag_restart((starpu_tag_t) TAG_START);
+		starpu_tag_notify_from_apps((starpu_tag_t)TAG_START);
+	}
+}
+
+int main(int argc, char **argv)
+{
+//	unsigned i;
+//	double timing;
+//	struct timeval start;
+//	struct timeval end;
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Implicit data dependencies and regeneratable tasks are not compatible */
+	starpu_data_set_default_sequential_consistency_flag(0);
+
+	starpu_task_init(&taskA);
+	taskA.cl = &dummy_codelet;
+	taskA.cl_arg = &taskA;
+	taskA.regenerate = 1; /* this task will be explicitly resubmitted if needed */
+	taskA.use_tag = 1;
+	taskA.tag_id = TAG_A;
+
+	starpu_task_init(&taskB);
+	taskB.cl = &dummy_codelet;
+	taskB.cl_arg = &taskB;
+	taskB.regenerate = 1;
+	taskB.use_tag = 1;
+	taskB.tag_id = TAG_B;
+
+	starpu_task_init(&taskC);
+	taskC.cl = &dummy_codelet;
+	taskC.cl_arg = &taskC;
+	taskC.regenerate = 1;
+	taskC.use_tag = 1;
+	taskC.tag_id = TAG_C;
+
+	starpu_task_init(&taskD);
+	taskD.cl = &dummy_codelet;
+	taskD.cl_arg = &taskD;
+	taskD.callback_func = callback_task_D;
+	taskD.regenerate = 1;
+	taskD.use_tag = 1;
+	taskD.tag_id = TAG_D;
+
+	starpu_tag_declare_deps((starpu_tag_t) TAG_A, 1, (starpu_tag_t) TAG_START);
+
+	starpu_tag_declare_deps((starpu_tag_t) TAG_B, 1, (starpu_tag_t) TAG_A);
+	starpu_tag_declare_deps((starpu_tag_t) TAG_C, 1, (starpu_tag_t) TAG_A);
+
+	starpu_tag_declare_deps((starpu_tag_t) TAG_D, 2, (starpu_tag_t) TAG_B, (starpu_tag_t) TAG_C);
+
+	ret = starpu_task_submit(&taskA); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	ret = starpu_task_submit(&taskB); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	ret = starpu_task_submit(&taskC); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	ret = starpu_task_submit(&taskD); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	starpu_tag_notify_from_apps((starpu_tag_t) TAG_START);
+
+	/* Wait for the termination of all loops */
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	if (loop_cnt < niter)
+		_STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	STARPU_ASSERT(check_cnt == (4*loop_cnt));
+
+	starpu_shutdown();
+
+	/* Cleanup the statically allocated tasks after shutdown, as StarPU is still working on it after the callback */
+	starpu_task_deinit(&taskA);
+	starpu_task_deinit(&taskB);
+	starpu_task_deinit(&taskC);
+	starpu_task_deinit(&taskD);
+
+	return EXIT_SUCCESS;
+
+enodev:
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}
+

+ 158 - 0
tests/main/subgraph_repeat_tag.c

@@ -0,0 +1,158 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <sys/time.h>
+#include <starpu.h>
+#include <pthread.h>
+
+#include "../helper.h"
+
+static unsigned niter = 16384;
+
+/*
+ *
+ *		    /-->B--\
+ *		    |      |
+ *	     -----> A      D---\--->
+ *		^   |      |   |
+ *		|   \-->C--/   |
+ *		|              |
+ *		\--------------/
+ *
+ *	- {B, C} depend on A
+ *	- D depends on {B, C}
+ *	- A, B, C and D are resubmitted at the end of the loop (or not)
+ */
+
+static struct starpu_task taskA, taskB, taskC, taskD;
+
+static unsigned loop_cnt = 0;
+static unsigned check_cnt = 0;
+static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+	(void) STARPU_ATOMIC_ADD(&check_cnt, 1);
+}
+
+static struct starpu_codelet dummy_codelet =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {dummy_func, NULL},
+	.cuda_funcs = {dummy_func, NULL},
+	.opencl_funcs = {dummy_func, NULL},
+	.model = NULL,
+	.nbuffers = 0
+};
+
+static void callback_task_D(void *arg __attribute__((unused)))
+{
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	loop_cnt++;
+
+	if (loop_cnt == niter)
+	{
+		/* We are done */
+		taskD.regenerate = 0;
+		_STARPU_PTHREAD_COND_SIGNAL(&cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+	}
+	else
+	{
+		int ret;
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		/* Let's go for another iteration */
+		ret = starpu_task_submit(&taskA);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+}
+
+int main(int argc, char **argv)
+{
+//	unsigned i;
+//	double timing;
+//	struct timeval start;
+//	struct timeval end;
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Implicit data dependencies and regeneratable tasks are not compatible */
+	starpu_data_set_default_sequential_consistency_flag(0);
+
+	starpu_task_init(&taskA);
+	taskA.cl = &dummy_codelet;
+	taskA.cl_arg = &taskA;
+	taskA.regenerate = 0; /* this task will be explicitly resubmitted if needed */
+
+	starpu_task_init(&taskB);
+	taskB.cl = &dummy_codelet;
+	taskB.cl_arg = &taskB;
+	taskB.regenerate = 1;
+
+	starpu_task_init(&taskC);
+	taskC.cl = &dummy_codelet;
+	taskC.cl_arg = &taskC;
+	taskC.regenerate = 1;
+
+	starpu_task_init(&taskD);
+	taskD.cl = &dummy_codelet;
+	taskD.cl_arg = &taskD;
+	taskD.callback_func = callback_task_D;
+	taskD.regenerate = 1;
+
+	struct starpu_task *depsBC_array[1] = {&taskA};
+	starpu_task_declare_deps_array(&taskB, 1, depsBC_array);
+	starpu_task_declare_deps_array(&taskC, 1, depsBC_array);
+
+	struct starpu_task *depsD_array[2] = {&taskB, &taskC};
+	starpu_task_declare_deps_array(&taskD, 2, depsD_array);
+
+	ret = starpu_task_submit(&taskA); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	ret = starpu_task_submit(&taskB); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	ret = starpu_task_submit(&taskC); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	ret = starpu_task_submit(&taskD); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* Wait for the termination of all loops */
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	if (loop_cnt < niter)
+		_STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	STARPU_ASSERT(check_cnt == (4*loop_cnt));
+
+	starpu_shutdown();
+
+	/* Cleanup the statically allocated tasks after shutdown, as StarPU is still working on it after the callback */
+	starpu_task_deinit(&taskA);
+	starpu_task_deinit(&taskB);
+	starpu_task_deinit(&taskC);
+	starpu_task_deinit(&taskD);
+
+	return EXIT_SUCCESS;
+
+enodev:
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}
+

+ 115 - 0
tests/main/submit.c

@@ -0,0 +1,115 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <sys/time.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <starpu.h>
+#include "../helper.h"
+
+static int i = 0, j;
+
+static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+	i++;
+	FPRINTF(stdout, "called third task, i = %d\n", i);
+}
+
+static struct starpu_codelet dummy_codelet = 
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {dummy_func, NULL},
+	.cuda_funcs = {dummy_func, NULL},
+	.opencl_funcs = {dummy_func, NULL},
+	.model = NULL,
+	.nbuffers = 0
+};
+
+static void callback(void *arg __attribute__ ((unused)))
+{
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &dummy_codelet;
+	task->detach = 1;
+	if (starpu_task_submit(task) == -ENODEV)
+		exit(STARPU_TEST_SKIPPED);
+	FPRINTF(stdout, "submitted third task, i = %d\n", i);
+}
+
+static struct starpu_codelet callback_submit_codelet = 
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {dummy_func, NULL},
+	.cuda_funcs = {dummy_func, NULL},
+	.opencl_funcs = {dummy_func, NULL},
+	.model = NULL,
+	.nbuffers = 0
+};
+
+static void task_submit_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &callback_submit_codelet;
+	task->callback_func = callback;
+	task->detach = 1;
+	if (starpu_task_submit(task) == -ENODEV)
+		exit(STARPU_TEST_SKIPPED);
+	i++;
+	FPRINTF(stdout, "submitted second task, i = %d\n", i);
+}
+
+static struct starpu_codelet task_submit_codelet = 
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {task_submit_func, NULL},
+	.cuda_funcs = {task_submit_func, NULL},
+	.opencl_funcs = {task_submit_func, NULL},
+	.model = NULL,
+	.nbuffers = 0
+};
+
+int main(int argc, char **argv)
+{
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &task_submit_codelet;
+	task->detach = 1;
+
+	ret = starpu_task_submit(task);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	starpu_task_wait_for_all();
+	j = i;
+
+	starpu_shutdown();
+
+	return j == 3 ? EXIT_SUCCESS : EXIT_FAILURE;
+
+enodev:
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}

+ 6 - 16
tests/microbenchs/async_tasks_overhead.c

@@ -76,26 +76,13 @@ static void init_gordon_kernel(void)
 //	STARPU_ASSERT(!ret);
 //}
 
-static struct starpu_conf conf =
-{
-	.sched_policy_name = NULL,
-	.ncpus = -1,
-	.ncuda = -1,
-        .nopencl = -1,
-	.nspus = -1,
-	.use_explicit_workers_bindid = 0,
-	.use_explicit_workers_cuda_gpuid = 0,
-	.use_explicit_workers_opencl_gpuid = 0,
-	.calibrate = 0
-};
-
 static void usage(char **argv)
 {
 	fprintf(stderr, "%s [-i ntasks] [-p sched_policy] [-h]\n", argv[0]);
 	exit(-1);
 }
 
-static void parse_args(int argc, char **argv)
+static void parse_args(int argc, char **argv, struct starpu_conf *conf)
 {
 	int c;
 	while ((c = getopt(argc, argv, "i:p:h")) != -1)
@@ -105,7 +92,7 @@ static void parse_args(int argc, char **argv)
 			ntasks = atoi(optarg);
 			break;
 		case 'p':
-			conf.sched_policy_name = optarg;
+			conf->sched_policy_name = optarg;
 			break;
 		case 'h':
 			usage(argv);
@@ -121,7 +108,10 @@ int main(int argc, char **argv)
 	struct timeval start;
 	struct timeval end;
 
-	parse_args(argc, argv);
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+
+	parse_args(argc, argv, &conf);
 
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;

+ 224 - 0
tests/microbenchs/matrix_as_vector.c

@@ -0,0 +1,224 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+#include <sys/time.h>
+
+#ifdef STARPU_USE_CUDA
+#  include <cublas.h>
+#endif
+
+#define LOOPS 100
+
+void vector_cpu_func(void *descr[], void *cl_arg __attribute__((unused)))
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	float *matrix = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+	int nx = STARPU_VECTOR_GET_NX(descr[0]);
+	int i;
+	float sum=0;
+
+	for(i=0 ; i<nx ; i++) sum+=i;
+	matrix[0] = sum/nx;
+}
+
+void vector_cuda_func(void *descr[], void *cl_arg __attribute__((unused)))
+{
+#ifdef STARPU_USE_CUDA
+	STARPU_SKIP_IF_VALGRIND;
+
+	float *matrix = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+	int nx = STARPU_VECTOR_GET_NX(descr[0]);
+
+	float sum = cublasSasum(nx, matrix, 1);
+	cudaThreadSynchronize();
+	sum /= nx;
+
+	cudaMemcpy(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice);
+	cudaThreadSynchronize();
+#endif /* STARPU_USE_CUDA */
+}
+
+void matrix_cpu_func(void *descr[], void *cl_arg __attribute__((unused)))
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	float *matrix = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	int nx = STARPU_MATRIX_GET_NX(descr[0]);
+	int ny = STARPU_MATRIX_GET_NY(descr[0]);
+	int i;
+	float sum=0;
+
+	for(i=0 ; i<nx*ny ; i++) sum+=i;
+	matrix[0] = sum / (nx*ny);
+}
+
+void matrix_cuda_func(void *descr[], void *cl_arg __attribute__((unused)))
+{
+#ifdef STARPU_USE_CUDA
+	STARPU_SKIP_IF_VALGRIND;
+
+	float *matrix = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	int nx = STARPU_MATRIX_GET_NX(descr[0]);
+	int ny = STARPU_MATRIX_GET_NY(descr[0]);
+
+	float sum = cublasSasum(nx*ny, matrix, 1);
+	cudaThreadSynchronize();
+	sum /= nx*ny;
+
+	cudaMemcpy(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice);
+	cudaThreadSynchronize();
+#endif /* STARPU_USE_CUDA */
+}
+
+int check_size(int nx, struct starpu_codelet *vector_codelet, struct starpu_codelet *matrix_codelet, char *device_name)
+{
+	float *matrix, mean;
+	starpu_data_handle_t vector_handle, matrix_handle;
+	int ret, i, loop;
+	double vector_timing, matrix_timing;
+	struct timeval start;
+	struct timeval end;
+
+	matrix = malloc(nx*sizeof(matrix[0]));
+
+	gettimeofday(&start, NULL);
+	for(loop=1 ; loop<=LOOPS ; loop++)
+	{
+		for(i=0 ; i<nx ; i++) matrix[i] = i;
+		starpu_vector_data_register(&vector_handle, 0, (uintptr_t)matrix, nx, sizeof(matrix[0]));
+		ret = starpu_insert_task(vector_codelet, STARPU_RW, vector_handle, 0);
+		starpu_data_unregister(vector_handle);
+		if (ret == -ENODEV) return ret;
+	}
+	gettimeofday(&end, NULL);
+
+	vector_timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	vector_timing /= LOOPS;
+	mean = matrix[0];
+
+	gettimeofday(&start, NULL);
+	for(loop=1 ; loop<=LOOPS ; loop++)
+	{
+		for(i=0 ; i<nx ; i++) matrix[i] = i;
+		starpu_matrix_data_register(&matrix_handle, 0, (uintptr_t)matrix, nx/2, nx/2, 2, sizeof(matrix[0]));
+		ret = starpu_insert_task(matrix_codelet, STARPU_RW, matrix_handle, 0);
+		starpu_data_unregister(matrix_handle);
+		if (ret == -ENODEV) return ret;
+	}
+	gettimeofday(&end, NULL);
+
+	matrix_timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	matrix_timing /= LOOPS;
+
+	if (mean == matrix[0])
+	{
+		fprintf(stderr, "%d\t%f\t%f\n", nx, vector_timing, matrix_timing);
+
+		{
+			char *output_dir = getenv("STARPU_BENCH_DIR");
+			char *bench_id = getenv("STARPU_BENCH_ID");
+			if (output_dir && bench_id)
+			{
+				char file[1024];
+				FILE *f;
+				sprintf(file, "%s/matrix_as_vector_%s.dat", output_dir, device_name);
+				f = fopen(file, "a");
+				fprintf(f, "%s\t%d\t%f\t%f\n", bench_id, nx, vector_timing, matrix_timing);
+				fclose(f);
+			}
+		}
+
+		return EXIT_SUCCESS;
+	}
+	else
+	{
+		FPRINTF(stderr, "Incorrect result nx=%7d --> mean=%7f != %7f\n", nx, matrix[0], mean);
+		return EXIT_FAILURE;
+	}
+}
+
+#define NX_MIN 1024
+#define NX_MAX 1024*1024
+
+int check_size_on_device(uint32_t where, char *device_name)
+{
+	int nx, ret;
+	struct starpu_codelet vector_codelet;
+	struct starpu_codelet matrix_codelet;
+
+	fprintf(stderr, "# Device: %s\n", device_name);
+	fprintf(stderr, "# nx vector_timing matrix_timing\n");
+	starpu_codelet_init(&vector_codelet);
+	vector_codelet.modes[0] = STARPU_RW;
+	vector_codelet.nbuffers = 1;
+	if (where == STARPU_CPU) vector_codelet.cpu_funcs[0] = vector_cpu_func;
+	if (where == STARPU_CUDA) vector_codelet.cuda_funcs[0] = vector_cuda_func;
+//	if (where == STARPU_OPENCL) vector_codelet.opencl_funcs[0] = vector_opencl_func;
+
+	starpu_codelet_init(&matrix_codelet);
+	matrix_codelet.modes[0] = STARPU_RW;
+	matrix_codelet.nbuffers = 1;
+	if (where == STARPU_CPU) matrix_codelet.cpu_funcs[0] = matrix_cpu_func;
+	if (where == STARPU_CUDA) matrix_codelet.cuda_funcs[0] = matrix_cuda_func;
+//	if (where == STARPU_OPENCL) matrix_codelet.opencl_funcs[0] = matrix_opencl_func;
+
+	for(nx=NX_MIN ; nx<=NX_MAX ; nx*=2)
+	{
+		ret = check_size(nx, &vector_codelet, &matrix_codelet, device_name);
+		if (ret != EXIT_SUCCESS) break;
+	}
+	return ret;
+
+}
+
+int main(int argc, char **argv)
+{
+	int ret;
+	unsigned devices;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	devices = starpu_cpu_worker_get_count();
+	if (devices)
+	{
+		ret = check_size_on_device(STARPU_CPU, "STARPU_CPU");
+		if (ret) goto error;
+	}
+	devices = starpu_cuda_worker_get_count();
+	if (devices)
+	{
+		starpu_helper_cublas_init();
+		ret = check_size_on_device(STARPU_CUDA, "STARPU_CUDA");
+		starpu_helper_cublas_shutdown();
+		if (ret) goto error;
+	}
+	devices = starpu_opencl_worker_get_count();
+	if (devices)
+	{
+		ret = check_size_on_device(STARPU_OPENCL, "STARPU_OPENCL");
+		if (ret) goto error;
+	}
+
+error:
+	if (ret == -ENODEV) ret=STARPU_TEST_SKIPPED;
+	starpu_shutdown();
+	STARPU_RETURN(ret);
+}

+ 5 - 13
tests/parallel_tasks/explicit_combined_worker.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -57,18 +57,10 @@ int main(int argc, char **argv)
 	unsigned *v;
 	int ret;
 
-//      struct starpu_conf conf =
-//	{
-//                .sched_policy_name = "pheft",
-//                .ncpus = -1,
-//                .ncuda = -1,
-//                .nopencl = -1,
-//                .nspus = -1,
-//                .use_explicit_workers_bindid = 0,
-//                .use_explicit_workers_cuda_gpuid = 0,
-//                .use_explicit_workers_opencl_gpuid = 0,
-//                .calibrate = -1
-//        };
+//      struct starpu_conf conf;
+//      starpu_conf_init(&conf);
+//      conf.sched_policy_name = "pheft";
+//      conf.calibrate = 1;
 
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;

+ 4 - 12
tests/parallel_tasks/parallel_kernels.c

@@ -64,18 +64,10 @@ int main(int argc, char **argv)
 	starpu_data_handle_t v_handle;
 	unsigned *v;
 
-        struct starpu_conf conf =
-	{
-                .sched_policy_name = "pheft",
-                .ncpus = -1,
-                .ncuda = -1,
-                .nopencl = -1,
-                .nspus = -1,
-                .use_explicit_workers_bindid = 0,
-                .use_explicit_workers_cuda_gpuid = 0,
-                .use_explicit_workers_opencl_gpuid = 0,
-                .calibrate = 1
-        };
+        struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.sched_policy_name = "pheft";
+	conf.calibrate = 1;
 
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;

+ 4 - 12
tests/parallel_tasks/parallel_kernels_spmd.c

@@ -66,18 +66,10 @@ int main(int argc, char **argv)
 	starpu_data_handle_t v_handle;
 	unsigned *v;
 
-        struct starpu_conf conf =
-	{
-                .sched_policy_name = "pheft",
-                .ncpus = -1,
-                .ncuda = -1,
-                .nopencl = -1,
-                .nspus = -1,
-                .use_explicit_workers_bindid = 0,
-                .use_explicit_workers_cuda_gpuid = 0,
-                .use_explicit_workers_opencl_gpuid = 0,
-                .calibrate = 1
-        };
+        struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.sched_policy_name = "pheft";
+	conf.calibrate = 1;
 
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;

+ 2 - 2
tools/starpu_perfmodel_plot.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011-2012  Université de Bordeaux 1
  * Copyright (C) 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
@@ -321,7 +321,7 @@ static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel
 	fprintf(gnuplot_file, "set term postscript eps enhanced color\n");
 	fprintf(gnuplot_file, "set output \"starpu_%s.eps\"\n", symbol);
 	fprintf(gnuplot_file, "set title \"Model for codelet %s\"\n", symbol);
-	fprintf(gnuplot_file, "set xlabel \"Size\"\n");
+	fprintf(gnuplot_file, "set xlabel \"Total data size\"\n");
 	fprintf(gnuplot_file, "set ylabel \"Time (ms)\"\n");
 	fprintf(gnuplot_file, "\n");
 	fprintf(gnuplot_file, "set key top left\n");

+ 3 - 3
tools/starpu_workers_activity.in

@@ -22,11 +22,11 @@ usage()
 {
     echo "Offline tool to display the activity of the workers during the execution."
     echo ""
-    echo "  The fxt_tool utility now generates a file named 'activity.data' which can"
-    echo "  be processed by this script to generate a plot named activity.eps"
+    echo "  The starpu_fxt_tool utility now generates a file named 'activity.data' which"
+    echo "  can be processed by this script to generate a plot named activity.eps"
     echo ""
     echo "  Typical usage:"
-    echo "     ./fxt_tool -i /tmp/prof_file_foo"
+    echo "     ./starpu_fxt_tool -i /tmp/prof_file_foo"
     echo "     $PROGNAME activity.data"
     echo ""
     echo "Options:"