
Add OpenGL interoperability support

Samuel Thibault 13 years ago
parent commit acce0b71ce

+ 1 - 0
configure.ac

@@ -1312,6 +1312,7 @@ fi
 AC_MSG_CHECKING(whether OpenGL rendering is enabled)
 AC_SUBST(STARPU_OPENGL_RENDER, $enable_opengl_render)
 AC_MSG_RESULT($enable_opengl_render)
+AM_CONDITIONAL([HAVE_OPENGL], [test "x$enable_opengl_render" = xyes])
 
 AC_PATH_XTRA
 if test "x$no_x" != "xyes"; then

+ 18 - 10
doc/chapters/advanced-examples.texi

@@ -913,8 +913,22 @@ Graphical-oriented applications need to draw the result of their computations,
 typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
 interoperability permit to let CUDA directly work on the OpenGL buffers, making
 them thus immediately ready for drawing, by mapping OpenGL buffer, textures or
-renderbuffer objects into CUDA. To achieve this with StarPU, it simply needs to
-be given the CUDA pointer at registration, for instance:
+renderbuffer objects into CUDA.  CUDA however imposes some technical
+constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
+to be the one that runs CUDA computations for that GPU.
+
+To achieve this with StarPU, pass the @code{--disable-cuda-memcpy-peer} option
+to @code{./configure} (TODO: make it dynamic), enable the interoperability mode
+through the @code{cuda_opengl_interoperability} field of the @code{starpu_conf}
+structure, and run the driver loop from the application itself: use the
+@code{not_launched_drivers} field of @code{starpu_conf} to prevent StarPU from
+running it in a separate thread, and call @code{starpu_run_driver} to run the
+loop. The @code{gl_interop} example shows how these pieces fit together in a
+simple case, where rendering is done in task callbacks. TODO: provide a
+glutIdleFunc alternative.
+
+Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be given
+the CUDA pointer at registration, for instance:
 
 @cartouche
 @smallexample
@@ -922,21 +936,15 @@ for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
         if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
                 break;
 
-cudaSetDevice(starpu_worker_get_devid(workerid));
 cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
 starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid), output, num_bytes / sizeof(float4), sizeof(float4));
 
 starpu_insert_task(&cl, STARPU_RW, handle, 0);
-
-starpu_data_unregister(handle);
-
-cudaSetDevice(starpu_worker_get_devid(workerid));
-cudaGraphicsUnmapResources(1, &resource, 0);
-
-/* Now display it */
 @end smallexample
 @end cartouche
 
+and display it, for instance in the task callback function.
+
 @node More examples
 @section More examples
 

+ 15 - 0
doc/chapters/basic-api.texi

@@ -141,6 +141,15 @@ The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 
+@item @code{int *cuda_opengl_interoperability} (default = NULL)
+This can be set to an array of CUDA device identifiers for which
+@code{cudaGLSetGLDevice} should be called instead of @code{cudaSetDevice}. Its
+size is specified by the @code{n_cuda_opengl_interoperability} field below.
+
+@item @code{unsigned n_cuda_opengl_interoperability} (default = 0)
+This has to be set to the size of the array pointed to by the
+@code{cuda_opengl_interoperability} field.
+
 @item @code{struct starpu_driver *not_launched_drivers}
 The drivers that should not be launched by StarPU.
 
@@ -2158,6 +2167,12 @@ successfull. It returns 0 if the synchronous copy was successful, or
 fails otherwise.
 @end deftypefun
 
+@deftypefun void starpu_cuda_set_device (int @var{devid})
+Calls @code{cudaSetDevice(devid)} or @code{cudaGLSetGLDevice(devid)}, according to
+whether @code{devid} appears in the @code{cuda_opengl_interoperability} array of
+the @code{starpu_conf} structure.
+@end deftypefun
+
 @deftypefun void starpu_helper_cublas_init (void)
 This function initializes CUBLAS on every CUDA device.
 The CUBLAS library must be initialized prior to any CUBLAS call. Calling
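
As a usage sketch (the allocate_on_worker helper below is hypothetical; it assumes <starpu_cuda.h> and the CUDA runtime headers are included), application code that manipulates a StarPU-managed CUDA device directly can go through this wrapper so that the interoperability setting is honoured, as the gpu_register.c test below now does:

/* Hypothetical helper: allocate a float buffer on the CUDA device
 * backing a given StarPU worker. */
static float *allocate_on_worker(int workerid, size_t nx)
{
	float *buffer;
	int devid = starpu_worker_get_devid(workerid);

	/* cudaSetDevice() or cudaGLSetGLDevice(), as configured in starpu_conf */
	starpu_cuda_set_device(devid);
	cudaMalloc((void **)&buffer, nx * sizeof(*buffer));
	return buffer;
}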

+ 15 - 0
examples/Makefile.am

@@ -826,6 +826,21 @@ pi_pi_redux_LDADD =				\
 	$(STARPU_CURAND_LDFLAGS)
 endif
 
+###########################
+# OpenGL interoperability #
+###########################
+
+if HAVE_OPENGL
+examplebin_PROGRAMS +=				\
+	gl_interop/gl_interop
+
+gl_interop_gl_interop_SOURCES =			\
+	gl_interop/gl_interop.c
+
+gl_interop_gl_interop_LDADD =			\
+	$(STARPU_OPENGL_RENDER_LDFLAGS)
+endif
+
 showcheck:
 	-cat $(TEST_LOGS) /dev/null
 	for i in $(SUBDIRS) ; do \

+ 130 - 0
examples/gl_interop/gl_interop.c

@@ -0,0 +1,130 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example demonstrates how to use StarPU combined with OpenGL rendering,
+ * which needs:
+ *
+ * - initializing GLUT first,
+ * - enabling it at initialization,
+ * - running the corresponding CUDA worker in the GLUT thread (here, the main
+ *   thread).
+ */
+
+#include <starpu.h>
+#include <unistd.h>
+#include <GL/freeglut.h> /* glutMainLoopEvent() used below is a freeglut extension */
+
+void dummy(void *buffers[], void *cl_arg)
+{
+	float *v = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
+
+	printf("Codelet running\n");
+	cudaMemset(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float));
+	printf("Codelet done\n");
+}
+
+struct starpu_codelet cl = {
+	.where = STARPU_CUDA,
+	.cuda_funcs = { dummy, NULL },
+	.nbuffers = 1,
+	.modes = { STARPU_W },
+};
+
+void foo(void) {
+}
+
+void display(float i) {
+	glClear(GL_COLOR_BUFFER_BIT);
+	glColor3f(1, 1, 1);
+	glBegin(GL_LINES);
+	glVertex2f(-i, -i);
+	glVertex2f(i, i);
+	glEnd();
+	glFinish();
+	glutPostRedisplay();
+	glutMainLoopEvent();
+}
+
+void callback_func(void *foo) {
+	printf("Callback running, rendering\n");
+	float i = 1.;
+	while (i > 0) {
+		usleep(100000);
+		display(i);
+		i -= 0.1;
+	}
+	printf("rendering done\n");
+
+	/* Tell StarPU that this was the last submitted task */
+	starpu_set_end_of_submissions();
+}
+
+int main(int argc, char **argv)
+{
+#if !(defined(STARPU_USE_CUDA) && defined(STARPU_OPENGL_RENDER))
+	return 77;
+#else
+	struct starpu_conf conf;
+	int cuda_device = 0;
+	int cuda_devices[] = { cuda_device };
+	struct starpu_driver drivers[] = {
+		{ .type = STARPU_CUDA_WORKER, .id.cuda_id = cuda_device }
+	};
+	int ret;
+	struct starpu_task *task;
+	starpu_data_handle_t handle;
+
+	glutInit(&argc, argv);
+	glutInitDisplayMode (GLUT_SINGLE | GLUT_RGB);
+	glutInitWindowPosition(0, 0);
+	glutInitWindowSize(300,200);
+	glutCreateWindow("StarPU OpenGL interoperability test");
+	glClearColor (0.5, 0.5, 0.5, 0.0);
+
+	/* Enable OpenGL interoperability */
+	starpu_conf_init(&conf);
+	conf.ncuda = 1;
+	conf.ncpus = 0;
+	conf.nopencl = 0;
+	conf.cuda_opengl_interoperability = cuda_devices;
+	conf.n_cuda_opengl_interoperability = sizeof(cuda_devices) / sizeof(*cuda_devices);
+	conf.not_launched_drivers = drivers;
+	conf.n_not_launched_drivers = sizeof(drivers) / sizeof(*drivers);
+	ret = starpu_init(&conf);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_vector_data_register(&handle, -1, 0, 10, sizeof(float));
+
+	/* Submit just one dumb task */
+	task = starpu_task_create();
+	task->cl = &cl;
+	task->handles[0] = handle;
+	task->callback_func = callback_func;
+	task->callback_arg = NULL;
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* And run the driver, which will run the task */
+	printf("running the driver\n");
+	starpu_run_driver(&drivers[0]);
+	printf("finished running the driver\n");
+
+	starpu_shutdown();
+
+	return 0;
+#endif
+}

+ 4 - 0
include/starpu.h

@@ -121,6 +121,10 @@ struct starpu_conf
         /* indicate if the asynchronous copies should be disabled */
 	int disable_asynchronous_copy;
 
+	/* Enable CUDA/OpenGL interoperation on these CUDA devices */
+	int *cuda_opengl_interoperability;
+	unsigned n_cuda_opengl_interoperability;
+
 	/* A driver that the application will run in one of its own threads. */
 	struct starpu_driver *not_launched_drivers;
 	unsigned n_not_launched_drivers;

+ 3 - 1
include/starpu_cuda.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -46,6 +46,8 @@ const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid
 
 int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind);
 
+void starpu_cuda_set_device(int devid);
+
 #ifdef __cplusplus
 }
 #endif

+ 7 - 7
src/core/perfmodel/perfmodel_bus.c

@@ -92,8 +92,8 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	_starpu_bind_thread_on_cpu(config, cpu);
 	size_t size = SIZE;
 
-	/* Initiliaze CUDA context on the device */
-	cudaSetDevice(dev);
+	/* Initialize CUDA context on the device */
+	starpu_cuda_set_device(dev);
 
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
@@ -185,8 +185,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
         if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
 
-	/* Initiliaze CUDA context on the source */
-	cudaSetDevice(src);
+	/* Initialize CUDA context on the source */
+	starpu_cuda_set_device(src);
 
 	/* Allocate a buffer on the device */
 	unsigned char *s_buffer;
@@ -194,8 +194,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	STARPU_ASSERT(s_buffer);
 	cudaMemset(s_buffer, 0, size);
 
-	/* Initiliaze CUDA context on the destination */
-	cudaSetDevice(dst);
+	/* Initialize CUDA context on the destination */
+	starpu_cuda_set_device(dst);
 
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
@@ -222,7 +222,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
 	/* Free buffers */
 	cudaFree(d_buffer);
-	cudaSetDevice(src);
+	starpu_cuda_set_device(src);
 	cudaFree(s_buffer);
 
 	cudaThreadExit();

+ 27 - 1
src/core/task.c

@@ -596,10 +596,15 @@ int starpu_task_wait_for_no_ready(void)
 
 void _starpu_decrement_nsubmitted_tasks(void)
 {
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
 
-	if (--nsubmitted == 0)
+	if (--nsubmitted == 0) {
+		if (!config->submitting)
+			config->running = 0;
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
+	}
 
 	_STARPU_TRACE_UPDATE_TASK_CNT(nsubmitted);
 
@@ -607,6 +612,27 @@ void _starpu_decrement_nsubmitted_tasks(void)
 
 }
 
+void
+starpu_set_end_of_submissions(void)
+{
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+
+	config->submitting = 0;
+	if (nsubmitted == 0) {
+		config->running = 0;
+		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
+	}
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+}
+
+void _starpu_check_nsubmitted_tasks(void)
+{
+
+}
+
 static void _starpu_increment_nsubmitted_tasks(void)
 {
 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
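
A sketch of how the revised termination protocol is meant to be used when the application runs a driver itself (based on this hunk and on the gl_interop example above; last_callback is a hypothetical name):

/* Callback of the last task the application will ever submit. */
static void last_callback(void *arg)
{
	/* Mark the end of submissions: once the remaining tasks complete,
	 * _starpu_decrement_nsubmitted_tasks() clears config->running,
	 * which makes starpu_run_driver() return in the application thread. */
	starpu_set_end_of_submissions();
}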

+ 2 - 9
src/core/workers.c

@@ -249,6 +249,7 @@ static unsigned _starpu_may_launch_driver(struct starpu_conf *conf,
 static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 {
 	config->running = 1;
+	config->submitting = 1;
 
 	pthread_key_create(&worker_key, NULL);
 
@@ -424,6 +425,7 @@ int starpu_conf_init(struct starpu_conf *conf)
 	if (!conf)
 		return -EINVAL;
 
+	memset(conf, 0, sizeof(*conf));
 	conf->magic = 42;
 	conf->sched_policy_name = getenv("STARPU_SCHED");
 	conf->sched_policy = NULL;
@@ -968,15 +970,6 @@ void starpu_worker_set_sched_condition(int workerid, pthread_cond_t *sched_cond,
 	config.workers[workerid].sched_mutex = sched_mutex;
 }
 
-void
-starpu_set_end_of_submissions(void)
-{
-	struct _starpu_machine_config *config;
-	config = _starpu_get_machine_config();
-	starpu_task_wait_for_all();
-	config->running = 0;
-}
-
 #ifdef STARPU_USE_CUDA
 extern int _starpu_run_cuda(struct starpu_driver *);
 #endif

+ 3 - 0
src/core/workers.h

@@ -160,6 +160,9 @@ struct _starpu_machine_config
 
 	/* this flag is set until the runtime is stopped */
 	unsigned running;
+
+	/* this flag is set until the application is finished submitting tasks */
+	unsigned submitting;
 };
 
 /* Has starpu_shutdown already been called ? */

+ 1 - 2
src/datawizard/copy_driver.c

@@ -117,8 +117,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
 	{
 		int node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
-		cures = cudaSetDevice(_starpu_memory_node_to_devid(node));
-		STARPU_ASSERT(cures == cudaSuccess);
+		starpu_cuda_set_device(_starpu_memory_node_to_devid(node));
 	}
 #endif
 

+ 2 - 4
src/datawizard/interfaces/matrix_interface.c

@@ -456,16 +456,14 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 	/* That code is not even working!! */
 	struct cudaExtent extent = make_cudaExtent(128, 128, 128);
 
-	cures = cudaSetDevice(src_dev);
-	STARPU_ASSERT(cures == cudaSuccess);
+	starpu_cuda_set_device(src_dev);
 
 	struct cudaPitchedPtr mem_device1;
 	cures = cudaMalloc3D(&mem_device1, extent);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaSetDevice(dst_dev);
-	STARPU_ASSERT(cures == cudaSuccess);
+	starpu_cuda_set_device(dst_dev);
 
 	struct cudaPitchedPtr mem_device2;
 	cures = cudaMalloc3D(&mem_device2, extent);

+ 2 - 4
src/datawizard/memalloc.c

@@ -249,8 +249,7 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, uint32_t node)
 			 * proper CUDA device in case it is needed. This avoids
 			 * having to set it again in the free method of each
 			 * interface. */
-			cudaError_t err = cudaSetDevice(_starpu_memory_node_to_devid(node));
-			STARPU_ASSERT(err == cudaSuccess);
+			starpu_cuda_set_device(_starpu_memory_node_to_devid(node));
 		}
 #endif
 
@@ -792,8 +791,7 @@ static ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _s
 			 * proper CUDA device in case it is needed. This avoids
 			 * having to set it again in the malloc method of each
 			 * interface. */
-			cudaError_t err = cudaSetDevice(_starpu_memory_node_to_devid(dst_node));
-			STARPU_ASSERT(err == cudaSuccess);
+			starpu_cuda_set_device(_starpu_memory_node_to_devid(dst_node));
 		}
 #endif
 

+ 28 - 5
src/drivers/cuda/driver_cuda.c

@@ -26,6 +26,7 @@
 #include <drivers/driver_common/driver_common.h>
 #include "driver_cuda.h"
 #include <core/sched_policy.h>
+#include <cuda_gl_interop.h>
 
 /* the number of CUDA devices */
 static int ncudagpus;
@@ -108,14 +109,38 @@ const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid
 	return &props[devid];
 }
 
-static void init_context(int devid)
+void starpu_cuda_set_device(int devid)
 {
 	cudaError_t cures;
-	int workerid = starpu_worker_get_id();
+	struct starpu_conf *conf = _starpu_get_machine_config()->conf;
+	unsigned i;
+
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	if (conf->n_cuda_opengl_interoperability) {
+		fprintf(stderr, "OpenGL interoperability was requested, but StarPU was built with multithread GPU control support, please reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
+		STARPU_ASSERT(0);
+	}
+#else
+	for (i = 0; i < conf->n_cuda_opengl_interoperability; i++)
+		if (conf->cuda_opengl_interoperability[i] == devid) {
+			cures = cudaGLSetGLDevice(devid);
+			goto done;
+		}
+#endif
 
 	cures = cudaSetDevice(devid);
+
+done:
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
+}
+
+static void init_context(int devid)
+{
+	cudaError_t cures;
+	int workerid = starpu_worker_get_id();
+
+	starpu_cuda_set_device(devid);
 
 	/* force CUDA to initialize the context for real */
 	cures = cudaFree(0);
@@ -231,9 +256,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	/* We make sure we do manipulate the proper device */
-	cures = cudaSetDevice(args->devid);
-	if (STARPU_UNLIKELY(cures != cudaSuccess))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	starpu_cuda_set_device(args->devid);
 #endif
 
 	starpu_cuda_func_t func = _starpu_task_get_cuda_nth_implementation(cl, j->nimpl);

+ 3 - 3
tests/datawizard/gpu_register.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011-2012  Université de Bordeaux 1
  * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -95,7 +95,7 @@ test_cuda(void)
 	size = 10 * n;
 
 	devid = starpu_worker_get_devid(chosen);
-	cudaSetDevice(devid);
+	starpu_cuda_set_device(devid);
 	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
 
 	foo = calloc(size, sizeof(*foo));
@@ -133,7 +133,7 @@ test_cuda(void)
 	starpu_data_unpartition(handle, starpu_worker_get_memory_node(chosen));
 	starpu_data_unregister(handle);
 
-	cudaSetDevice(devid);
+	starpu_cuda_set_device(devid);
 	cures = cudaMemcpy(foo, foo_gpu, size * sizeof(*foo_gpu), cudaMemcpyDeviceToHost);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 1 - 1
tests/experiments/latency/cuda_latency.c

@@ -113,7 +113,7 @@ void *launch_gpu_thread(void *arg)
 	unsigned *idptr = arg;
 	unsigned id = *idptr;
 
-	cudaSetDevice(id);
+	starpu_cuda_set_device(id);
 	cudaFree(0);
 
 	cudaMalloc(&gpu_buffer[id], buffer_size);