
Add OpenGL interoperability support

Samuel Thibault, 13 years ago
parent commit acce0b71ce

+ 1 - 0
configure.ac

@@ -1312,6 +1312,7 @@ fi
 AC_MSG_CHECKING(whether OpenGL rendering is enabled)
 AC_SUBST(STARPU_OPENGL_RENDER, $enable_opengl_render)
 AC_MSG_RESULT($enable_opengl_render)
+AM_CONDITIONAL([HAVE_OPENGL], [test "x$enable_opengl_render" = xyes])
 
 AC_PATH_XTRA
 if test "x$no_x" != "xyes"; then

+ 18 - 10
doc/chapters/advanced-examples.texi

@@ -913,8 +913,22 @@ Graphical-oriented applications need to draw the result of their computations,
 typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
 interoperability permit to let CUDA directly work on the OpenGL buffers, making
 them thus immediately ready for drawing, by mapping OpenGL buffer, textures or
-renderbuffer objects into CUDA. To achieve this with StarPU, it simply needs to
-be given the CUDA pointer at registration, for instance:
+renderbuffer objects into CUDA.  CUDA however imposes some technical
+constraints: peer memcpy has to be disabled, and the thread that runs OpenGL
+has to be the one that runs the CUDA computations for that GPU.
+
+To achieve this with StarPU, pass the @code{--disable-cuda-memcpy-peer} option
+to @code{./configure} (TODO: make it dynamic), enable the interoperability mode
+through the @code{cuda_opengl_interoperability} field of the @code{starpu_conf}
+structure, and run the driver loop from the application itself: use the
+@code{not_launched_drivers} field of @code{starpu_conf} to prevent StarPU from
+running the driver in a separate thread, and call @code{starpu_run_driver} to
+run the loop. The @code{gl_interop} example shows how these pieces fit together
+in a simple case where rendering is done in task callbacks. TODO: provide a
+glutIdleFunc alternative.
+
+Then, to use an OpenGL buffer as CUDA data, StarPU simply needs to be given
+the CUDA pointer at registration, for instance:
 
 @cartouche
 @smallexample
@@ -922,21 +936,15 @@ for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
         if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
                 break;
 
-cudaSetDevice(starpu_worker_get_devid(workerid));
 cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
 starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid), output, num_bytes / sizeof(float4), sizeof(float4));
 
 starpu_insert_task(&cl, STARPU_RW, handle, 0);
-
-starpu_data_unregister(handle);
-
-cudaSetDevice(starpu_worker_get_devid(workerid));
-cudaGraphicsUnmapResources(1, &resource, 0);
-
-/* Now display it */
 @end smallexample
 @end cartouche
 
+and the application can then display the result, for instance in the task callback function.
+
 @node More examples
 @section More examples
 
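
For completeness: the registration example above starts from an already-mapped resource. The calls that produce it are standard CUDA/OpenGL interoperability API, not part of this commit; below is a minimal sketch, assuming vbo is an OpenGL buffer object created by the application and memory_node is the CUDA worker's memory node obtained as in the example.

#include <cuda_gl_interop.h>
#include <starpu.h>
#include <stdint.h>

/* Sketch only: map a GL buffer object into CUDA and hand the resulting device
 * pointer to StarPU, as in the documentation example above.  The resource must
 * be kept around so that it can later be unmapped with
 * cudaGraphicsUnmapResources() once the handle has been unregistered. */
static starpu_data_handle_t register_gl_buffer(GLuint vbo, unsigned memory_node,
                                               struct cudaGraphicsResource **resource)
{
	float4 *output;
	size_t num_bytes;
	starpu_data_handle_t handle;

	/* Make the GL buffer object visible to CUDA (to be done once). */
	cudaGraphicsGLRegisterBuffer(resource, vbo, cudaGraphicsRegisterFlagsNone);

	/* Map it and retrieve a CUDA device pointer to its contents. */
	cudaGraphicsMapResources(1, resource, 0);
	cudaGraphicsResourceGetMappedPointer((void **)&output, &num_bytes, *resource);

	/* Register the CUDA pointer with StarPU, as shown above. */
	starpu_vector_data_register(&handle, memory_node, (uintptr_t) output,
	                            num_bytes / sizeof(float4), sizeof(float4));
	return handle;
}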

+ 15 - 0
doc/chapters/basic-api.texi

@@ -141,6 +141,15 @@ The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 
+@item @code{int *cuda_opengl_interoperability} (default = NULL)
+This can be set to an array of CUDA device identifiers for which
+@code{cudaGLSetGLDevice} should be called instead of @code{cudaSetDevice}. Its
+size is specified by the @code{n_cuda_opengl_interoperability} field below.
+
+@item @code{unsigned n_cuda_opengl_interoperability} (default = 0)
+This has to be set to the size of the array pointed to by the
+@code{cuda_opengl_interoperability} field.
+
 @item @code{struct starpu_driver *not_launched_drivers}
 The drivers that should not be launched by StarPU.
 
@@ -2158,6 +2167,12 @@ successfull. It returns 0 if the synchronous copy was successful, or
 fails otherwise.
 @end deftypefun
 
+@deftypefun void starpu_cuda_set_device (int @var{devid})
+Calls @code{cudaGLSetGLDevice(devid)} if @code{devid} appears in the
+@code{cuda_opengl_interoperability} field of the @code{starpu_conf} structure,
+and @code{cudaSetDevice(devid)} otherwise.
+@end deftypefun
+
 @deftypefun void starpu_helper_cublas_init (void)
 This function initializes CUBLAS on every CUDA device.
 The CUBLAS library must be initialized prior to any CUBLAS call. Calling

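A condensed sketch of how the two new starpu_conf fields and starpu_cuda_set_device fit together on the application side, distilled from the gl_interop example and the gpu_register test elsewhere in this commit; it assumes StarPU is built with CUDA support and --disable-cuda-memcpy-peer, and omits most error checking.

#include <starpu.h>
#include <starpu_cuda.h>

int main(void)
{
	struct starpu_conf conf;
	/* CUDA devices that share a context with OpenGL */
	int cuda_devices[] = { 0 };
	int workerid, devid;

	starpu_conf_init(&conf);
	conf.cuda_opengl_interoperability = cuda_devices;
	conf.n_cuda_opengl_interoperability = sizeof(cuda_devices) / sizeof(*cuda_devices);
	if (starpu_init(&conf) != 0)
		return 77;

	/* Whenever the application issues raw CUDA calls itself, it selects the
	 * device through StarPU, so that cudaGLSetGLDevice() is used for the
	 * devices listed above and cudaSetDevice() for the others. */
	for (workerid = 0; workerid < (int) starpu_worker_get_count(); workerid++)
		if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
			break;
	devid = starpu_worker_get_devid(workerid);
	starpu_cuda_set_device(devid);

	/* ... raw cudaMalloc()/cudaMemcpy() calls, task submission, etc. ... */

	starpu_shutdown();
	return 0;
}
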
+ 15 - 0
examples/Makefile.am

@@ -826,6 +826,21 @@ pi_pi_redux_LDADD =				\
 	$(STARPU_CURAND_LDFLAGS)
 endif
 
+###########################
+# OpenGL interoperability #
+###########################
+
+if HAVE_OPENGL
+examplebin_PROGRAMS +=				\
+	gl_interop/gl_interop
+
+gl_interop_gl_interop_SOURCES =			\
+	gl_interop/gl_interop.c
+
+gl_interop_gl_interop_LDADD =			\
+	$(STARPU_OPENGL_RENDER_LDFLAGS)
+endif
+
 showcheck:
 	-cat $(TEST_LOGS) /dev/null
 	for i in $(SUBDIRS) ; do \

+ 130 - 0
examples/gl_interop/gl_interop.c

@@ -0,0 +1,130 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example demonstrates how to use StarPU combined with OpenGL rendering,
+ * which needs:
+ *
+ * - initializing GLUT first,
+ * - enabling CUDA/OpenGL interoperability at StarPU initialization,
+ * - running the corresponding CUDA worker in the GLUT thread (here, the main
+ *   thread).
+ */
+
+#include <starpu.h>
+#include <unistd.h>
+#include <GL/glut.h>
+
+void dummy(void *buffers[], void *cl_arg)
+{
+	float *v = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
+
+	printf("Codelet running\n");
+	cudaMemset(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float));
+	printf("Codelet done\n");
+}
+
+struct starpu_codelet cl = {
+	.where = STARPU_CUDA,
+	.cuda_funcs = { dummy, NULL },
+	.nbuffers = 1,
+	.modes = { STARPU_W },
+};
+
+void foo(void) {
+}
+
+void display(float i) {
+	glClear(GL_COLOR_BUFFER_BIT);
+	glColor3f(1, 1, 1);
+	glBegin(GL_LINES);
+	glVertex2f(-i, -i);
+	glVertex2f(i, i);
+	glEnd();
+	glFinish();
+	glutPostRedisplay();
+	glutMainLoopEvent();
+}
+
+void callback_func(void *foo) {
+	printf("Callback running, rendering\n");
+	float i = 1.;
+	while (i > 0) {
+		usleep(100000);
+		display(i);
+		i -= 0.1;
+	}
+	printf("rendering done\n");
+
+	/* Tell StarPU that this was the last submitted task */
+	starpu_set_end_of_submissions();
+}
+
+int main(int argc, char **argv)
+{
+#if !(defined(STARPU_USE_CUDA) && defined(STARPU_OPENGL_RENDER))
+	return 77;
+#else
+	struct starpu_conf conf;
+	int cuda_device = 0;
+	int cuda_devices[] = { cuda_device };
+	struct starpu_driver drivers[] = {
+		{ .type = STARPU_CUDA_WORKER, .id.cuda_id = cuda_device }
+	};
+	int ret;
+	struct starpu_task *task;
+	starpu_data_handle_t handle;
+
+	glutInit(&argc, argv);
+	glutInitDisplayMode (GLUT_SINGLE | GLUT_RGB);
+	glutInitWindowPosition(0, 0);
+	glutInitWindowSize(300,200);
+	glutCreateWindow("StarPU OpenGL interoperability test");
+	glClearColor (0.5, 0.5, 0.5, 0.0);
+
+	/* Enable OpenGL interoperability */
+	starpu_conf_init(&conf);
+	conf.ncuda = 1;
+	conf.ncpus = 0;
+	conf.nopencl = 0;
+	conf.cuda_opengl_interoperability = cuda_devices;
+	conf.n_cuda_opengl_interoperability = sizeof(cuda_devices) / sizeof(*cuda_devices);
+	conf.not_launched_drivers = drivers;
+	conf.n_not_launched_drivers = sizeof(drivers) / sizeof(*drivers);
+	ret = starpu_init(&conf);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_vector_data_register(&handle, -1, 0, 10, sizeof(float));
+
+	/* Submit just one dumb task */
+	task = starpu_task_create();
+	task->cl = &cl;
+	task->handles[0] = handle;
+	task->callback_func = callback_func;
+	task->callback_arg = NULL;
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* And run the driver, which will run the task */
+	printf("running the driver\n");
+	starpu_run_driver(&drivers[0]);
+	printf("finished running the driver\n");
+
+	starpu_shutdown();
+
+	return 0;
+#endif
+}

+ 4 - 0
include/starpu.h

@@ -121,6 +121,10 @@ struct starpu_conf
         /* indicate if the asynchronous copies should be disabled */
 	int disable_asynchronous_copy;
 
+	/* Enable CUDA/OpenGL interoperation on these CUDA devices */
+	int *cuda_opengl_interoperability;
+	unsigned n_cuda_opengl_interoperability;
+
 	/* A driver that the application will run in one of its own threads. */
 	struct starpu_driver *not_launched_drivers;
 	unsigned n_not_launched_drivers;

+ 3 - 1
include/starpu_cuda.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -46,6 +46,8 @@ const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid
 
 int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind);
 
+void starpu_cuda_set_device(int devid);
+
 #ifdef __cplusplus
 }
 #endif

+ 7 - 7
src/core/perfmodel/perfmodel_bus.c

@@ -92,8 +92,8 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	_starpu_bind_thread_on_cpu(config, cpu);
 	size_t size = SIZE;
 
-	/* Initiliaze CUDA context on the device */
-	cudaSetDevice(dev);
+	/* Initialize CUDA context on the device */
+	starpu_cuda_set_device(dev);
 
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
@@ -185,8 +185,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
         if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
 
-	/* Initiliaze CUDA context on the source */
-	cudaSetDevice(src);
+	/* Initialize CUDA context on the source */
+	starpu_cuda_set_device(src);
 
 	/* Allocate a buffer on the device */
 	unsigned char *s_buffer;
@@ -194,8 +194,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	STARPU_ASSERT(s_buffer);
 	cudaMemset(s_buffer, 0, size);
 
-	/* Initiliaze CUDA context on the destination */
-	cudaSetDevice(dst);
+	/* Initialize CUDA context on the destination */
+	starpu_cuda_set_device(dst);
 
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
@@ -222,7 +222,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
 	/* Free buffers */
 	cudaFree(d_buffer);
-	cudaSetDevice(src);
+	starpu_cuda_set_device(src);
 	cudaFree(s_buffer);
 
 	cudaThreadExit();

+ 27 - 1
src/core/task.c

@@ -596,10 +596,15 @@ int starpu_task_wait_for_no_ready(void)
 
 void _starpu_decrement_nsubmitted_tasks(void)
 {
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
 
-	if (--nsubmitted == 0)
+	if (--nsubmitted == 0) {
+		if (!config->submitting)
+			config->running = 0;
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
+	}
 
 	_STARPU_TRACE_UPDATE_TASK_CNT(nsubmitted);
 
@@ -607,6 +612,27 @@ void _starpu_decrement_nsubmitted_tasks(void)
 
 }
 
+void
+starpu_set_end_of_submissions(void)
+{
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+
+	config->submitting = 0;
+	if (nsubmitted == 0) {
+		config->running = 0;
+		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
+	}
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+}
+
+void _starpu_check_nsubmitted_tasks(void)
+{
+
+}
+
 static void _starpu_increment_nsubmitted_tasks(void)
 {
 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);

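A condensed sketch of the contract implemented by the submitting/running flags above, as exercised by the gl_interop example: starpu_run_driver only returns once the application has called starpu_set_end_of_submissions and all submitted tasks have completed. The codelet cl is assumed to be defined as in that example.

#include <starpu.h>

/* Sketch only: `cl' is assumed to be a codelet set up as in the gl_interop
 * example added by this commit. */
extern struct starpu_codelet cl;

static void last_callback(void *arg)
{
	(void) arg;
	/* ... final rendering ... */

	/* Declare that no further tasks will be submitted; once the last
	 * submitted task has completed, the driver loop below returns. */
	starpu_set_end_of_submissions();
}

static void submit_and_run(struct starpu_driver *driver, starpu_data_handle_t handle)
{
	struct starpu_task *task = starpu_task_create();
	task->cl = &cl;
	task->handles[0] = handle;
	task->callback_func = last_callback;
	starpu_task_submit(task);

	/* Runs the CUDA driver in the calling thread; returns once the callback
	 * above has marked the end of submissions and all tasks are done. */
	starpu_run_driver(driver);
}
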
+ 2 - 9
src/core/workers.c

@@ -249,6 +249,7 @@ static unsigned _starpu_may_launch_driver(struct starpu_conf *conf,
 static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 {
 	config->running = 1;
+	config->submitting = 1;
 
 	pthread_key_create(&worker_key, NULL);
 
@@ -424,6 +425,7 @@ int starpu_conf_init(struct starpu_conf *conf)
 	if (!conf)
 		return -EINVAL;
 
+	memset(conf, 0, sizeof(*conf));
 	conf->magic = 42;
 	conf->sched_policy_name = getenv("STARPU_SCHED");
 	conf->sched_policy = NULL;
@@ -968,15 +970,6 @@ void starpu_worker_set_sched_condition(int workerid, pthread_cond_t *sched_cond,
 	config.workers[workerid].sched_mutex = sched_mutex;
 }
 
-void
-starpu_set_end_of_submissions(void)
-{
-	struct _starpu_machine_config *config;
-	config = _starpu_get_machine_config();
-	starpu_task_wait_for_all();
-	config->running = 0;
-}
-
 #ifdef STARPU_USE_CUDA
 extern int _starpu_run_cuda(struct starpu_driver *);
 #endif

+ 3 - 0
src/core/workers.h

@@ -160,6 +160,9 @@ struct _starpu_machine_config
 
 	/* this flag is set until the runtime is stopped */
 	unsigned running;
+
+	/* this flag is set until the application is finished submitting tasks */
+	unsigned submitting;
 };
 
 /* Has starpu_shutdown already been called ? */

+ 1 - 2
src/datawizard/copy_driver.c

@@ -117,8 +117,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
 	{
 		int node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
-		cures = cudaSetDevice(_starpu_memory_node_to_devid(node));
-		STARPU_ASSERT(cures == cudaSuccess);
+		starpu_cuda_set_device(_starpu_memory_node_to_devid(node));
 	}
 #endif
 

+ 2 - 4
src/datawizard/interfaces/matrix_interface.c

@@ -456,16 +456,14 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 	/* That code is not even working!! */
 	struct cudaExtent extent = make_cudaExtent(128, 128, 128);
 
-	cures = cudaSetDevice(src_dev);
-	STARPU_ASSERT(cures == cudaSuccess);
+	starpu_cuda_set_device(src_dev);
 
 	struct cudaPitchedPtr mem_device1;
 	cures = cudaMalloc3D(&mem_device1, extent);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaSetDevice(dst_dev);
-	STARPU_ASSERT(cures == cudaSuccess);
+	starpu_cuda_set_device(dst_dev);
 
 	struct cudaPitchedPtr mem_device2;
 	cures = cudaMalloc3D(&mem_device2, extent);

+ 2 - 4
src/datawizard/memalloc.c

@@ -249,8 +249,7 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, uint32_t node)
 			 * proper CUDA device in case it is needed. This avoids
 			 * having to set it again in the free method of each
 			 * interface. */
-			cudaError_t err = cudaSetDevice(_starpu_memory_node_to_devid(node));
-			STARPU_ASSERT(err == cudaSuccess);
+			starpu_cuda_set_device(_starpu_memory_node_to_devid(node));
 		}
 #endif
 
@@ -792,8 +791,7 @@ static ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _s
 			 * proper CUDA device in case it is needed. This avoids
 			 * having to set it again in the malloc method of each
 			 * interface. */
-			cudaError_t err = cudaSetDevice(_starpu_memory_node_to_devid(dst_node));
-			STARPU_ASSERT(err == cudaSuccess);
+			starpu_cuda_set_device(_starpu_memory_node_to_devid(dst_node));
 		}
 #endif
 

+ 28 - 5
src/drivers/cuda/driver_cuda.c

@@ -26,6 +26,7 @@
 #include <drivers/driver_common/driver_common.h>
 #include "driver_cuda.h"
 #include <core/sched_policy.h>
+#include <cuda_gl_interop.h>
 
 /* the number of CUDA devices */
 static int ncudagpus;
@@ -108,14 +109,38 @@ const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid
 	return &props[devid];
 }
 
-static void init_context(int devid)
+void starpu_cuda_set_device(int devid)
 {
 	cudaError_t cures;
-	int workerid = starpu_worker_get_id();
+	struct starpu_conf *conf = _starpu_get_machine_config()->conf;
+	unsigned i;
+
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	if (conf->n_cuda_opengl_interoperability) {
+		fprintf(stderr, "OpenGL interoperability was requested, but StarPU was built with multithreaded GPU control support; please reconfigure with --disable-cuda-memcpy-peer (note that this will disable the memcpy-peer optimizations)\n");
+		STARPU_ASSERT(0);
+	}
+#else
+	for (i = 0; i < conf->n_cuda_opengl_interoperability; i++)
+		if (conf->cuda_opengl_interoperability[i] == devid) {
+			cures = cudaGLSetGLDevice(devid);
+			goto done;
+		}
+#endif
 
 	cures = cudaSetDevice(devid);
+
+done:
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
+}
+
+static void init_context(int devid)
+{
+	cudaError_t cures;
+	int workerid = starpu_worker_get_id();
+
+	starpu_cuda_set_device(devid);
 
 	/* force CUDA to initialize the context for real */
 	cures = cudaFree(0);
@@ -231,9 +256,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	/* We make sure we do manipulate the proper device */
-	cures = cudaSetDevice(args->devid);
-	if (STARPU_UNLIKELY(cures != cudaSuccess))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	starpu_cuda_set_device(args->devid);
 #endif
 
 	starpu_cuda_func_t func = _starpu_task_get_cuda_nth_implementation(cl, j->nimpl);

+ 3 - 3
tests/datawizard/gpu_register.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011-2012  Université de Bordeaux 1
  * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -95,7 +95,7 @@ test_cuda(void)
 	size = 10 * n;
 
 	devid = starpu_worker_get_devid(chosen);
-	cudaSetDevice(devid);
+	starpu_cuda_set_device(devid);
 	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
 
 	foo = calloc(size, sizeof(*foo));
@@ -133,7 +133,7 @@ test_cuda(void)
 	starpu_data_unpartition(handle, starpu_worker_get_memory_node(chosen));
 	starpu_data_unregister(handle);
 
-	cudaSetDevice(devid);
+	starpu_cuda_set_device(devid);
 	cures = cudaMemcpy(foo, foo_gpu, size * sizeof(*foo_gpu), cudaMemcpyDeviceToHost);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 1 - 1
tests/experiments/latency/cuda_latency.c

@@ -113,7 +113,7 @@ void *launch_gpu_thread(void *arg)
 	unsigned *idptr = arg;
 	unsigned id = *idptr;
 
-	cudaSetDevice(id);
+	starpu_cuda_set_device(id);
 	cudaFree(0);
 
 	cudaMalloc(&gpu_buffer[id], buffer_size);