
Add OpenGL interoperability support

Samuel Thibault 13 years ago
parent commit acce0b71ce

+ 1 - 0
configure.ac

@@ -1312,6 +1312,7 @@ fi
 AC_MSG_CHECKING(whether OpenGL rendering is enabled)
 AC_SUBST(STARPU_OPENGL_RENDER, $enable_opengl_render)
 AC_MSG_RESULT($enable_opengl_render)
+AM_CONDITIONAL([HAVE_OPENGL], [test "x$enable_opengl_render" = xyes])
 
 AC_PATH_XTRA
 if test "x$no_x" != "xyes"; then

+ 18 - 10
doc/chapters/advanced-examples.texi

@@ -913,8 +913,22 @@ Graphical-oriented applications need to draw the result of their computations,
 typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
 interoperability permit to let CUDA directly work on the OpenGL buffers, making
 them thus immediately ready for drawing, by mapping OpenGL buffer, textures or
-renderbuffer objects into CUDA. To achieve this with StarPU, it simply needs to
-be given the CUDA pointer at registration, for instance:
+renderbuffer objects into CUDA.  CUDA however imposes some technical
+constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
+to be the one that runs CUDA computations for that GPU.
+
+To achieve this with StarPU, pass the @code{--disable-cuda-memcpy-peer} option
+to @code{./configure} (TODO: make it dynamic), enable the interoperability mode
+through the @code{cuda_opengl_interoperability} field of the @code{starpu_conf}
+structure, and run the driver loop from the application itself: use the
+@code{not_launched_drivers} field of @code{starpu_conf} to prevent StarPU from
+running it in a separate thread, and call @code{starpu_run_driver} to run the
+loop. The @code{gl_interop} example shows how these pieces fit together in a
+simple case, where rendering is done in task callbacks. TODO: provide a
+glutIdleFunc alternative.
+
+Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be given
+the CUDA pointer at registration, for instance:
 
 @cartouche
 @smallexample
@@ -922,21 +936,15 @@ for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
         if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
                 break;
 
-cudaSetDevice(starpu_worker_get_devid(workerid));
 cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
 starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid), output, num_bytes / sizeof(float4), sizeof(float4));
 
 starpu_insert_task(&cl, STARPU_RW, handle, 0);
-
-starpu_data_unregister(handle);
-
-cudaSetDevice(starpu_worker_get_devid(workerid));
-cudaGraphicsUnmapResources(1, &resource, 0);
-
-/* Now display it */
 @end smallexample
 @end cartouche
 
+and display it, for instance in the task callback function.
+
 @node More examples
 @section More examples
 

+ 15 - 0
doc/chapters/basic-api.texi

@@ -141,6 +141,15 @@ The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 
+@item @code{int *cuda_opengl_interoperability} (default = NULL)
+This can be set to an array of CUDA device identifiers for which
+@code{cudaGLSetGLDevice} should be called instead of @code{cudaSetDevice}. Its
+size is specified by the @code{n_cuda_opengl_interoperability} field below.
+
+@item @code{unsigned n_cuda_opengl_interoperability} (default = 0)
+This has to be set to the size of the array pointed to by the
+@code{cuda_opengl_interoperability} field.
+
 @item @code{struct starpu_driver *not_launched_drivers}
 The drivers that should not be launched by StarPU.
 
@@ -2158,6 +2167,12 @@ successfull. It returns 0 if the synchronous copy was successful, or
 fails otherwise.
 @end deftypefun
 
+@deftypefun void starpu_cuda_set_device (int @var{devid})
+Calls @code{cudaSetDevice(devid)} or @code{cudaGLSetGLDevice(devid)}, according to
+whether @code{devid} appears in the @code{cuda_opengl_interoperability} array of
+the @code{starpu_conf} structure.
+@end deftypefun
+
 @deftypefun void starpu_helper_cublas_init (void)
 This function initializes CUBLAS on every CUDA device.
 The CUBLAS library must be initialized prior to any CUBLAS call. Calling
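
As a usage sketch (the allocate_on_worker helper below is hypothetical; it assumes <starpu_cuda.h> and the CUDA runtime headers are included), application code that manipulates a StarPU-managed CUDA device directly can go through this wrapper so that the interoperability setting is honoured, as the gpu_register.c test below now does:

/* Hypothetical helper: allocate a float buffer on the CUDA device
 * backing a given StarPU worker. */
static float *allocate_on_worker(int workerid, size_t nx)
{
	float *buffer;
	int devid = starpu_worker_get_devid(workerid);

	/* cudaSetDevice() or cudaGLSetGLDevice(), as configured in starpu_conf */
	starpu_cuda_set_device(devid);
	cudaMalloc((void **)&buffer, nx * sizeof(*buffer));
	return buffer;
}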

+ 15 - 0
examples/Makefile.am

@@ -826,6 +826,21 @@ pi_pi_redux_LDADD =				\
 	$(STARPU_CURAND_LDFLAGS)
 endif
 
+###########################
+# OpenGL interoperability #
+###########################
+
+if HAVE_OPENGL
+examplebin_PROGRAMS +=				\
+	gl_interop/gl_interop
+
+gl_interop_gl_interop_SOURCES =			\
+	gl_interop/gl_interop.c
+
+gl_interop_gl_interop_LDADD =			\
+	$(STARPU_OPENGL_RENDER_LDFLAGS)
+endif
+
 showcheck:
 	-cat $(TEST_LOGS) /dev/null
 	for i in $(SUBDIRS) ; do \

+ 130 - 0
examples/gl_interop/gl_interop.c

@@ -0,0 +1,130 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example demonstrates how to use StarPU combined with OpenGL rendering,
+ * which needs:
+ *
+ * - initializing GLUT first,
+ * - enabling it at initialization,
+ * - running the corresponding CUDA worker in the GLUT thread (here, the main
+ *   thread).
+ */
+
+#include <starpu.h>
+#include <unistd.h>
+#include <GL/freeglut.h> /* glutMainLoopEvent() used below is a freeglut extension */
+
+void dummy(void *buffers[], void *cl_arg)
+{
+	float *v = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
+
+	printf("Codelet running\n");
+	cudaMemset(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float));
+	printf("Codelet done\n");
+}
+
+struct starpu_codelet cl = {
+	.where = STARPU_CUDA,
+	.cuda_funcs = { dummy, NULL },
+	.nbuffers = 1,
+	.modes = { STARPU_W },
+};
+
+void foo(void) {
+}
+
+void display(float i) {
+	glClear(GL_COLOR_BUFFER_BIT);
+	glColor3f(1, 1, 1);
+	glBegin(GL_LINES);
+	glVertex2f(-i, -i);
+	glVertex2f(i, i);
+	glEnd();
+	glFinish();
+	glutPostRedisplay();
+	glutMainLoopEvent();
+}
+
+void callback_func(void *foo) {
+	printf("Callback running, rendering\n");
+	float i = 1.;
+	while (i > 0) {
+		usleep(100000);
+		display(i);
+		i -= 0.1;
+	}
+	printf("rendering done\n");
+
+	/* Tell StarPU that this was the last submitted task */
+	starpu_set_end_of_submissions();
+}
+
+int main(int argc, char **argv)
+{
+#if !(defined(STARPU_USE_CUDA) && defined(STARPU_OPENGL_RENDER))
+	return 77;
+#else
+	struct starpu_conf conf;
+	int cuda_device = 0;
+	int cuda_devices[] = { cuda_device };
+	struct starpu_driver drivers[] = {
+		{ .type = STARPU_CUDA_WORKER, .id.cuda_id = cuda_device }
+	};
+	int ret;
+	struct starpu_task *task;
+	starpu_data_handle_t handle;
+
+	glutInit(&argc, argv);
+	glutInitDisplayMode (GLUT_SINGLE | GLUT_RGB);
+	glutInitWindowPosition(0, 0);
+	glutInitWindowSize(300,200);
+	glutCreateWindow("StarPU OpenGL interoperability test");
+	glClearColor (0.5, 0.5, 0.5, 0.0);
+
+	/* Enable OpenGL interoperability */
+	starpu_conf_init(&conf);
+	conf.ncuda = 1;
+	conf.ncpus = 0;
+	conf.nopencl = 0;
+	conf.cuda_opengl_interoperability = cuda_devices;
+	conf.n_cuda_opengl_interoperability = sizeof(cuda_devices) / sizeof(*cuda_devices);
+	conf.not_launched_drivers = drivers;
+	conf.n_not_launched_drivers = sizeof(drivers) / sizeof(*drivers);
+	ret = starpu_init(&conf);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_vector_data_register(&handle, -1, 0, 10, sizeof(float));
+
+	/* Submit just one dumb task */
+	task = starpu_task_create();
+	task->cl = &cl;
+	task->handles[0] = handle;
+	task->callback_func = callback_func;
+	task->callback_arg = NULL;
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* And run the driver, which will run the task */
+	printf("running the driver\n");
+	starpu_run_driver(&drivers[0]);
+	printf("finished running the driver\n");
+
+	starpu_shutdown();
+
+	return 0;
+#endif
+}

+ 4 - 0
include/starpu.h

@@ -121,6 +121,10 @@ struct starpu_conf
         /* indicate if the asynchronous copies should be disabled */
 	int disable_asynchronous_copy;
 
+	/* Enable CUDA/OpenGL interoperation on these CUDA devices */
+	int *cuda_opengl_interoperability;
+	unsigned n_cuda_opengl_interoperability;
+
 	/* A driver that the application will run in one of its own threads. */
 	struct starpu_driver *not_launched_drivers;
 	unsigned n_not_launched_drivers;

+ 3 - 1
include/starpu_cuda.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -46,6 +46,8 @@ const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid
 
 int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind);
 
+void starpu_cuda_set_device(int devid);
+
 #ifdef __cplusplus
 }
 #endif

+ 7 - 7
src/core/perfmodel/perfmodel_bus.c

@@ -92,8 +92,8 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	_starpu_bind_thread_on_cpu(config, cpu);
 	size_t size = SIZE;
 
-	/* Initiliaze CUDA context on the device */
-	cudaSetDevice(dev);
+	/* Initialize CUDA context on the device */
+	starpu_cuda_set_device(dev);
 
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
@@ -185,8 +185,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
         if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
 
-	/* Initiliaze CUDA context on the source */
-	cudaSetDevice(src);
+	/* Initialize CUDA context on the source */
+	starpu_cuda_set_device(src);
 
 	/* Allocate a buffer on the device */
 	unsigned char *s_buffer;
@@ -194,8 +194,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	STARPU_ASSERT(s_buffer);
 	cudaMemset(s_buffer, 0, size);
 
-	/* Initiliaze CUDA context on the destination */
-	cudaSetDevice(dst);
+	/* Initialize CUDA context on the destination */
+	starpu_cuda_set_device(dst);
 
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
@@ -222,7 +222,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
 	/* Free buffers */
 	cudaFree(d_buffer);
-	cudaSetDevice(src);
+	starpu_cuda_set_device(src);
 	cudaFree(s_buffer);
 
 	cudaThreadExit();

+ 27 - 1
src/core/task.c

@@ -596,10 +596,15 @@ int starpu_task_wait_for_no_ready(void)
 
 void _starpu_decrement_nsubmitted_tasks(void)
 {
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
 
-	if (--nsubmitted == 0)
+	if (--nsubmitted == 0) {
+		if (!config->submitting)
+			config->running = 0;
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
+	}
 
 	_STARPU_TRACE_UPDATE_TASK_CNT(nsubmitted);
 
@@ -607,6 +612,27 @@ void _starpu_decrement_nsubmitted_tasks(void)
 
 }
 
+void
+starpu_set_end_of_submissions(void)
+{
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+
+	config->submitting = 0;
+	if (nsubmitted == 0) {
+		config->running = 0;
+		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
+	}
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+}
+
+void _starpu_check_nsubmitted_tasks(void)
+{
+
+}
+
 static void _starpu_increment_nsubmitted_tasks(void)
 {
 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
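
A sketch of how the revised termination protocol is meant to be used when the application runs a driver itself (based on this hunk and on the gl_interop example above; last_callback is a hypothetical name):

/* Callback of the last task the application will ever submit. */
static void last_callback(void *arg)
{
	/* Mark the end of submissions: once the remaining tasks complete,
	 * _starpu_decrement_nsubmitted_tasks() clears config->running,
	 * which makes starpu_run_driver() return in the application thread. */
	starpu_set_end_of_submissions();
}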

+ 2 - 9
src/core/workers.c

@@ -249,6 +249,7 @@ static unsigned _starpu_may_launch_driver(struct starpu_conf *conf,
 static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 {
 	config->running = 1;
+	config->submitting = 1;
 
 	pthread_key_create(&worker_key, NULL);
 
@@ -424,6 +425,7 @@ int starpu_conf_init(struct starpu_conf *conf)
 	if (!conf)
 		return -EINVAL;
 
+	memset(conf, 0, sizeof(*conf));
 	conf->magic = 42;
 	conf->sched_policy_name = getenv("STARPU_SCHED");
 	conf->sched_policy = NULL;
@@ -968,15 +970,6 @@ void starpu_worker_set_sched_condition(int workerid, pthread_cond_t *sched_cond,
 	config.workers[workerid].sched_mutex = sched_mutex;
 }
 
-void
-starpu_set_end_of_submissions(void)
-{
-	struct _starpu_machine_config *config;
-	config = _starpu_get_machine_config();
-	starpu_task_wait_for_all();
-	config->running = 0;
-}
-
 #ifdef STARPU_USE_CUDA
 extern int _starpu_run_cuda(struct starpu_driver *);
 #endif

+ 3 - 0
src/core/workers.h

@@ -160,6 +160,9 @@ struct _starpu_machine_config
 
 	/* this flag is set until the runtime is stopped */
 	unsigned running;
+
+	/* this flag is set until the application is finished submitting tasks */
+	unsigned submitting;
 };
 
 /* Has starpu_shutdown already been called ? */

+ 1 - 2
src/datawizard/copy_driver.c

@@ -117,8 +117,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
 	{
 		int node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
-		cures = cudaSetDevice(_starpu_memory_node_to_devid(node));
-		STARPU_ASSERT(cures == cudaSuccess);
+		starpu_cuda_set_device(_starpu_memory_node_to_devid(node));
 	}
 #endif
 

+ 2 - 4
src/datawizard/interfaces/matrix_interface.c

@@ -456,16 +456,14 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 	/* That code is not even working!! */
 	struct cudaExtent extent = make_cudaExtent(128, 128, 128);
 
-	cures = cudaSetDevice(src_dev);
-	STARPU_ASSERT(cures == cudaSuccess);
+	starpu_cuda_set_device(src_dev);
 
 	struct cudaPitchedPtr mem_device1;
 	cures = cudaMalloc3D(&mem_device1, extent);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaSetDevice(dst_dev);
-	STARPU_ASSERT(cures == cudaSuccess);
+	starpu_cuda_set_device(dst_dev);
 
 	struct cudaPitchedPtr mem_device2;
 	cures = cudaMalloc3D(&mem_device2, extent);

+ 2 - 4
src/datawizard/memalloc.c

@@ -249,8 +249,7 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, uint32_t node)
 			 * proper CUDA device in case it is needed. This avoids
 			 * having to set it again in the free method of each
 			 * interface. */
-			cudaError_t err = cudaSetDevice(_starpu_memory_node_to_devid(node));
-			STARPU_ASSERT(err == cudaSuccess);
+			starpu_cuda_set_device(_starpu_memory_node_to_devid(node));
 		}
 #endif
 
@@ -792,8 +791,7 @@ static ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _s
 			 * proper CUDA device in case it is needed. This avoids
 			 * having to set it again in the malloc method of each
 			 * interface. */
-			cudaError_t err = cudaSetDevice(_starpu_memory_node_to_devid(dst_node));
-			STARPU_ASSERT(err == cudaSuccess);
+			starpu_cuda_set_device(_starpu_memory_node_to_devid(dst_node));
 		}
 #endif
 

+ 28 - 5
src/drivers/cuda/driver_cuda.c

@@ -26,6 +26,7 @@
 #include <drivers/driver_common/driver_common.h>
 #include "driver_cuda.h"
 #include <core/sched_policy.h>
+#include <cuda_gl_interop.h>
 
 /* the number of CUDA devices */
 static int ncudagpus;
@@ -108,14 +109,38 @@ const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid
 	return &props[devid];
 }
 
-static void init_context(int devid)
+void starpu_cuda_set_device(int devid)
 {
 	cudaError_t cures;
-	int workerid = starpu_worker_get_id();
+	struct starpu_conf *conf = _starpu_get_machine_config()->conf;
+	unsigned i;
+
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	if (conf->n_cuda_opengl_interoperability) {
+		fprintf(stderr, "OpenGL interoperability was requested, but StarPU was built with multithread GPU control support, please reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
+		STARPU_ASSERT(0);
+	}
+#else
+	for (i = 0; i < conf->n_cuda_opengl_interoperability; i++)
+		if (conf->cuda_opengl_interoperability[i] == devid) {
+			cures = cudaGLSetGLDevice(devid);
+			goto done;
+		}
+#endif
 
 	cures = cudaSetDevice(devid);
+
+done:
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
+}
+
+static void init_context(int devid)
+{
+	cudaError_t cures;
+	int workerid = starpu_worker_get_id();
+
+	starpu_cuda_set_device(devid);
 
 	/* force CUDA to initialize the context for real */
 	cures = cudaFree(0);
@@ -231,9 +256,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	/* We make sure we do manipulate the proper device */
-	cures = cudaSetDevice(args->devid);
-	if (STARPU_UNLIKELY(cures != cudaSuccess))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	starpu_cuda_set_device(args->devid);
 #endif
 
 	starpu_cuda_func_t func = _starpu_task_get_cuda_nth_implementation(cl, j->nimpl);

+ 3 - 3
tests/datawizard/gpu_register.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011-2012  Université de Bordeaux 1
  * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -95,7 +95,7 @@ test_cuda(void)
 	size = 10 * n;
 
 	devid = starpu_worker_get_devid(chosen);
-	cudaSetDevice(devid);
+	starpu_cuda_set_device(devid);
 	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
 
 	foo = calloc(size, sizeof(*foo));
@@ -133,7 +133,7 @@ test_cuda(void)
 	starpu_data_unpartition(handle, starpu_worker_get_memory_node(chosen));
 	starpu_data_unregister(handle);
 
-	cudaSetDevice(devid);
+	starpu_cuda_set_device(devid);
 	cures = cudaMemcpy(foo, foo_gpu, size * sizeof(*foo_gpu), cudaMemcpyDeviceToHost);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 1 - 1
tests/experiments/latency/cuda_latency.c

@@ -113,7 +113,7 @@ void *launch_gpu_thread(void *arg)
 	unsigned *idptr = arg;
 	unsigned id = *idptr;
 
-	cudaSetDevice(id);
+	starpu_cuda_set_device(id);
 	cudaFree(0);
 
 	cudaMalloc(&gpu_buffer[id], buffer_size);