
Add OpenGL interoperability support

Samuel Thibault, 13 years ago
parent commit acce0b71ce

+ 1 - 0
configure.ac

@@ -1312,6 +1312,7 @@ fi
 AC_MSG_CHECKING(whether OpenGL rendering is enabled)
 AC_SUBST(STARPU_OPENGL_RENDER, $enable_opengl_render)
 AC_MSG_RESULT($enable_opengl_render)
+AM_CONDITIONAL([HAVE_OPENGL], [test "x$enable_opengl_render" = xyes])
 
 AC_PATH_XTRA
 if test "x$no_x" != "xyes"; then

+ 18 - 10
doc/chapters/advanced-examples.texi

@@ -913,8 +913,22 @@ Graphical-oriented applications need to draw the result of their computations,
 typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
 interoperability permit to let CUDA directly work on the OpenGL buffers, making
 them thus immediately ready for drawing, by mapping OpenGL buffer, textures or
-renderbuffer objects into CUDA. To achieve this with StarPU, it simply needs to
-be given the CUDA pointer at registration, for instance:
+renderbuffer objects into CUDA.  CUDA however imposes some technical
+constraints: peer memcpy has to be disabled, and the thread that runs OpenGL
+has to be the one that runs the CUDA computations for that GPU.
+
+To achieve this with StarPU, pass the @code{--disable-cuda-memcpy-peer} option
+to @code{./configure} (TODO: make it dynamic), enable the interoperability mode
+through the @code{cuda_opengl_interoperability} field of the @code{starpu_conf}
+structure, and run the driver loop from the application itself: use the
+@code{not_launched_drivers} field of @code{starpu_conf} to prevent StarPU from
+running the driver in a separate thread, and call @code{starpu_run_driver} to
+run the loop. The @code{gl_interop} example shows how these pieces fit together
+in a simple case where rendering is done in task callbacks. TODO: provide a
+glutIdleFunc alternative.
+
+Then, to use an OpenGL buffer as CUDA data, StarPU simply needs to be given
+the CUDA pointer at registration, for instance:
 
 @cartouche
 @smallexample
@@ -922,21 +936,15 @@ for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
         if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
                 break;
 
-cudaSetDevice(starpu_worker_get_devid(workerid));
 cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
 starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid), output, num_bytes / sizeof(float4), sizeof(float4));
 
 starpu_insert_task(&cl, STARPU_RW, handle, 0);
-
-starpu_data_unregister(handle);
-
-cudaSetDevice(starpu_worker_get_devid(workerid));
-cudaGraphicsUnmapResources(1, &resource, 0);
-
-/* Now display it */
 @end smallexample
 @end cartouche
 
+and the application can then display the result, for instance in the task callback function.
+
 @node More examples
 @section More examples
 
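
For completeness: the registration example above starts from an already-mapped resource. The calls that produce it are standard CUDA/OpenGL interoperability API, not part of this commit; below is a minimal sketch, assuming vbo is an OpenGL buffer object created by the application and memory_node is the CUDA worker's memory node obtained as in the example.

#include <cuda_gl_interop.h>
#include <starpu.h>
#include <stdint.h>

/* Sketch only: map a GL buffer object into CUDA and hand the resulting device
 * pointer to StarPU, as in the documentation example above.  The resource must
 * be kept around so that it can later be unmapped with
 * cudaGraphicsUnmapResources() once the handle has been unregistered. */
static starpu_data_handle_t register_gl_buffer(GLuint vbo, unsigned memory_node,
                                               struct cudaGraphicsResource **resource)
{
	float4 *output;
	size_t num_bytes;
	starpu_data_handle_t handle;

	/* Make the GL buffer object visible to CUDA (to be done once). */
	cudaGraphicsGLRegisterBuffer(resource, vbo, cudaGraphicsRegisterFlagsNone);

	/* Map it and retrieve a CUDA device pointer to its contents. */
	cudaGraphicsMapResources(1, resource, 0);
	cudaGraphicsResourceGetMappedPointer((void **)&output, &num_bytes, *resource);

	/* Register the CUDA pointer with StarPU, as shown above. */
	starpu_vector_data_register(&handle, memory_node, (uintptr_t) output,
	                            num_bytes / sizeof(float4), sizeof(float4));
	return handle;
}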

+ 15 - 0
doc/chapters/basic-api.texi

@@ -141,6 +141,15 @@ The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 
+@item @code{int *cuda_opengl_interoperability} (default = NULL)
+This can be set to an array of CUDA device identifiers for which
+@code{cudaGLSetGLDevice} should be called instead of @code{cudaSetDevice}. Its
+size is specified by the @code{n_cuda_opengl_interoperability} field below.
+
+@item @code{unsigned n_cuda_opengl_interoperability} (default = 0)
+This has to be set to the size of the array pointed to by the
+@code{cuda_opengl_interoperability} field.
+
 @item @code{struct starpu_driver *not_launched_drivers}
 The drivers that should not be launched by StarPU.
 
@@ -2158,6 +2167,12 @@ successfull. It returns 0 if the synchronous copy was successful, or
 fails otherwise.
 @end deftypefun
 
+@deftypefun void starpu_cuda_set_device (int @var{devid})
+Calls @code{cudaGLSetGLDevice(devid)} if @code{devid} appears in the
+@code{cuda_opengl_interoperability} field of the @code{starpu_conf} structure,
+and @code{cudaSetDevice(devid)} otherwise.
+@end deftypefun
+
 @deftypefun void starpu_helper_cublas_init (void)
 This function initializes CUBLAS on every CUDA device.
 The CUBLAS library must be initialized prior to any CUBLAS call. Calling

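A condensed sketch of how the two new starpu_conf fields and starpu_cuda_set_device fit together on the application side, distilled from the gl_interop example and the gpu_register test elsewhere in this commit; it assumes StarPU is built with CUDA support and --disable-cuda-memcpy-peer, and omits most error checking.

#include <starpu.h>
#include <starpu_cuda.h>

int main(void)
{
	struct starpu_conf conf;
	/* CUDA devices that share a context with OpenGL */
	int cuda_devices[] = { 0 };
	int workerid, devid;

	starpu_conf_init(&conf);
	conf.cuda_opengl_interoperability = cuda_devices;
	conf.n_cuda_opengl_interoperability = sizeof(cuda_devices) / sizeof(*cuda_devices);
	if (starpu_init(&conf) != 0)
		return 77;

	/* Whenever the application issues raw CUDA calls itself, it selects the
	 * device through StarPU, so that cudaGLSetGLDevice() is used for the
	 * devices listed above and cudaSetDevice() for the others. */
	for (workerid = 0; workerid < (int) starpu_worker_get_count(); workerid++)
		if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
			break;
	devid = starpu_worker_get_devid(workerid);
	starpu_cuda_set_device(devid);

	/* ... raw cudaMalloc()/cudaMemcpy() calls, task submission, etc. ... */

	starpu_shutdown();
	return 0;
}
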
+ 15 - 0
examples/Makefile.am

@@ -826,6 +826,21 @@ pi_pi_redux_LDADD =				\
 	$(STARPU_CURAND_LDFLAGS)
 endif
 
+###########################
+# OpenGL interoperability #
+###########################
+
+if HAVE_OPENGL
+examplebin_PROGRAMS +=				\
+	gl_interop/gl_interop
+
+gl_interop_gl_interop_SOURCES =			\
+	gl_interop/gl_interop.c
+
+gl_interop_gl_interop_LDADD =			\
+	$(STARPU_OPENGL_RENDER_LDFLAGS)
+endif
+
 showcheck:
 	-cat $(TEST_LOGS) /dev/null
 	for i in $(SUBDIRS) ; do \

+ 130 - 0
examples/gl_interop/gl_interop.c

@@ -0,0 +1,130 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example demonstrates how to use StarPU combined with OpenGL rendering,
+ * which needs:
+ *
+ * - initializing GLUT first,
+ * - enabling CUDA/OpenGL interoperability at StarPU initialization,
+ * - running the corresponding CUDA worker in the GLUT thread (here, the main
+ *   thread).
+ */
+
+#include <starpu.h>
+#include <unistd.h>
+#include <GL/glut.h>
+
+void dummy(void *buffers[], void *cl_arg)
+{
+	float *v = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
+
+	printf("Codelet running\n");
+	cudaMemset(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float));
+	printf("Codelet done\n");
+}
+
+struct starpu_codelet cl = {
+	.where = STARPU_CUDA,
+	.cuda_funcs = { dummy, NULL },
+	.nbuffers = 1,
+	.modes = { STARPU_W },
+};
+
+void foo(void) {
+}
+
+void display(float i) {
+	glClear(GL_COLOR_BUFFER_BIT);
+	glColor3f(1, 1, 1);
+	glBegin(GL_LINES);
+	glVertex2f(-i, -i);
+	glVertex2f(i, i);
+	glEnd();
+	glFinish();
+	glutPostRedisplay();
+	glutMainLoopEvent();
+}
+
+void callback_func(void *foo) {
+	printf("Callback running, rendering\n");
+	float i = 1.;
+	while (i > 0) {
+		usleep(100000);
+		display(i);
+		i -= 0.1;
+	}
+	printf("rendering done\n");
+
+	/* Tell StarPU that this was the last submitted task */
+	starpu_set_end_of_submissions();
+}
+
+int main(int argc, char **argv)
+{
+#if !(defined(STARPU_USE_CUDA) && defined(STARPU_OPENGL_RENDER))
+	return 77;
+#else
+	struct starpu_conf conf;
+	int cuda_device = 0;
+	int cuda_devices[] = { cuda_device };
+	struct starpu_driver drivers[] = {
+		{ .type = STARPU_CUDA_WORKER, .id.cuda_id = cuda_device }
+	};
+	int ret;
+	struct starpu_task *task;
+	starpu_data_handle_t handle;
+
+	glutInit(&argc, argv);
+	glutInitDisplayMode (GLUT_SINGLE | GLUT_RGB);
+	glutInitWindowPosition(0, 0);
+	glutInitWindowSize(300,200);
+	glutCreateWindow("StarPU OpenGL interoperability test");
+	glClearColor (0.5, 0.5, 0.5, 0.0);
+
+	/* Enable OpenGL interoperability */
+	starpu_conf_init(&conf);
+	conf.ncuda = 1;
+	conf.ncpus = 0;
+	conf.nopencl = 0;
+	conf.cuda_opengl_interoperability = cuda_devices;
+	conf.n_cuda_opengl_interoperability = sizeof(cuda_devices) / sizeof(*cuda_devices);
+	conf.not_launched_drivers = drivers;
+	conf.n_not_launched_drivers = sizeof(drivers) / sizeof(*drivers);
+	ret = starpu_init(&conf);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_vector_data_register(&handle, -1, 0, 10, sizeof(float));
+
+	/* Submit just one dumb task */
+	task = starpu_task_create();
+	task->cl = &cl;
+	task->handles[0] = handle;
+	task->callback_func = callback_func;
+	task->callback_arg = NULL;
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* And run the driver, which will run the task */
+	printf("running the driver\n");
+	starpu_run_driver(&drivers[0]);
+	printf("finished running the driver\n");
+
+	starpu_shutdown();
+
+	return 0;
+#endif
+}

+ 4 - 0
include/starpu.h

@@ -121,6 +121,10 @@ struct starpu_conf
         /* indicate if the asynchronous copies should be disabled */
 	int disable_asynchronous_copy;
 
+	/* Enable CUDA/OpenGL interoperation on these CUDA devices */
+	int *cuda_opengl_interoperability;
+	unsigned n_cuda_opengl_interoperability;
+
 	/* A driver that the application will run in one of its own threads. */
 	struct starpu_driver *not_launched_drivers;
 	unsigned n_not_launched_drivers;

+ 3 - 1
include/starpu_cuda.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -46,6 +46,8 @@ const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid
 
 int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind);
 
+void starpu_cuda_set_device(int devid);
+
 #ifdef __cplusplus
 }
 #endif

+ 7 - 7
src/core/perfmodel/perfmodel_bus.c

@@ -92,8 +92,8 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	_starpu_bind_thread_on_cpu(config, cpu);
 	size_t size = SIZE;
 
-	/* Initiliaze CUDA context on the device */
-	cudaSetDevice(dev);
+	/* Initialize CUDA context on the device */
+	starpu_cuda_set_device(dev);
 
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
@@ -185,8 +185,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
         if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
 
-	/* Initiliaze CUDA context on the source */
-	cudaSetDevice(src);
+	/* Initialize CUDA context on the source */
+	starpu_cuda_set_device(src);
 
 	/* Allocate a buffer on the device */
 	unsigned char *s_buffer;
@@ -194,8 +194,8 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	STARPU_ASSERT(s_buffer);
 	cudaMemset(s_buffer, 0, size);
 
-	/* Initiliaze CUDA context on the destination */
-	cudaSetDevice(dst);
+	/* Initialize CUDA context on the destination */
+	starpu_cuda_set_device(dst);
 
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
@@ -222,7 +222,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
 	/* Free buffers */
 	cudaFree(d_buffer);
-	cudaSetDevice(src);
+	starpu_cuda_set_device(src);
 	cudaFree(s_buffer);
 
 	cudaThreadExit();

+ 27 - 1
src/core/task.c

@@ -596,10 +596,15 @@ int starpu_task_wait_for_no_ready(void)
 
 void _starpu_decrement_nsubmitted_tasks(void)
 {
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
 
-	if (--nsubmitted == 0)
+	if (--nsubmitted == 0) {
+		if (!config->submitting)
+			config->running = 0;
 		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
+	}
 
 	_STARPU_TRACE_UPDATE_TASK_CNT(nsubmitted);
 
@@ -607,6 +612,27 @@ void _starpu_decrement_nsubmitted_tasks(void)
 
 }
 
+void
+starpu_set_end_of_submissions(void)
+{
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+
+	config->submitting = 0;
+	if (nsubmitted == 0) {
+		config->running = 0;
+		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
+	}
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+}
+
+void _starpu_check_nsubmitted_tasks(void)
+{
+
+}
+
 static void _starpu_increment_nsubmitted_tasks(void)
 {
 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);

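A condensed sketch of the contract implemented by the submitting/running flags above, as exercised by the gl_interop example: starpu_run_driver only returns once the application has called starpu_set_end_of_submissions and all submitted tasks have completed. The codelet cl is assumed to be defined as in that example.

#include <starpu.h>

/* Sketch only: `cl' is assumed to be a codelet set up as in the gl_interop
 * example added by this commit. */
extern struct starpu_codelet cl;

static void last_callback(void *arg)
{
	(void) arg;
	/* ... final rendering ... */

	/* Declare that no further tasks will be submitted; once the last
	 * submitted task has completed, the driver loop below returns. */
	starpu_set_end_of_submissions();
}

static void submit_and_run(struct starpu_driver *driver, starpu_data_handle_t handle)
{
	struct starpu_task *task = starpu_task_create();
	task->cl = &cl;
	task->handles[0] = handle;
	task->callback_func = last_callback;
	starpu_task_submit(task);

	/* Runs the CUDA driver in the calling thread; returns once the callback
	 * above has marked the end of submissions and all tasks are done. */
	starpu_run_driver(driver);
}
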
+ 2 - 9
src/core/workers.c

@@ -249,6 +249,7 @@ static unsigned _starpu_may_launch_driver(struct starpu_conf *conf,
 static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 {
 	config->running = 1;
+	config->submitting = 1;
 
 	pthread_key_create(&worker_key, NULL);
 
@@ -424,6 +425,7 @@ int starpu_conf_init(struct starpu_conf *conf)
 	if (!conf)
 		return -EINVAL;
 
+	memset(conf, 0, sizeof(*conf));
 	conf->magic = 42;
 	conf->sched_policy_name = getenv("STARPU_SCHED");
 	conf->sched_policy = NULL;
@@ -968,15 +970,6 @@ void starpu_worker_set_sched_condition(int workerid, pthread_cond_t *sched_cond,
 	config.workers[workerid].sched_mutex = sched_mutex;
 }
 
-void
-starpu_set_end_of_submissions(void)
-{
-	struct _starpu_machine_config *config;
-	config = _starpu_get_machine_config();
-	starpu_task_wait_for_all();
-	config->running = 0;
-}
-
 #ifdef STARPU_USE_CUDA
 extern int _starpu_run_cuda(struct starpu_driver *);
 #endif

+ 3 - 0
src/core/workers.h

@@ -160,6 +160,9 @@ struct _starpu_machine_config
 
 	/* this flag is set until the runtime is stopped */
 	unsigned running;
+
+	/* this flag is set until the application is finished submitting tasks */
+	unsigned submitting;
 };
 
 /* Has starpu_shutdown already been called ? */

+ 1 - 2
src/datawizard/copy_driver.c

@@ -117,8 +117,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
 	{
 		int node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
-		cures = cudaSetDevice(_starpu_memory_node_to_devid(node));
-		STARPU_ASSERT(cures == cudaSuccess);
+		starpu_cuda_set_device(_starpu_memory_node_to_devid(node));
 	}
 #endif
 

+ 2 - 4
src/datawizard/interfaces/matrix_interface.c

@@ -456,16 +456,14 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 	/* That code is not even working!! */
 	struct cudaExtent extent = make_cudaExtent(128, 128, 128);
 
-	cures = cudaSetDevice(src_dev);
-	STARPU_ASSERT(cures == cudaSuccess);
+	starpu_cuda_set_device(src_dev);
 
 	struct cudaPitchedPtr mem_device1;
 	cures = cudaMalloc3D(&mem_device1, extent);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaSetDevice(dst_dev);
-	STARPU_ASSERT(cures == cudaSuccess);
+	starpu_cuda_set_device(dst_dev);
 
 	struct cudaPitchedPtr mem_device2;
 	cures = cudaMalloc3D(&mem_device2, extent);

+ 2 - 4
src/datawizard/memalloc.c

@@ -249,8 +249,7 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, uint32_t node)
 			 * proper CUDA device in case it is needed. This avoids
 			 * having to set it again in the free method of each
 			 * interface. */
-			cudaError_t err = cudaSetDevice(_starpu_memory_node_to_devid(node));
-			STARPU_ASSERT(err == cudaSuccess);
+			starpu_cuda_set_device(_starpu_memory_node_to_devid(node));
 		}
 #endif
 
@@ -792,8 +791,7 @@ static ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _s
 			 * proper CUDA device in case it is needed. This avoids
 			 * having to set it again in the malloc method of each
 			 * interface. */
-			cudaError_t err = cudaSetDevice(_starpu_memory_node_to_devid(dst_node));
-			STARPU_ASSERT(err == cudaSuccess);
+			starpu_cuda_set_device(_starpu_memory_node_to_devid(dst_node));
 		}
 #endif
 

+ 28 - 5
src/drivers/cuda/driver_cuda.c

@@ -26,6 +26,7 @@
 #include <drivers/driver_common/driver_common.h>
 #include "driver_cuda.h"
 #include <core/sched_policy.h>
+#include <cuda_gl_interop.h>
 
 /* the number of CUDA devices */
 static int ncudagpus;
@@ -108,14 +109,38 @@ const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid
 	return &props[devid];
 }
 
-static void init_context(int devid)
+void starpu_cuda_set_device(int devid)
 {
 	cudaError_t cures;
-	int workerid = starpu_worker_get_id();
+	struct starpu_conf *conf = _starpu_get_machine_config()->conf;
+	unsigned i;
+
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	if (conf->n_cuda_opengl_interoperability) {
+		fprintf(stderr, "OpenGL interoperability was requested, but StarPU was built with multithreaded GPU control support; please reconfigure with --disable-cuda-memcpy-peer (note that this will disable the memcpy-peer optimizations)\n");
+		STARPU_ASSERT(0);
+	}
+#else
+	for (i = 0; i < conf->n_cuda_opengl_interoperability; i++)
+		if (conf->cuda_opengl_interoperability[i] == devid) {
+			cures = cudaGLSetGLDevice(devid);
+			goto done;
+		}
+#endif
 
 	cures = cudaSetDevice(devid);
+
+done:
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
+}
+
+static void init_context(int devid)
+{
+	cudaError_t cures;
+	int workerid = starpu_worker_get_id();
+
+	starpu_cuda_set_device(devid);
 
 	/* force CUDA to initialize the context for real */
 	cures = cudaFree(0);
@@ -231,9 +256,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	/* We make sure we do manipulate the proper device */
-	cures = cudaSetDevice(args->devid);
-	if (STARPU_UNLIKELY(cures != cudaSuccess))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	starpu_cuda_set_device(args->devid);
 #endif
 
 	starpu_cuda_func_t func = _starpu_task_get_cuda_nth_implementation(cl, j->nimpl);

+ 3 - 3
tests/datawizard/gpu_register.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011-2012  Université de Bordeaux 1
  * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -95,7 +95,7 @@ test_cuda(void)
 	size = 10 * n;
 
 	devid = starpu_worker_get_devid(chosen);
-	cudaSetDevice(devid);
+	starpu_cuda_set_device(devid);
 	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
 
 	foo = calloc(size, sizeof(*foo));
@@ -133,7 +133,7 @@ test_cuda(void)
 	starpu_data_unpartition(handle, starpu_worker_get_memory_node(chosen));
 	starpu_data_unregister(handle);
 
-	cudaSetDevice(devid);
+	starpu_cuda_set_device(devid);
 	cures = cudaMemcpy(foo, foo_gpu, size * sizeof(*foo_gpu), cudaMemcpyDeviceToHost);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 1 - 1
tests/experiments/latency/cuda_latency.c

@@ -113,7 +113,7 @@ void *launch_gpu_thread(void *arg)
 	unsigned *idptr = arg;
 	unsigned id = *idptr;
 
-	cudaSetDevice(id);
+	starpu_cuda_set_device(id);
 	cudaFree(0);
 
 	cudaMalloc(&gpu_buffer[id], buffer_size);