лет назад: 13 · 2b30d49db1
--- a/configure.ac
+++ b/configure.ac
@@ -854,6 +854,7 @@ if test x$enable_simgrid = xyes ; then
 
				 	# We won't bind or detect anything
			
 
				 	with_hwloc=no
			
 
				 fi
			
 
				+AM_CONDITIONAL(STARPU_SIMGRID, test x$enable_simgrid = xyes)
			
 
				 AC_SUBST(SIMGRID_CFLAGS)
			
 
				 AC_SUBST(SIMGRID_LIBS)
			
 
				 
			
--- a/examples/cholesky/cholesky.h
+++ b/examples/cholesky/cholesky.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -77,6 +77,10 @@ void chol_cpu_codelet_update_u22(void **, void *);
 
				 void chol_cublas_codelet_update_u11(void *descr[], void *_args);
			
 
				 void chol_cublas_codelet_update_u21(void *descr[], void *_args);
			
 
				 void chol_cublas_codelet_update_u22(void *descr[], void *_args);
			
 
				+#elif defined(STARPU_SIMGRID)
			
 
				+#define chol_cublas_codelet_update_u11 ((void*)1)
			
 
				+#define chol_cublas_codelet_update_u21 ((void*)1)
			
 
				+#define chol_cublas_codelet_update_u22 ((void*)1)
			
 
				 #endif
			
 
				 
			
 
				 extern struct starpu_perfmodel chol_model_11;
			
--- a/examples/cholesky/cholesky_implicit.c
+++ b/examples/cholesky/cholesky_implicit.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
@@ -27,7 +27,7 @@ static struct starpu_codelet cl11 =
 
				 	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.type = STARPU_SEQ,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined (STARPU_SIMGRID)
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
			
 
				 #endif
			
 
				 	.nbuffers = 1,
			
@@ -40,7 +40,7 @@ static struct starpu_codelet cl21 =
 
				 	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.type = STARPU_SEQ,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined (STARPU_SIMGRID)
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
			
 
				 #endif
			
 
				 	.nbuffers = 2,
			
@@ -54,7 +54,7 @@ static struct starpu_codelet cl22 =
 
				 	.type = STARPU_SEQ,
			
 
				 	.max_parallelism = INT_MAX,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined (STARPU_SIMGRID)
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
			
 
				 #endif
			
 
				 	.nbuffers = 3,
			
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -81,6 +81,8 @@ struct starpu_driver
 
				 		unsigned cuda_id;
			
 
				 #if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
			
 
				 		cl_device_id opencl_id;
			
 
				+#elif defined(STARPU_SIMGRID)
			
 
				+		unsigned opencl_id;
			
 
				 #endif
			
 
				 		/*
			
 
				 		 * HOWTO: add a new kind of device to the starpu_driver structure.
			
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011-2012  Institut National de Recherche en Informatique et Automatique
			
 
				  *
			
@@ -75,6 +75,10 @@ struct starpu_data_copy_methods
 
				 	int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				 	int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				 	int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				+#else
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+	int cuda_to_cuda_async;
			
 
				+#endif
			
 
				 #endif
			
 
				 
			
 
				 #if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
			
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -229,6 +229,10 @@ endif
 
				 
			
 
				 if STARPU_USE_CUDA
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/driver_cuda.c
			
 
				+else
			
 
				+if STARPU_SIMGRID
			
 
				+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/driver_cuda.c
			
 
				+endif
			
 
				 endif
			
 
				 
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas.c
			
@@ -236,6 +240,10 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas.c
 
				 if STARPU_USE_OPENCL
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl.c
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl_utils.c
			
 
				+else
			
 
				+if STARPU_SIMGRID
			
 
				+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl.c
			
 
				+endif
			
 
				 endif
			
 
				 
			
 
				 showcheck:
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -63,11 +63,9 @@ struct dev_timing
 
				 static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
			
 
				 static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
			
 
				 static unsigned was_benchmarked = 0;
			
 
				-#ifndef STARPU_SIMGRID
			
 
				 static unsigned ncpus = 0;
			
 
				 static int ncuda = 0;
			
 
				 static int nopencl = 0;
			
 
				-#endif
			
 
				 
			
 
				 /* Benchmarking the performance of the bus */
			
 
				 
			
@@ -812,6 +810,7 @@ static void load_bus_affinity_file_content(void)
 
				 
			
 
				 }
			
 
				 
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 static void write_bus_affinity_file_content(void)
			
 
				 {
			
 
				 	STARPU_ASSERT(was_benchmarked);
			
@@ -870,6 +869,7 @@ static void write_bus_affinity_file_content(void)
 
				 	fclose(f);
			
 
				 #endif
			
 
				 }
			
 
				+#endif /* STARPU_SIMGRID */
			
 
				 
			
 
				 static void generate_bus_affinity_file(void)
			
 
				 {
			
@@ -895,7 +895,6 @@ static void load_bus_affinity_file(void)
 
				 
			
 
				 	load_bus_affinity_file_content();
			
 
				 }
			
 
				-#endif /* !SIMGRID */
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 int *_starpu_get_cuda_affinity_vector(unsigned gpuid)
			
@@ -945,6 +944,7 @@ void starpu_bus_print_affinity(FILE *f)
 
				 	}
			
 
				 #endif
			
 
				 }
			
 
				+#endif /* STARPU_SIMGRID */
			
 
				 
			
 
				 /*
			
 
				  *	Latency
			
@@ -1343,6 +1343,7 @@ static void write_bus_bandwidth_file_content(void)
 
				 
			
 
				 	fclose(f);
			
 
				 }
			
 
				+#endif /* STARPU_SIMGRID */
			
 
				 
			
 
				 void starpu_bus_print_bandwidth(FILE *f)
			
 
				 {
			
@@ -1434,7 +1435,6 @@ void starpu_bus_print_bandwidth(FILE *f)
 
				 	}
			
 
				 #endif
			
 
				 }
			
 
				-#endif
			
 
				 
			
 
				 static void generate_bus_bandwidth_file(void)
			
 
				 {
			
@@ -1591,6 +1591,13 @@ void _starpu_load_bus_performance_files(void)
 
				 {
			
 
				 	_starpu_create_sampling_directory_if_needed();
			
 
				 
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_SIMGRID)
			
 
				+	ncuda = _starpu_get_cuda_device_count();
			
 
				+#endif
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SIMGRID)
			
 
				+	nopencl = _starpu_opencl_get_device_count();
			
 
				+#endif
			
 
				+
			
 
				 #ifndef STARPU_SIMGRID
			
 
				         check_bus_config_file();
			
 
				 	load_bus_affinity_file();
			
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -431,7 +431,7 @@ int _starpu_push_task_end(struct starpu_task *task)
 
				 struct starpu_task *_starpu_create_conversion_task(starpu_data_handle_t handle,
			
 
				 						   unsigned int node)
			
 
				 {
			
 
				-	_starpu_create_conversion_task_for_arch(handle, starpu_node_get_kind(node));
			
 
				+	return _starpu_create_conversion_task_for_arch(handle, starpu_node_get_kind(node));
			
 
				 }
			
 
				 
			
 
				 struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t handle,
			
@@ -459,7 +459,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 
				 		{
			
 
				 		case STARPU_CPU_RAM:
			
 
				 			STARPU_ABORT();
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 		case STARPU_CUDA_RAM:
			
 
				 		{
			
 
				 			struct starpu_multiformat_data_interface_ops *mf_ops;
			
@@ -468,7 +468,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 
				 			break;
			
 
				 		}
			
 
				 #endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 		case STARPU_OPENCL_RAM:
			
 
				 		{
			
 
				 			struct starpu_multiformat_data_interface_ops *mf_ops;
			
@@ -481,7 +481,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 
				 			_STARPU_ERROR("Oops : %u\n", handle->mf_node);
			
 
				 		}
			
 
				 		break;
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 	case STARPU_CUDA_RAM:
			
 
				 		{
			
 
				 			struct starpu_multiformat_data_interface_ops *mf_ops;
			
@@ -490,7 +490,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 
				 			break;
			
 
				 		}
			
 
				 #endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 	case STARPU_OPENCL_RAM:
			
 
				 	{
			
 
				 		struct starpu_multiformat_data_interface_ops *mf_ops;
			
--- a/src/core/simgrid.c
+++ b/src/core/simgrid.c
@@ -133,6 +133,24 @@ int do_starpu_main(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBU
 
				 	return starpu_main(args->argc, args->argv);
			
 
				 }
			
 
				 
			
 
				+int _starpu_simgrid_get_nbhosts(const char *prefix)
			
 
				+{
			
 
				+	int ret;
			
 
				+	xbt_dynar_t hosts = MSG_hosts_as_dynar();
			
 
				+	unsigned i, nb = xbt_dynar_length(hosts);
			
 
				+	unsigned len = strlen(prefix);
			
 
				+
			
 
				+	ret = 0;
			
 
				+	for (i = 0; i < nb; i++) {
			
 
				+		const char *name;
			
 
				+		name = MSG_host_get_name(xbt_dynar_get_as(hosts, i, msg_host_t));
			
 
				+		if (!strncmp(name, prefix, len))
			
 
				+			ret++;
			
 
				+	}
			
 
				+	xbt_dynar_free(&hosts);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				 #ifdef STARPU_DEVEL
			
 
				 #warning TODO: use another way to start main, when simgrid provides it, and then include the application-provided configuration for platform numbers
			
 
				 #endif
			
@@ -144,7 +162,6 @@ int main(int argc, char **argv)
 
				 	char name[] = "/tmp/starpu-simgrid-platform.xml.XXXXXX";
			
 
				 	int fd;
			
 
				 	FILE *file;
			
 
				-	struct starpu_machine_topology *topology = &_starpu_get_machine_config()->topology;
			
 
				 
			
 
				 	if (!starpu_main)
			
 
				 	{
			
@@ -161,38 +178,18 @@ int main(int argc, char **argv)
 
				 	/* Create platform file */
			
 
				 	starpu_conf_init(&conf);
			
 
				 	if ((!getenv("STARPU_NCPUS") && !getenv("STARPU_NCPU"))
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	 || !getenv("STARPU_NCUDA")
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	 || !getenv("STARPU_NOPENCL")
			
 
				-#endif
			
 
				-			)
			
 
				+	  || !getenv("STARPU_NCUDA")
			
 
				+	  || !getenv("STARPU_NOPENCL"))
			
 
				 	{
			
 
				-		_STARPU_ERROR("Please specify the number of cpus and gpus by setting the environment variables STARPU_NCPU%s%s\n",
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-			      ", STARPU_NCUDA",
			
 
				-#else
			
 
				-			      "",
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-			      ", STARPU_NOPENCL"
			
 
				-#else
			
 
				-			      ""
			
 
				-#endif
			
 
				-			);
			
 
				+		_STARPU_ERROR("Please specify the number of cpus and gpus by setting the environment variables STARPU_NCPU, STARPU_NCUDA, STARPU_NOPENCL\n");
			
 
				 		exit(EXIT_FAILURE);
			
 
				 	}
			
 
				 	_starpu_conf_check_environment(&conf);
			
 
				 
			
 
				 	_starpu_load_bus_performance_files();
			
 
				 
			
 
				-	topology->ncpus = conf.ncpus;
			
 
				-	topology->ncudagpus = conf.ncuda;
			
 
				-	topology->nopenclgpus = conf.nopencl;
			
 
				-
			
 
				-	/* TODO: rather use simgrid/platf.h */
			
 
				-	/* TODO: but still permit the user to provide his own xml */
			
 
				+	/* TODO: make the user to provide his own xml */
			
 
				+	/* And remove the hack in _starpu_cpu_discover_devices */
			
 
				 	fd = mkstemp(name);
			
 
				 	file = fdopen(fd, "w");
			
 
				 	starpu_simgrid_write_platform(&conf, file);
			
@@ -324,8 +321,10 @@ static int transfer_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARP
 
				 {
			
 
				 	struct transfer *transfer = MSG_process_get_data(MSG_process_self());
			
 
				 	unsigned i;
			
 
				+	_STARPU_DEBUG("transfer %p started\n", transfer);
			
 
				 	MSG_task_execute(transfer->task);
			
 
				 	MSG_task_destroy(transfer->task);
			
 
				+	_STARPU_DEBUG("transfer %p finished\n", transfer);
			
 
				 	_STARPU_PTHREAD_MUTEX_LOCK(transfer->mutex);
			
 
				 	*transfer->finished = 1;
			
 
				 	_STARPU_PTHREAD_COND_BROADCAST(transfer->cond);
			
@@ -409,6 +408,9 @@ int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node,
 
				 	task = MSG_parallel_task_create("copy", 2, hosts, computation, communication, NULL);
			
 
				 
			
 
				 	struct transfer *transfer = transfer_new();
			
 
				+
			
 
				+	_STARPU_DEBUG("creating transfer %p for %lu bytes\n", transfer, (unsigned long) size);
			
 
				+
			
 
				 	transfer->task = task;
			
 
				 	transfer->src_node = src_node;
			
 
				 	transfer->dst_node = dst_node;
			
--- a/src/core/simgrid.h
+++ b/src/core/simgrid.h
@@ -22,6 +22,8 @@
 
				 
			
 
				 void _starpu_simgrid_execute_job(struct _starpu_job *job, enum starpu_perf_archtype perf_arch, double length);
			
 
				 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
			
 
				+/* Return the number of hosts prefixed by PREFIX */
			
 
				+int _starpu_simgrid_get_nbhosts(const char *prefix);
			
 
				 #endif
			
 
				 
			
 
				 #endif // __SIMGRID_H__
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013 Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  INRIA
			
 
				  *
			
@@ -45,7 +45,7 @@
 
				 
			
 
				 static unsigned topology_is_initialized = 0;
			
 
				 
			
 
				-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 
			
 
				 struct handle_entry
			
 
				 {
			
@@ -53,7 +53,7 @@ struct handle_entry
 
				 	unsigned gpuid;
			
 
				 };
			
 
				 
			
 
				-#  ifdef STARPU_USE_CUDA
			
 
				+#  if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 /* Entry in the `devices_using_cuda' hash table.  */
			
 
				 static struct handle_entry *devices_using_cuda;
			
 
				 #  endif
			
@@ -67,7 +67,7 @@ static unsigned may_bind_automatically = 0;
 
				  * Discover the topology of the machine
			
 
				  */
			
 
				 
			
 
				-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 static void
			
 
				 _starpu_initialize_workers_gpuid (int *explicit_workers_gpuid,
			
 
				 				  int *current, int *workers_gpuid,
			
@@ -150,7 +150,7 @@ _starpu_initialize_workers_gpuid (int *explicit_workers_gpuid,
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 static void
			
 
				 _starpu_initialize_workers_cuda_gpuid (struct _starpu_machine_config *config)
			
 
				 {
			
@@ -177,7 +177,7 @@ _starpu_get_next_cuda_gpuid (struct _starpu_machine_config *config)
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 static void
			
 
				 _starpu_initialize_workers_opencl_gpuid (struct _starpu_machine_config*config)
			
 
				 {
			
@@ -272,23 +272,18 @@ _starpu_init_topology (struct _starpu_machine_config *config)
 
				 	if (topology_is_initialized)
			
 
				 		return;
			
 
				 
			
 
				-#ifdef STARPU_SIMGRID
			
 
				-	struct starpu_conf *conf = config->conf;
			
 
				-	topology->nhwcpus = conf->ncpus?conf->ncpus:1;
			
 
				-	topology->nhwcudagpus = conf->ncuda;
			
 
				-	topology->nhwopenclgpus = conf->nopencl;
			
 
				-#else
			
 
				 	topology->nhwcpus = 0;
			
 
				 
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 	hwloc_topology_init(&topology->hwtopology);
			
 
				 	hwloc_topology_load(topology->hwtopology);
			
 
				 #endif
			
 
				+#endif
			
 
				 
			
 
				 	_starpu_cpu_discover_devices(config);
			
 
				 	_starpu_cuda_discover_devices(config);
			
 
				 	_starpu_opencl_discover_devices(config);
			
 
				-#endif
			
 
				 
			
 
				 	topology_is_initialized = 1;
			
 
				 }
			
@@ -428,10 +423,10 @@ _starpu_get_next_bindid (struct _starpu_machine_config *config,
 
				 unsigned
			
 
				 _starpu_topology_get_nhwcpu (struct _starpu_machine_config *config)
			
 
				 {
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 	_starpu_opencl_init();
			
 
				 #endif
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 	_starpu_init_cuda();
			
 
				 #endif
			
 
				 	_starpu_init_topology(config);
			
@@ -452,20 +447,19 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 
				 	topology->ncombinedworkers = 0;
			
 
				 	topology->nsched_ctxs = 0;
			
 
				 
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 	_starpu_opencl_init();
			
 
				 #endif
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 	_starpu_init_cuda();
			
 
				 #endif
			
 
				 	_starpu_init_topology(config);
			
 
				 
			
 
				 	_starpu_initialize_workers_bindid(config);
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 	int ncuda = config->conf->ncuda;
			
 
				 
			
 
				-#ifndef STARPU_SIMGRID
			
 
				 	if (ncuda != 0)
			
 
				 	{
			
 
				 		/* The user did not disable CUDA. We need to initialize CUDA
			
@@ -489,7 +483,6 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				-#endif
			
 
				 
			
 
				 	/* Now we know how many CUDA devices will be used */
			
 
				 	topology->ncudagpus = ncuda;
			
@@ -521,10 +514,9 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 
				 	topology->nworkers += topology->ncudagpus;
			
 
				 #endif
			
 
				 
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 	int nopencl = config->conf->nopencl;
			
 
				 
			
 
				-#ifndef STARPU_SIMGRID
			
 
				 	if (nopencl != 0)
			
 
				 	{
			
 
				 		/* The user did not disable OPENCL. We need to initialize
			
@@ -561,7 +553,6 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				-#endif
			
 
				 
			
 
				 	topology->nopenclgpus = nopencl;
			
 
				 	STARPU_ASSERT(topology->nopenclgpus + topology->nworkers <= STARPU_NMAXWORKERS);
			
@@ -633,7 +624,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 
				 
			
 
				 /* we put the CPU section after the accelerator : in case there was an
			
 
				  * accelerator found, we devote one cpu */
			
 
				-#ifdef STARPU_USE_CPU
			
 
				+#if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
			
 
				 	int ncpu = config->conf->ncpus;
			
 
				 
			
 
				 	if (ncpu != 0)
			
@@ -847,14 +838,16 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config)
 
				 				_starpu_memory_node_worker_add(ram_memory_node);
			
 
				 				break;
			
 
				 #endif
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 			case STARPU_CUDA_WORKER:
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 				if (may_bind_automatically)
			
 
				 				{
			
 
				 					/* StarPU is allowed to bind threads automatically */
			
 
				 					preferred_binding = _starpu_get_cuda_affinity_vector(workerarg->devid);
			
 
				 					npreferred = config->topology.nhwcpus;
			
 
				 				}
			
 
				+#endif
			
 
				 				is_a_set_of_accelerators = 0;
			
 
				 				memory_node = _starpu_register_memory_node(STARPU_CUDA_RAM, workerarg->devid);
			
 
				 #ifdef STARPU_SIMGRID
			
@@ -883,14 +876,16 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config)
 
				 				break;
			
 
				 #endif
			
 
				 
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 		        case STARPU_OPENCL_WORKER:
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 				if (may_bind_automatically)
			
 
				 				{
			
 
				 					/* StarPU is allowed to bind threads automatically */
			
 
				 					preferred_binding = _starpu_get_opencl_affinity_vector(workerarg->devid);
			
 
				 					npreferred = config->topology.nhwcpus;
			
 
				 				}
			
 
				+#endif
			
 
				 				is_a_set_of_accelerators = 0;
			
 
				 				memory_node = _starpu_register_memory_node(STARPU_OPENCL_RAM, workerarg->devid);
			
 
				 #ifdef STARPU_SIMGRID
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -124,17 +124,17 @@ uint32_t _starpu_worker_exists(struct starpu_task *task)
 
				 	if (!task->cl->can_execute)
			
 
				 		return 1;
			
 
				 
			
 
				-#ifdef STARPU_USE_CPU
			
 
				+#if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
			
 
				 	if ((task->cl->where & STARPU_CPU) &&
			
 
				 	    _starpu_worker_exists_and_can_execute(task, STARPU_CPU_WORKER))
			
 
				 		return 1;
			
 
				 #endif
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 	if ((task->cl->where & STARPU_CUDA) &&
			
 
				 	    _starpu_worker_exists_and_can_execute(task, STARPU_CUDA_WORKER))
			
 
				 		return 1;
			
 
				 #endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 	if ((task->cl->where & STARPU_OPENCL) &&
			
 
				 	    _starpu_worker_exists_and_can_execute(task, STARPU_OPENCL_WORKER))
			
 
				 		return 1;
			
@@ -165,19 +165,19 @@ static int _starpu_can_use_nth_implementation(enum starpu_archtype arch, struct
 
				 	{
			
 
				 		int cpu_func_enabled=1, cuda_func_enabled=1, opencl_func_enabled=1, gordon_func_enabled=1;
			
 
				 
			
 
				-#ifdef STARPU_USE_CPU
			
 
				+#if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
			
 
				 		starpu_cpu_func_t cpu_func = _starpu_task_get_cpu_nth_implementation(cl, nimpl);
			
 
				 		cpu_func_enabled = cpu_func != NULL && starpu_cpu_worker_get_count();
			
 
				 #endif
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 		starpu_cuda_func_t cuda_func = _starpu_task_get_cuda_nth_implementation(cl, nimpl);
			
 
				 		cuda_func_enabled = cuda_func != NULL && starpu_cuda_worker_get_count();
			
 
				 #endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 		starpu_opencl_func_t opencl_func = _starpu_task_get_opencl_nth_implementation(cl, nimpl);
			
 
				 		opencl_func_enabled = opencl_func != NULL && starpu_opencl_worker_get_count();
			
 
				 #endif
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				+#if defined(STARPU_USE_GORDON) || defined(STARPU_SIMGRID)
			
 
				 		starpu_gordon_func_t gordon_func = _starpu_task_get_gordon_nth_implementation(cl, nimpl);
			
 
				 		gordon_func_enabled = gordon_func != 0 && starpu_spu_worker_get_count();
			
 
				 #endif
			
@@ -433,7 +433,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 		driver.type = workerarg->arch;
			
 
				 		switch (workerarg->arch)
			
 
				 		{
			
 
				-#ifdef STARPU_USE_CPU
			
 
				+#if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
			
 
				 			case STARPU_CPU_WORKER:
			
 
				 				workerarg->set = NULL;
			
 
				 				driver.id.cpu_id = cpu;
			
@@ -460,7 +460,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 				cpu++;
			
 
				 				break;
			
 
				 #endif
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 			case STARPU_CUDA_WORKER:
			
 
				 				workerarg->set = NULL;
			
 
				 				driver.id.cuda_id = cuda;
			
@@ -487,14 +487,16 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 				cuda++;
			
 
				 				break;
			
 
				 #endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 			case STARPU_OPENCL_WORKER:
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
			
 
				 				if (!_starpu_may_launch_driver(config->conf, &driver))
			
 
				 				{
			
 
				 					workerarg->run_by_starpu = 0;
			
 
				 					break;
			
 
				 				}
			
 
				+#endif
			
 
				 				workerarg->set = NULL;
			
 
				 				_STARPU_PTHREAD_CREATE_ON(
			
 
				 					workerarg->name,
			
@@ -511,7 +513,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 #endif
			
 
				 				break;
			
 
				 #endif
			
 
				-#ifdef STARPU_USE_GORDON
			
 
				+#if defined(STARPU_USE_GORDON)
			
 
				 			case STARPU_GORDON_WORKER:
			
 
				 				/* we will only launch gordon once, but it will handle
			
 
				 				 * the different SPU workers */
			
@@ -567,6 +569,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 					cpu++;
			
 
				 					break;
			
 
				 				}
			
 
				+				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
			
 
				 				_STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
			
 
				 				while (!workerarg->worker_is_initialized)
			
 
				 					_STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
			
@@ -580,6 +583,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 					cuda++;
			
 
				 					break;
			
 
				 				}
			
 
				+				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
			
 
				 				_STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
			
 
				 				while (!workerarg->worker_is_initialized)
			
 
				 					_STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
			
@@ -591,6 +595,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
			
 
				 				if (!_starpu_may_launch_driver(config->conf, &driver))
			
 
				 					break;
			
 
				+				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
			
 
				 				_STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
			
 
				 				while (!workerarg->worker_is_initialized)
			
 
				 					_STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
			
@@ -608,6 +613,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	_STARPU_DEBUG("finished launching drivers\n");
			
 
				 }
			
 
				 
			
 
				 void _starpu_set_local_worker_key(struct _starpu_worker *worker)
			
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -36,13 +36,8 @@
 
				 #include <hwloc.h>
			
 
				 #endif
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				 #include <drivers/cuda/driver_cuda.h>
			
 
				-#endif
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				-#endif
			
 
				 
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				 #include <drivers/gordon/driver_gordon.h>
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -188,7 +188,7 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
 
				 	/* XXX That's a hack until we get cudaMemcpy3DPeerAsync to work !
			
 
				 	 * Perhaps not all data interface provide a direct GPU-GPU transfer
			
 
				 	 * method ! */
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 	if (src_node != dst_node && starpu_node_get_kind(src_node) == STARPU_CUDA_RAM && starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
			
 
				 	{
			
 
				 		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
			
--- a/src/datawizard/interfaces/coo_interface.c
+++ b/src/datawizard/interfaces/coo_interface.c
@@ -307,6 +307,13 @@ static struct starpu_data_copy_methods coo_copy_data_methods =
 
				 #ifdef NO_STRIDE
			
 
				 	.cuda_to_cuda_async  = copy_cuda_to_cuda_async,
			
 
				 #endif
			
 
				+#else
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+#ifdef NO_STRIDE
			
 
				+	/* Enable GPU-GPU transfers in simgrid */
			
 
				+	.cuda_to_cuda_async = 1,
			
 
				+#endif
			
 
				+#endif
			
 
				 #endif /* !STARPU_USE_CUDA */
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	.ram_to_opencl       = copy_ram_to_opencl,
			
--- a/src/datawizard/interfaces/csr_interface.c
+++ b/src/datawizard/interfaces/csr_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
@@ -53,6 +53,11 @@ static struct starpu_data_copy_methods csr_copy_data_methods_s =
 
				 	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				 	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				 	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
			
 
				+#else
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+	/* Enable GPU-GPU transfers in simgrid */
			
 
				+	.cuda_to_cuda_async = 1,
			
 
				+#endif
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	.ram_to_opencl = copy_ram_to_opencl,
			
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -59,6 +59,13 @@ static struct starpu_data_copy_methods matrix_copy_data_methods_s =
 
				 #ifdef NO_STRIDE
			
 
				 	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
			
 
				 #endif
			
 
				+#else
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+#ifdef NO_STRIDE
			
 
				+	/* Enable GPU-GPU transfers in simgrid */
			
 
				+	.cuda_to_cuda_async = 1,
			
 
				+#endif
			
 
				+#endif
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	.ram_to_opencl = copy_ram_to_opencl,
			
--- a/src/datawizard/interfaces/multiformat_interface.c
+++ b/src/datawizard/interfaces/multiformat_interface.c
@@ -53,6 +53,11 @@ static struct starpu_data_copy_methods multiformat_copy_data_methods_s =
 
				 	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				 	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				 	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
			
 
				+#else
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+	/* Enable GPU-GPU transfers in simgrid */
			
 
				+	.cuda_to_cuda_async = 1,
			
 
				+#endif
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	.ram_to_opencl = copy_ram_to_opencl,
			
--- a/src/datawizard/interfaces/variable_interface.c
+++ b/src/datawizard/interfaces/variable_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -53,6 +53,11 @@ static struct starpu_data_copy_methods variable_copy_data_methods_s =
 
				 	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				 	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				 	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
			
 
				+#else
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+	/* Enable GPU-GPU transfers in simgrid */
			
 
				+	.cuda_to_cuda_async = 1,
			
 
				+#endif
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	.ram_to_opencl = copy_ram_to_opencl,
			
--- a/src/datawizard/interfaces/vector_interface.c
+++ b/src/datawizard/interfaces/vector_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -53,6 +53,11 @@ static struct starpu_data_copy_methods vector_copy_data_methods_s =
 
				 	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				 	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				 	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
			
 
				+#else
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+	/* Enable GPU-GPU transfers in simgrid */
			
 
				+	.cuda_to_cuda_async = 1,
			
 
				+#endif
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	.ram_to_opencl = copy_ram_to_opencl,
			
--- a/src/datawizard/interfaces/void_interface.c
+++ b/src/datawizard/interfaces/void_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -44,6 +44,11 @@ static struct starpu_data_copy_methods void_copy_data_methods_s =
 
				 	.ram_to_cuda_async = dummy_cuda_copy_async,
			
 
				 	.cuda_to_ram_async = dummy_cuda_copy_async,
			
 
				 	.cuda_to_cuda_async = dummy_cuda_copy_async,
			
 
				+#else
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+	/* Enable GPU-GPU transfers in simgrid */
			
 
				+	.cuda_to_cuda_async = 1,
			
 
				+#endif
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	.ram_to_opencl = dummy_copy,
			
--- a/src/datawizard/memalloc.c
+++ b/src/datawizard/memalloc.c
@@ -769,10 +769,6 @@ starpu_allocate_buffer_on_node(uint32_t dst_node, size_t size)
 
				 {
			
 
				 	uintptr_t addr = 0;
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	cudaError_t status;
			
 
				-#endif
			
 
				-
			
 
				 #ifdef STARPU_DEVEL
			
 
				 #warning TODO: we need to use starpu_malloc
			
 
				 #endif
			
@@ -784,7 +780,7 @@ starpu_allocate_buffer_on_node(uint32_t dst_node, size_t size)
 
				 			_starpu_memory_manager_add_size(size);
			
 
				 			break;
			
 
				 		}
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 		case STARPU_CUDA_RAM:
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 #ifdef STARPU_DEVEL
			
@@ -794,11 +790,9 @@ starpu_allocate_buffer_on_node(uint32_t dst_node, size_t size)
 
				 			_STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
			
 
				 			MSG_process_sleep(0.000010);
			
 
				 			addr = 1;
			
 
				-			status = cudaSuccess;
			
 
				 			_STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
			
 
				 #else
			
 
				-			status = cudaMalloc((void **)&addr, size);
			
 
				-#endif
			
 
				+			cudaError_t status = cudaMalloc((void **)&addr, size);
			
 
				 			if (!addr || (status != cudaSuccess))
			
 
				 			{
			
 
				 				if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
			
@@ -806,28 +800,28 @@ starpu_allocate_buffer_on_node(uint32_t dst_node, size_t size)
 
				 
			
 
				 				addr = 0;
			
 
				 			}
			
 
				+#endif
			
 
				 			break;
			
 
				 #endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 	        case STARPU_OPENCL_RAM:
			
 
				 			{
			
 
				-                                int ret;
			
 
				-				cl_mem ptr;
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 				/* Sleep 10µs for the allocation */
			
 
				 				_STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
			
 
				 				MSG_process_sleep(0.000010);
			
 
				-				ptr = (cl_mem) 1;
			
 
				-				ret = CL_SUCCESS;
			
 
				+				addr = 1;
			
 
				 				_STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
			
 
				 #else
			
 
				+                                int ret;
			
 
				+				cl_mem ptr;
			
 
				 				ret = starpu_opencl_allocate_memory(&ptr, size, CL_MEM_READ_WRITE);
			
 
				-#endif
			
 
				 				if (ret)
			
 
				 					addr = 0;
			
 
				 				else
			
 
				 					addr = (uintptr_t)ptr;
			
 
				 				break;
			
 
				+#endif
			
 
				 			}
			
 
				 #endif
			
 
				 		default:
			
@@ -850,39 +844,37 @@ starpu_free_buffer_on_node(uint32_t dst_node, uintptr_t addr, size_t size)
 
				 			free((void*)addr);
			
 
				 			_starpu_memory_manager_sub_size(size);
			
 
				 			break;
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 		case STARPU_CUDA_RAM:
			
 
				 		{
			
 
				-			cudaError_t err;
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 			_STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
			
 
				 			/* Sleep 10µs for the free */
			
 
				 			MSG_process_sleep(0.000010);
			
 
				-			err = cudaSuccess;
			
 
				 			_STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
			
 
				 #else
			
 
				+			cudaError_t err;
			
 
				 			err = cudaFree((void*)addr);
			
 
				-#endif
			
 
				 			if (STARPU_UNLIKELY(err != cudaSuccess))
			
 
				 				STARPU_CUDA_REPORT_ERROR(err);
			
 
				+#endif
			
 
				 			break;
			
 
				 		}
			
 
				 #endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				                 case STARPU_OPENCL_RAM:
			
 
				 		{
			
 
				-			cl_int err;
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 			_STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
			
 
				 			/* Sleep 10µs for the free */
			
 
				 			MSG_process_sleep(0.000010);
			
 
				-			err = CL_SUCCESS;
			
 
				 			_STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
			
 
				 #else
			
 
				+			cl_int err;
			
 
				                         err = clReleaseMemObject((void*)addr);
			
 
				-#endif
			
 
				 			if (STARPU_UNLIKELY(err != CL_SUCCESS))
			
 
				 				STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+#endif
			
 
				                         break;
			
 
				 		}
			
 
				 #endif
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -44,8 +44,18 @@
 
				 #include <core/simgrid.h>
			
 
				 #endif
			
 
				 
			
 
				-#ifndef STARPU_SIMGRID
			
 
				-#ifdef STARPU_HAVE_HWLOC
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+void
			
 
				+_starpu_cpu_discover_devices(struct _starpu_machine_config *config)
			
 
				+{
			
 
				+#if 0
			
 
				+	config->topology.nhwcpus = _starpu_simgrid_get_nbhosts("CPU");
			
 
				+#else
			
 
				+	/* For now, lie about the number of CPUs we actually have */
			
 
				+	config->topology.nhwcpus = STARPU_NMAXWORKERS;
			
 
				+#endif
			
 
				+}
			
 
				+#elif defined(STARPU_HAVE_HWLOC)
			
 
				 void
			
 
				 _starpu_cpu_discover_devices(struct _starpu_machine_config *config)
			
 
				 {
			
@@ -99,7 +109,6 @@ _starpu_cpu_discover_devices(struct _starpu_machine_config *config)
 
				 	config->topology.nhwcpus = 1;
			
 
				 }
			
 
				 #endif
			
 
				-#endif
			
 
				 
			
 
				 
			
 
				 /* Actually launch the job on a cpu worker.
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -37,18 +37,22 @@
 
				 /* the number of CUDA devices */
			
 
				 static int ncudagpus;
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				 static cudaStream_t streams[STARPU_NMAXWORKERS];
			
 
				 static cudaStream_t out_transfer_streams[STARPU_NMAXWORKERS];
			
 
				 static cudaStream_t in_transfer_streams[STARPU_NMAXWORKERS];
			
 
				 static cudaStream_t peer_transfer_streams[STARPU_NMAXWORKERS];
			
 
				 static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
			
 
				+#endif
			
 
				 
			
 
				-#ifndef STARPU_SIMGRID
			
 
				 void
			
 
				 _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
			
 
				 {
			
 
				 	/* Discover the number of CUDA devices. Fill the result in CONFIG. */
			
 
				 
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+	config->topology.nhwcudagpus = _starpu_simgrid_get_nbhosts("CUDA");
			
 
				+#else
			
 
				 	int cnt;
			
 
				 	cudaError_t cures;
			
 
				 
			
@@ -56,8 +60,11 @@ _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
 
				 	if (STARPU_UNLIKELY(cures != cudaSuccess))
			
 
				 		cnt = 0;
			
 
				 	config->topology.nhwcudagpus = cnt;
			
 
				+#endif
			
 
				 }
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 /* In case we want to cap the amount of memory available on the GPUs by the
			
 
				  * mean of the STARPU_LIMIT_GPU_MEM, we allocate a big buffer when the driver
			
 
				  * is launched. */
			
@@ -283,11 +290,16 @@ static void deinit_context(int workerid, unsigned devid)
 
				 }
			
 
				 #endif /* !SIMGRID */
			
 
				 
			
 
				+#endif /* STARPU_USE_CUDA */
			
 
				+
			
 
				 /* Return the number of devices usable in the system.
			
 
				  * The value returned cannot be greater than MAXCUDADEVS */
			
 
				 
			
 
				 unsigned _starpu_get_cuda_device_count(void)
			
 
				 {
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+	return _starpu_simgrid_get_nbhosts("CUDA");
			
 
				+#else
			
 
				 	int cnt;
			
 
				 
			
 
				 	cudaError_t cures;
			
@@ -301,6 +313,7 @@ unsigned _starpu_get_cuda_device_count(void)
 
				 		cnt = STARPU_MAXCUDADEVS;
			
 
				 	}
			
 
				 	return (unsigned)cnt;
			
 
				+#endif
			
 
				 }
			
 
				 
			
 
				 void _starpu_init_cuda(void)
			
@@ -527,6 +540,7 @@ void *_starpu_cuda_worker(void *arg)
 
				 	return NULL;
			
 
				 }
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				 void starpu_cublas_report_error(const char *func, const char *file, int line, cublasStatus status)
			
 
				 {
			
 
				 	char *errormsg;
			
@@ -658,3 +672,5 @@ int _starpu_run_cuda(struct starpu_driver *d)
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				+
			
 
				+#endif /* STARPU_USE_CUDA */
			
--- a/src/drivers/cuda/driver_cuda.h
+++ b/src/drivers/cuda/driver_cuda.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010, 2012-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -40,10 +40,14 @@
 
				 
			
 
				 unsigned _starpu_get_cuda_device_count(void);
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 void _starpu_cuda_discover_devices (struct _starpu_machine_config *);
			
 
				 void _starpu_init_cuda(void);
			
 
				 void *_starpu_cuda_worker(void *);
			
 
				+#else
			
 
				+#  define _starpu_cuda_discover_devices(config) ((void) config)
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				 cudaStream_t starpu_cuda_get_local_in_transfer_stream(void);
			
 
				 cudaStream_t starpu_cuda_get_local_out_transfer_stream(void);
			
 
				 cudaStream_t starpu_cuda_get_local_peer_transfer_stream(void);
			
@@ -52,8 +56,6 @@ int _starpu_run_cuda(struct starpu_driver *);
 
				 int _starpu_cuda_driver_init(struct starpu_driver *);
			
 
				 int _starpu_cuda_driver_run_once(struct starpu_driver *);
			
 
				 int _starpu_cuda_driver_deinit(struct starpu_driver *);
			
 
				-#else
			
 
				-#  define _starpu_cuda_discover_devices(config) ((void) config)
			
 
				 #endif
			
 
				 
			
 
				 #endif //  __DRIVER_CUDA_H__
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
@@ -33,17 +33,19 @@
 
				 #include <core/simgrid.h>
			
 
				 #endif
			
 
				 
			
 
				+static int nb_devices = -1;
			
 
				+static int init_done = 0;
			
 
				+
			
 
				 static _starpu_pthread_mutex_t big_lock = _STARPU_PTHREAD_MUTEX_INITIALIZER;
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				 static cl_context contexts[STARPU_MAXOPENCLDEVS];
			
 
				 static cl_device_id devices[STARPU_MAXOPENCLDEVS];
			
 
				 static cl_command_queue queues[STARPU_MAXOPENCLDEVS];
			
 
				 static cl_command_queue transfer_queues[STARPU_MAXOPENCLDEVS];
			
 
				 static cl_command_queue alloc_queues[STARPU_MAXOPENCLDEVS];
			
 
				-static cl_uint nb_devices = -1;
			
 
				-static int init_done = 0;
			
 
				+#endif
			
 
				 
			
 
				-#ifndef STARPU_SIMGRID
			
 
				 void
			
 
				 _starpu_opencl_discover_devices(struct _starpu_machine_config *config)
			
 
				 {
			
@@ -54,6 +56,8 @@ _starpu_opencl_discover_devices(struct _starpu_machine_config *config)
 
				 	config->topology.nhwopenclgpus = nb_devices;
			
 
				 }
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 /* In case we want to cap the amount of memory available on the GPUs by the
			
 
				  * mean of the STARPU_LIMIT_GPU_MEM, we allocate a big buffer when the driver
			
 
				  * is launched. */
			
@@ -355,12 +359,16 @@ cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node STARP
 
				         return CL_SUCCESS;
			
 
				 }
			
 
				 #endif
			
 
				+#endif /* STARPU_USE_OPENCL */
			
 
				 
			
 
				 void _starpu_opencl_init(void)
			
 
				 {
			
 
				 	_STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
			
 
				         if (!init_done)
			
 
				 	{
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+		nb_devices = _starpu_simgrid_get_nbhosts("OpenCL");
			
 
				+#else /* STARPU_USE_OPENCL */
			
 
				                 cl_platform_id platform_id[_STARPU_OPENCL_PLATFORM_MAX];
			
 
				                 cl_uint nb_platforms;
			
 
				                 cl_int err;
			
@@ -447,6 +455,7 @@ void _starpu_opencl_init(void)
 
				                         transfer_queues[i] = NULL;
			
 
				                         alloc_queues[i] = NULL;
			
 
				                 }
			
 
				+#endif /* STARPU_USE_OPENCL */
			
 
				 
			
 
				                 init_done=1;
			
 
				         }
			
@@ -461,6 +470,7 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 
				 static struct _starpu_worker*
			
 
				 _starpu_opencl_get_worker_from_driver(struct starpu_driver *d)
			
 
				 {
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				 	int nworkers;
			
 
				 	int workers[STARPU_MAXOPENCLDEVS];
			
 
				 	nworkers = starpu_worker_get_ids_by_type(STARPU_OPENCL_WORKER, workers, STARPU_MAXOPENCLDEVS);
			
@@ -481,6 +491,22 @@ _starpu_opencl_get_worker_from_driver(struct starpu_driver *d)
 
				 		return NULL;
			
 
				 
			
 
				 	return _starpu_get_worker_struct(workers[i]);
			
 
				+#else
			
 
				+	unsigned nworkers = starpu_worker_get_count();
			
 
				+	unsigned  workerid;
			
 
				+	for (workerid = 0; workerid < nworkers; workerid++)
			
 
				+	{
			
 
				+		if (starpu_worker_get_type(workerid) == d->type)
			
 
				+		{
			
 
				+			struct _starpu_worker *worker;
			
 
				+			worker = _starpu_get_worker_struct(workerid);
			
 
				+			if (worker->devid == d->id.opencl_id)
			
 
				+				return worker;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	return NULL;
			
 
				+#endif
			
 
				 }
			
 
				 
			
 
				 int _starpu_opencl_driver_init(struct starpu_driver *d)
			
@@ -603,8 +629,9 @@ int _starpu_opencl_driver_deinit(struct starpu_driver *d)
 
				 
			
 
				 void *_starpu_opencl_worker(void *arg)
			
 
				 {
			
 
				-	cl_device_id id;
			
 
				 	struct _starpu_worker* args = arg;
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	cl_device_id id;
			
 
				 
			
 
				 	starpu_opencl_get_device(args->devid, &id);
			
 
				 	struct starpu_driver d =
			
@@ -612,6 +639,13 @@ void *_starpu_opencl_worker(void *arg)
 
				 			.type         = STARPU_OPENCL_WORKER,
			
 
				 			.id.opencl_id = id
			
 
				 		};
			
 
				+#else
			
 
				+	struct starpu_driver d =
			
 
				+		{
			
 
				+			.type         = STARPU_OPENCL_WORKER,
			
 
				+			.id.opencl_id = args->devid
			
 
				+		};
			
 
				+#endif
			
 
				 
			
 
				 	_starpu_opencl_driver_init(&d);
			
 
				 	while (_starpu_machine_is_running())
			
@@ -621,6 +655,7 @@ void *_starpu_opencl_worker(void *arg)
 
				 	return NULL;
			
 
				 }
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname)
			
 
				 {
			
@@ -639,16 +674,22 @@ static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname)
 
				 	return EXIT_SUCCESS;
			
 
				 }
			
 
				 #endif
			
 
				+#endif
			
 
				 
			
 
				 unsigned _starpu_opencl_get_device_count(void)
			
 
				 {
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				         if (!init_done)
			
 
				 	{
			
 
				                 _starpu_opencl_init();
			
 
				         }
			
 
				 	return nb_devices;
			
 
				+#else
			
 
				+	return _starpu_simgrid_get_nbhosts("OpenCL");
			
 
				+#endif
			
 
				 }
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				 cl_device_type _starpu_opencl_get_device_type(int devid)
			
 
				 {
			
 
				 	int err;
			
@@ -663,6 +704,7 @@ cl_device_type _starpu_opencl_get_device_type(int devid)
 
				 
			
 
				 	return type;
			
 
				 }
			
 
				+#endif /* STARPU_USE_OPENCL */
			
 
				 
			
 
				 static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_worker *args)
			
 
				 {
			
@@ -723,6 +765,7 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 
				 	return EXIT_SUCCESS;
			
 
				 }
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				 int _starpu_run_opencl(struct starpu_driver *d)
			
 
				 {
			
 
				 	STARPU_ASSERT(d && d->type == STARPU_OPENCL_WORKER);
			
@@ -761,3 +804,4 @@ int _starpu_run_opencl(struct starpu_driver *d)
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				+#endif /* STARPU_USE_OPENCL */
			
--- a/src/drivers/opencl/driver_opencl.h
+++ b/src/drivers/opencl/driver_opencl.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -29,10 +29,17 @@
 
				 #else
			
 
				 #include <CL/cl.h>
			
 
				 #endif
			
 
				+#endif
			
 
				 
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 struct _starpu_machine_config;
			
 
				 void
			
 
				 _starpu_opencl_discover_devices(struct _starpu_machine_config *config);
			
 
				+#else
			
 
				+#define _starpu_opencl_discover_devices(config) ((void) (config))
			
 
				+#endif
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				 extern
			
 
				 char *_starpu_opencl_program_dir;
			
 
				 
			
@@ -41,10 +48,14 @@ int _starpu_opencl_init_context(int devid);
 
				 
			
 
				 extern
			
 
				 int _starpu_opencl_deinit_context(int devid);
			
 
				+#endif
			
 
				 
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 extern
			
 
				 unsigned _starpu_opencl_get_device_count(void);
			
 
				+#endif
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				 extern
			
 
				 cl_device_type _starpu_opencl_get_device_type(int devid);
			
 
				 
			
@@ -59,13 +70,17 @@ cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node, cl_m
 
				                                               const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
			
 
				                                               size_t host_row_pitch, size_t host_slice_pitch, cl_event *event);
			
 
				 #endif
			
 
				+#endif
			
 
				 
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 extern
			
 
				 void _starpu_opencl_init(void);
			
 
				 
			
 
				 extern
			
 
				 void *_starpu_opencl_worker(void *);
			
 
				+#endif
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				 extern
			
 
				 int _starpu_run_opencl(struct starpu_driver *);
			
 
				 
			
@@ -77,7 +92,5 @@ int _starpu_opencl_driver_run_once(struct starpu_driver *);
 
				 
			
 
				 extern
			
 
				 int _starpu_opencl_driver_deinit(struct starpu_driver *);
			
 
				-#else
			
 
				-#define _starpu_opencl_discover_devices(config) ((void) (config))
			
 
				 #endif // STARPU_USE_OPENCL
			
 
				 #endif //  __DRIVER_OPENCL_H__