Browse source

rename HAVE_CUDA_MEMCPY_PEER to STARPU_HAVE_CUDA_MEMCPY_PEER and make it publicly available

Nathalie Furmento, 7 years ago
Commit
65ee6fe4c3

+ 1 - 1
configure.ac

@@ -1384,7 +1384,7 @@ if test x$enable_cuda_memcpy_peer = xyes -a x$enable_cuda = xyes ; then
     LDFLAGS="${SAVED_LDFLAGS}"
 fi
 if test x$have_cuda_memcpy_peer = xyes; then
-    AC_DEFINE(HAVE_CUDA_MEMCPY_PEER,[1],[Peer transfers are supported in CUDA])
+    AC_DEFINE(STARPU_HAVE_CUDA_MEMCPY_PEER,[1],[Peer transfers are supported in CUDA])
 fi
 
 if test x$enable_cuda = xyes; then

+ 1 - 0
include/starpu_config.h.in

@@ -111,6 +111,7 @@
 #undef STARPU_SC_HYPERVISOR_DEBUG
 #undef STARPU_HAVE_GLPK_H
 
+#undef STARPU_HAVE_CUDA_MEMCPY_PEER
 #undef STARPU_HAVE_LIBNUMA
 
 #undef STARPU_HAVE_WINDOWS

+ 11 - 11
src/core/perfmodel/perfmodel_bus.c

@@ -97,7 +97,7 @@ static uint64_t cuda_size[STARPU_MAXCUDADEVS];
 static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][STARPU_MAXNUMANODES];
 
 #ifndef STARPU_SIMGRID
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 static double cudadev_timing_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
 static double cudadev_latency_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
 #endif
@@ -284,7 +284,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 	cudaThreadExit();
 }
 
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 {
 	size_t size = SIZE;
@@ -786,7 +786,7 @@ static void benchmark_all_gpu_devices(void)
 		/* measure bandwidth between Host and Device i */
 		measure_bandwidth_between_host_and_dev(i, cudadev_timing_per_numa, "CUDA");
 	}
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	for (i = 0; i < ncuda; i++)
 	{
 		for (j = 0; j < ncuda; j++)
@@ -1378,7 +1378,7 @@ static void write_bus_latency_file_content(void)
 				/* ---- End NUMA ---- */
 #ifdef STARPU_USE_CUDA
 				b_up += ncuda;
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 				if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
 					latency += cudadev_latency_dtod[src-b_low][dst-b_low];
 				else
@@ -1709,7 +1709,7 @@ static void write_bus_bandwidth_file_content(void)
 				/* End NUMA */
 #ifdef STARPU_USE_CUDA
 				b_up += ncuda;
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 				if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
 					/* Direct GPU-GPU transfert */
 					slowness += cudadev_timing_dtod[src-b_low][dst-b_low];
@@ -2180,7 +2180,7 @@ void _starpu_simgrid_get_platform_path(int version, char *path, size_t maxlen)
  * - then through all CUDA-CUDA possible transfers again to emit routes.
  */
 
-#if defined(STARPU_USE_CUDA) && defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX && defined(HAVE_CUDA_MEMCPY_PEER)
+#if defined(STARPU_USE_CUDA) && defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX && defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 
 /* Records, for each PCI link and hub, the maximum bandwidth seen through it */
 struct pci_userdata
@@ -2652,7 +2652,7 @@ static void write_bus_platform_file_content(int version)
 	{
 		fprintf(f, "   <host id=\"CUDA%u\" %s=\"2000000000%s\">\n", i, speed, flops);
 		fprintf(f, "     <prop id=\"memsize\" value=\"%llu\"/>\n", (unsigned long long) cuda_size[i]);
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		fprintf(f, "     <prop id=\"memcpy_peer\" value=\"1\"/>\n");
 #endif
 		/* TODO: record cudadev_direct instead of assuming it's NUMA nodes */
@@ -2747,7 +2747,7 @@ static void write_bus_platform_file_content(int version)
 				search_bus_best_latency(i, "CUDA", 0)/1000000., s);
 	}
 	fprintf(f, "\n");
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	/* Write CUDA/CUDA bandwidths and latencies */
 	for (i = 0; i < ncuda; i++)
 	{
@@ -2768,7 +2768,7 @@ static void write_bus_platform_file_content(int version)
 	}
 #endif
 
-#if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX && defined(HAVE_CUDA_MEMCPY_PEER)
+#if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX && defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 	/* If we have enough hwloc information, write PCI bandwidths and routes */
 	if (!starpu_get_env_number_default("STARPU_PCI_FLAT", 0))
 	{
@@ -2840,7 +2840,7 @@ flat_cuda:
 			fprintf(f, "   <route src=\"RAM\" dst=\"%s\" symmetrical=\"NO\"><link_ctn id=\"RAM-%s\"/><link_ctn id=\"Host\"/></route>\n", i_name, i_name);
 			fprintf(f, "   <route src=\"%s\" dst=\"RAM\" symmetrical=\"NO\"><link_ctn id=\"%s-RAM\"/><link_ctn id=\"Host\"/></route>\n", i_name, i_name);
 		}
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		for (i = 0; i < ncuda; i++)
 		{
                         unsigned j;
@@ -2856,7 +2856,7 @@ flat_cuda:
 			}
 		}
 #endif
-	} /* defined(STARPU_HAVE_HWLOC) && defined(HAVE_CUDA_MEMCPY_PEER) */
+	} /* defined(STARPU_HAVE_HWLOC) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER) */
 	fprintf(f, "\n");
 #endif /* STARPU_USE_CUDA */
 

+ 1 - 1
src/core/topology.c

@@ -2328,7 +2328,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 					if (
 #ifdef STARPU_SIMGRID
 						cuda_memcpy_peer && atoll(cuda_memcpy_peer)
-#elif defined(HAVE_CUDA_MEMCPY_PEER)
+#elif defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 						1
 #else /* MEMCPY_PEER */
 						0

+ 4 - 4
src/datawizard/coherency.c

@@ -243,14 +243,14 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 			}
 			else
 				return 0;
-#elif defined(HAVE_CUDA_MEMCPY_PEER)
+#elif defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 			/* simgrid */
 			enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
 			return kind == STARPU_CUDA_RAM;
-#else /* HAVE_CUDA_MEMCPY_PEER */
+#else /* STARPU_HAVE_CUDA_MEMCPY_PEER */
 			/* Direct GPU-GPU transfers are not allowed in general */
 			return 0;
-#endif /* HAVE_CUDA_MEMCPY_PEER */
+#endif /* STARPU_HAVE_CUDA_MEMCPY_PEER */
 		}
 		case STARPU_OPENCL_RAM:
 			return 0;
@@ -425,7 +425,7 @@ static int determine_request_path(starpu_data_handle_t handle,
 		dst_nodes[0] = dst_node;
 		handling_nodes[0] = handling_node;
 
-#if !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 		STARPU_ASSERT(!(mode & STARPU_R) || starpu_node_get_kind(src_node) != STARPU_CUDA_RAM || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM);
 #endif
 

+ 4 - 4
src/datawizard/copy_driver.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2017  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2013, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2013, 2016, 2017  CNRS
  * Copyright (C) 2016-2017  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -177,7 +177,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 	void *src_interface = src_replicate->data_interface;
 	void *dst_interface = dst_replicate->data_interface;
 
-#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
 	{
 		unsigned devid;
@@ -207,7 +207,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 #ifdef STARPU_USE_CUDA
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
 		/* only the proper CUBLAS thread can initiate this directly ! */
-#if !defined(HAVE_CUDA_MEMCPY_PEER)
+#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
 #endif
 		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
@@ -242,7 +242,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
 		/* STARPU_CPU_RAM -> CUBLAS_RAM */
 		/* only the proper CUBLAS thread can initiate this ! */
-#if !defined(HAVE_CUDA_MEMCPY_PEER)
+#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
 #endif
 		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||

+ 1 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -444,7 +444,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 #ifndef BUGGED_MEMCPY3D
 static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, int is_async, cudaStream_t stream)
 {
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	struct starpu_matrix_interface *src_matrix = src_interface;
 	struct starpu_matrix_interface *dst_matrix = dst_interface;
 

+ 3 - 3
src/datawizard/interfaces/multiformat_interface.c

@@ -505,7 +505,7 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_
 	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 }
 
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
 				void *dst_interface, unsigned dst_node,
 				cudaStream_t stream)
@@ -559,7 +559,7 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRI
 	}
 	else
 	{
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		return copy_cuda_peer_common(src_interface, src_node,
 					     dst_interface, dst_node,
 					     NULL);
@@ -581,7 +581,7 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
 	}
 	else
 	{
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		return copy_cuda_peer_common(src_interface, src_node,
 					     dst_interface, dst_node,
 					     stream);

+ 17 - 17
src/datawizard/malloc.c

@@ -64,7 +64,7 @@ void starpu_malloc_set_align(size_t align)
 		_malloc_align = align;
 }
 
-#if (defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER))// || defined(STARPU_USE_OPENCL)
+#if (defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER))// || defined(STARPU_USE_OPENCL)
 struct malloc_pinned_codelet_struct
 {
 	void **ptr;
@@ -83,7 +83,7 @@ struct malloc_pinned_codelet_struct
 //}
 //#endif
 
-#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 static void malloc_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
 {
 	struct malloc_pinned_codelet_struct *s = arg;
@@ -95,7 +95,7 @@ static void malloc_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED,
 }
 #endif
 
-#if (defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER)) && !defined(STARPU_SIMGRID)// || defined(STARPU_USE_OPENCL)
+#if (defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)) && !defined(STARPU_SIMGRID)// || defined(STARPU_USE_OPENCL)
 static struct starpu_perfmodel malloc_pinned_model =
 {
 	.type = STARPU_HISTORY_BASED,
@@ -162,7 +162,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 				MSG_process_sleep((float) dim * 0.000650 / 1048576.);
 #else /* STARPU_SIMGRID */
 #ifdef STARPU_USE_CUDA
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 			cudaError_t cures;
 			cures = cudaHostAlloc(A, dim, cudaHostAllocPortable);
 			if (STARPU_UNLIKELY(cures))
@@ -198,7 +198,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 			push_res = _starpu_task_submit_internally(task);
 			STARPU_ASSERT(push_res != -ENODEV);
 			goto end;
-#endif /* HAVE_CUDA_MEMCPY_PEER */
+#endif /* STARPU_HAVE_CUDA_MEMCPY_PEER */
 #endif /* STARPU_USE_CUDA */
 //		}
 //		else if (_starpu_can_submit_opencl_task())
@@ -368,7 +368,7 @@ int starpu_malloc(void **A, size_t dim)
 	return starpu_malloc_flags(A, dim, STARPU_MALLOC_PINNED);
 }
 
-#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 static void free_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
 {
 	cudaError_t cures;
@@ -387,7 +387,7 @@ static void free_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, vo
 //}
 //#endif
 
-#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID) // || defined(STARPU_USE_OPENCL)
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID) // || defined(STARPU_USE_OPENCL)
 static struct starpu_perfmodel free_pinned_model =
 {
 	.type = STARPU_HISTORY_BASED,
@@ -420,7 +420,7 @@ int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags
 			/* TODO: simulate CUDA barrier */
 #else /* !STARPU_SIMGRID */
 #ifdef STARPU_USE_CUDA
-#ifndef HAVE_CUDA_MEMCPY_PEER
+#ifndef STARPU_HAVE_CUDA_MEMCPY_PEER
 			if (!starpu_is_initialized())
 			{
 #endif
@@ -431,7 +431,7 @@ int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags
 				if (STARPU_UNLIKELY(err))
 					STARPU_CUDA_REPORT_ERROR(err);
 				goto out;
-#ifndef HAVE_CUDA_MEMCPY_PEER
+#ifndef STARPU_HAVE_CUDA_MEMCPY_PEER
 			}
 			else
 			{
@@ -453,7 +453,7 @@ int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags
 				STARPU_ASSERT(push_res != -ENODEV);
 				goto out;
 			}
-#endif /* HAVE_CUDA_MEMCPY_PEER */
+#endif /* STARPU_HAVE_CUDA_MEMCPY_PEER */
 #endif /* STARPU_USE_CUDA */
 #endif /* STARPU_SIMGRID */
 		}
@@ -553,8 +553,8 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 	{
 		case STARPU_CPU_RAM:
 		{
-			_starpu_malloc_flags_on_node(dst_node, (void**) &addr, size,			
-#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+			_starpu_malloc_flags_on_node(dst_node, (void**) &addr, size,
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 					/* without memcpy_peer, we can not
 					 * allocated pinned memory, since it
 					 * requires waiting for a task, and we
@@ -587,7 +587,7 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 #else
 			unsigned devid = _starpu_memory_node_get_devid(dst_node);
-#if defined(HAVE_CUDA_MEMCPY_PEER)
+#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 			starpu_cuda_set_device(devid);
 #else
 			struct _starpu_worker *worker = _starpu_get_local_worker_key();
@@ -684,7 +684,7 @@ _starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int f
 	{
 		case STARPU_CPU_RAM:
 			_starpu_free_flags_on_node(dst_node, (void*)addr, size,
-#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 					flags & ~STARPU_MALLOC_PINNED
 #else
 					flags
@@ -705,7 +705,7 @@ _starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int f
 #else
 			cudaError_t err;
 			unsigned devid = _starpu_memory_node_get_devid(dst_node);
-#if defined(HAVE_CUDA_MEMCPY_PEER)
+#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 			starpu_cuda_set_device(devid);
 #else
 			struct _starpu_worker *worker = _starpu_get_local_worker_key();
@@ -781,7 +781,7 @@ starpu_memory_pin(void *addr STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBU
 {
 	if (STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
 	{
-#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 		if (cudaHostRegister(addr, size, cudaHostRegisterPortable) != cudaSuccess)
 			return -1;
 #endif
@@ -794,7 +794,7 @@ starpu_memory_unpin(void *addr STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRI
 {
 	if (STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
 	{
-#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 		if (cudaHostUnregister(addr) != cudaSuccess)
 			return -1;
 #endif

+ 2 - 2
src/datawizard/memalloc.c

@@ -366,7 +366,7 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 		if (handle)
 			STARPU_ASSERT(replicate->allocated);
 
-#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 		if (starpu_node_get_kind(node) == STARPU_CUDA_RAM)
 		{
 			/* To facilitate the design of interface, we set the
@@ -1354,7 +1354,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	{
 		_STARPU_TRACE_START_ALLOC(dst_node, data_size);
 
-#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 		if (starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
 		{
 			/* To facilitate the design of interface, we set the

+ 2 - 2
src/datawizard/reduction.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014, 2016  Université de Bordeaux
- * Copyright (C) 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2011, 2012, 2013, 2016, 2017  CNRS
  * Copyright (C) 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -67,7 +67,7 @@ void _starpu_redux_init_data_replicate(starpu_data_handle_t handle, struct _star
 			break;
 		case STARPU_CUDA_WORKER:
 			init_func = _starpu_task_get_cuda_nth_implementation(init_cl, 0);
-#if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 			/* We make sure we do manipulate the proper device */
 			starpu_cuda_set_device(starpu_worker_get_devid(workerid));
 #endif

+ 10 - 10
src/drivers/cuda/driver_cuda.c

@@ -233,11 +233,11 @@ void starpu_cuda_set_device(unsigned devid STARPU_ATTRIBUTE_UNUSED)
 #else
 	cudaError_t cures;
 	struct starpu_conf *conf = &_starpu_get_machine_config()->conf;
-#if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
+#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
 	unsigned i;
 #endif
 
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	if (conf->n_cuda_opengl_interoperability)
 	{
 		_STARPU_MSG("OpenGL interoperability was requested, but StarPU was built with multithread GPU control support, please reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
@@ -262,7 +262,7 @@ void starpu_cuda_set_device(unsigned devid STARPU_ATTRIBUTE_UNUSED)
 
 	cures = cudaSetDevice(devid);
 
-#if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
+#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
 done:
 #endif
 	if (STARPU_UNLIKELY(cures
@@ -306,7 +306,7 @@ static void init_device_context(unsigned devid, unsigned memnode)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);
 
 #ifndef STARPU_SIMGRID
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
 	{
 		int nworkers = starpu_worker_get_count();
@@ -348,7 +348,7 @@ static void init_device_context(unsigned devid, unsigned memnode)
 	cures = cudaGetDeviceProperties(&props[devid], devid);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	if (props[devid].computeMode == cudaComputeModeExclusive)
 	{
 		_STARPU_MSG("CUDA is in EXCLUSIVE-THREAD mode, but StarPU was built with multithread GPU control support, please either ask your administrator to use EXCLUSIVE-PROCESS mode (which should really be fine), or reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
@@ -494,7 +494,7 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 		_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, profiling);
 	}
 
-#if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 	/* We make sure we do manipulate the proper device */
 	starpu_cuda_set_device(worker->devid);
 #endif
@@ -1074,7 +1074,7 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 			    size_t ssize, cudaStream_t stream,
 			    enum cudaMemcpyKind kind)
 {
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	int peer_copy = 0;
 	int src_dev = -1, dst_dev = -1;
 #endif
@@ -1082,7 +1082,7 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 
 	if (kind == cudaMemcpyDeviceToDevice && src_node != dst_node)
 	{
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		peer_copy = 1;
 		src_dev = _starpu_memory_node_get_devid(src_node);
 		dst_dev = _starpu_memory_node_get_devid(dst_node);
@@ -1094,7 +1094,7 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 	if (stream)
 	{
 		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		if (peer_copy)
 		{
 			cures = cudaMemcpyPeerAsync((char *) dst_ptr, dst_dev,
@@ -1113,7 +1113,7 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 	if (stream == NULL || cures)
 	{
 		/* do it in a synchronous fashion */
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		if (peer_copy)
 		{
 			cures = cudaMemcpyPeer((char *) dst_ptr, dst_dev,

+ 2 - 2
tests/datawizard/gpu_ptr_register.c

@@ -85,7 +85,7 @@ check_result(unsigned *t, size_t size)
 }
 
 #ifdef STARPU_USE_CUDA
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 static int
 test_cuda(void)
 {
@@ -265,7 +265,7 @@ int main(int argc, char **argv)
 #endif
 
 #ifdef STARPU_USE_CUDA
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	ret = test_cuda();
 	if (ret == 1)
 		goto fail;

+ 2 - 2
tests/datawizard/gpu_register.c

@@ -84,7 +84,7 @@ check_result(unsigned *t, size_t size)
 }
 
 #ifdef STARPU_USE_CUDA
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 static int
 test_cuda(void)
 {
@@ -287,7 +287,7 @@ int main(int argc, char **argv)
 #endif
 
 #ifdef STARPU_USE_CUDA
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	ret = test_cuda();
 	if (ret == 1)
 		goto fail;

+ 4 - 4
tests/datawizard/interfaces/test_interfaces.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011  INRIA
- * Copyright (C) 2013, 2014  CNRS
+ * Copyright (C) 2013, 2014, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -457,7 +457,7 @@ ram_to_cuda(void)
 	return current_config->copy_failed;
 }
 
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 static enum exit_code
 cuda_to_cuda(void)
 {
@@ -585,7 +585,7 @@ run_cuda(int async)
 	if (err != SUCCESS)
 		return;
 
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	if (starpu_cuda_worker_get_count() >= 2)
 	{
 		err = cuda_to_cuda();
@@ -599,7 +599,7 @@ run_cuda(int async)
 	}
 #else
 	summary.cuda_to_cuda_async = UNTESTED;
-#endif /* !HAVE_CUDA_MEMCPY_PEER */
+#endif /* !STARPU_HAVE_CUDA_MEMCPY_PEER */
 
 #ifdef STARPU_USE_CPU
 	err = cuda_to_ram();