Browse source

rename HAVE_CUDA_MEMCPY_PEER to STARPU_HAVE_CUDA_MEMCPY_PEER and make it publicly available

Nathalie Furmento, 7 years ago
Commit
65ee6fe4c3

+ 1 - 1
configure.ac

@@ -1384,7 +1384,7 @@ if test x$enable_cuda_memcpy_peer = xyes -a x$enable_cuda = xyes ; then
     LDFLAGS="${SAVED_LDFLAGS}"
 fi
 if test x$have_cuda_memcpy_peer = xyes; then
-    AC_DEFINE(HAVE_CUDA_MEMCPY_PEER,[1],[Peer transfers are supported in CUDA])
+    AC_DEFINE(STARPU_HAVE_CUDA_MEMCPY_PEER,[1],[Peer transfers are supported in CUDA])
 fi
 
 if test x$enable_cuda = xyes; then

+ 1 - 0
include/starpu_config.h.in

@@ -111,6 +111,7 @@
 #undef STARPU_SC_HYPERVISOR_DEBUG
 #undef STARPU_HAVE_GLPK_H
 
+#undef STARPU_HAVE_CUDA_MEMCPY_PEER
 #undef STARPU_HAVE_LIBNUMA
 
 #undef STARPU_HAVE_WINDOWS

+ 11 - 11
src/core/perfmodel/perfmodel_bus.c

@@ -97,7 +97,7 @@ static uint64_t cuda_size[STARPU_MAXCUDADEVS];
 static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][STARPU_MAXNUMANODES];
 
 #ifndef STARPU_SIMGRID
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 static double cudadev_timing_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
 static double cudadev_latency_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
 #endif
@@ -284,7 +284,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 	cudaThreadExit();
 }
 
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 {
 	size_t size = SIZE;
@@ -786,7 +786,7 @@ static void benchmark_all_gpu_devices(void)
 		/* measure bandwidth between Host and Device i */
 		measure_bandwidth_between_host_and_dev(i, cudadev_timing_per_numa, "CUDA");
 	}
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	for (i = 0; i < ncuda; i++)
 	{
 		for (j = 0; j < ncuda; j++)
@@ -1378,7 +1378,7 @@ static void write_bus_latency_file_content(void)
 				/* ---- End NUMA ---- */
 #ifdef STARPU_USE_CUDA
 				b_up += ncuda;
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 				if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
 					latency += cudadev_latency_dtod[src-b_low][dst-b_low];
 				else
@@ -1709,7 +1709,7 @@ static void write_bus_bandwidth_file_content(void)
 				/* End NUMA */
 #ifdef STARPU_USE_CUDA
 				b_up += ncuda;
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 				if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
 					/* Direct GPU-GPU transfert */
 					slowness += cudadev_timing_dtod[src-b_low][dst-b_low];
@@ -2180,7 +2180,7 @@ void _starpu_simgrid_get_platform_path(int version, char *path, size_t maxlen)
  * - then through all CUDA-CUDA possible transfers again to emit routes.
  */
 
-#if defined(STARPU_USE_CUDA) && defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX && defined(HAVE_CUDA_MEMCPY_PEER)
+#if defined(STARPU_USE_CUDA) && defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX && defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 
 /* Records, for each PCI link and hub, the maximum bandwidth seen through it */
 struct pci_userdata
@@ -2652,7 +2652,7 @@ static void write_bus_platform_file_content(int version)
 	{
 		fprintf(f, "   <host id=\"CUDA%u\" %s=\"2000000000%s\">\n", i, speed, flops);
 		fprintf(f, "     <prop id=\"memsize\" value=\"%llu\"/>\n", (unsigned long long) cuda_size[i]);
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		fprintf(f, "     <prop id=\"memcpy_peer\" value=\"1\"/>\n");
 #endif
 		/* TODO: record cudadev_direct instead of assuming it's NUMA nodes */
@@ -2747,7 +2747,7 @@ static void write_bus_platform_file_content(int version)
 				search_bus_best_latency(i, "CUDA", 0)/1000000., s);
 	}
 	fprintf(f, "\n");
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	/* Write CUDA/CUDA bandwidths and latencies */
 	for (i = 0; i < ncuda; i++)
 	{
@@ -2768,7 +2768,7 @@ static void write_bus_platform_file_content(int version)
 	}
 #endif
 
-#if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX && defined(HAVE_CUDA_MEMCPY_PEER)
+#if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX && defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 	/* If we have enough hwloc information, write PCI bandwidths and routes */
 	if (!starpu_get_env_number_default("STARPU_PCI_FLAT", 0))
 	{
@@ -2840,7 +2840,7 @@ flat_cuda:
 			fprintf(f, "   <route src=\"RAM\" dst=\"%s\" symmetrical=\"NO\"><link_ctn id=\"RAM-%s\"/><link_ctn id=\"Host\"/></route>\n", i_name, i_name);
 			fprintf(f, "   <route src=\"%s\" dst=\"RAM\" symmetrical=\"NO\"><link_ctn id=\"%s-RAM\"/><link_ctn id=\"Host\"/></route>\n", i_name, i_name);
 		}
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		for (i = 0; i < ncuda; i++)
 		{
                         unsigned j;
@@ -2856,7 +2856,7 @@ flat_cuda:
 			}
 		}
 #endif
-	} /* defined(STARPU_HAVE_HWLOC) && defined(HAVE_CUDA_MEMCPY_PEER) */
+	} /* defined(STARPU_HAVE_HWLOC) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER) */
 	fprintf(f, "\n");
 #endif /* STARPU_USE_CUDA */
 

+ 1 - 1
src/core/topology.c

@@ -2328,7 +2328,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 					if (
 #ifdef STARPU_SIMGRID
 						cuda_memcpy_peer && atoll(cuda_memcpy_peer)
-#elif defined(HAVE_CUDA_MEMCPY_PEER)
+#elif defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 						1
 #else /* MEMCPY_PEER */
 						0

+ 4 - 4
src/datawizard/coherency.c

@@ -243,14 +243,14 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 			}
 			else
 				return 0;
-#elif defined(HAVE_CUDA_MEMCPY_PEER)
+#elif defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 			/* simgrid */
 			enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
 			return kind == STARPU_CUDA_RAM;
-#else /* HAVE_CUDA_MEMCPY_PEER */
+#else /* STARPU_HAVE_CUDA_MEMCPY_PEER */
 			/* Direct GPU-GPU transfers are not allowed in general */
 			return 0;
-#endif /* HAVE_CUDA_MEMCPY_PEER */
+#endif /* STARPU_HAVE_CUDA_MEMCPY_PEER */
 		}
 		case STARPU_OPENCL_RAM:
 			return 0;
@@ -425,7 +425,7 @@ static int determine_request_path(starpu_data_handle_t handle,
 		dst_nodes[0] = dst_node;
 		handling_nodes[0] = handling_node;
 
-#if !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 		STARPU_ASSERT(!(mode & STARPU_R) || starpu_node_get_kind(src_node) != STARPU_CUDA_RAM || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM);
 #endif
 

+ 4 - 4
src/datawizard/copy_driver.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2017  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2013, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2013, 2016, 2017  CNRS
  * Copyright (C) 2016-2017  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -177,7 +177,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 	void *src_interface = src_replicate->data_interface;
 	void *dst_interface = dst_replicate->data_interface;
 
-#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
 	{
 		unsigned devid;
@@ -207,7 +207,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 #ifdef STARPU_USE_CUDA
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
 		/* only the proper CUBLAS thread can initiate this directly ! */
-#if !defined(HAVE_CUDA_MEMCPY_PEER)
+#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
 #endif
 		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
@@ -242,7 +242,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
 		/* STARPU_CPU_RAM -> CUBLAS_RAM */
 		/* only the proper CUBLAS thread can initiate this ! */
-#if !defined(HAVE_CUDA_MEMCPY_PEER)
+#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
 #endif
 		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||

+ 1 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -444,7 +444,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 #ifndef BUGGED_MEMCPY3D
 static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, int is_async, cudaStream_t stream)
 {
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	struct starpu_matrix_interface *src_matrix = src_interface;
 	struct starpu_matrix_interface *dst_matrix = dst_interface;
 

+ 3 - 3
src/datawizard/interfaces/multiformat_interface.c

@@ -505,7 +505,7 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_
 	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 }
 
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
 				void *dst_interface, unsigned dst_node,
 				cudaStream_t stream)
@@ -559,7 +559,7 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRI
 	}
 	else
 	{
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		return copy_cuda_peer_common(src_interface, src_node,
 					     dst_interface, dst_node,
 					     NULL);
@@ -581,7 +581,7 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
 	}
 	else
 	{
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		return copy_cuda_peer_common(src_interface, src_node,
 					     dst_interface, dst_node,
 					     stream);

+ 17 - 17
src/datawizard/malloc.c

@@ -64,7 +64,7 @@ void starpu_malloc_set_align(size_t align)
 		_malloc_align = align;
 }
 
-#if (defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER))// || defined(STARPU_USE_OPENCL)
+#if (defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER))// || defined(STARPU_USE_OPENCL)
 struct malloc_pinned_codelet_struct
 {
 	void **ptr;
@@ -83,7 +83,7 @@ struct malloc_pinned_codelet_struct
 //}
 //#endif
 
-#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 static void malloc_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
 {
 	struct malloc_pinned_codelet_struct *s = arg;
@@ -95,7 +95,7 @@ static void malloc_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED,
 }
 #endif
 
-#if (defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER)) && !defined(STARPU_SIMGRID)// || defined(STARPU_USE_OPENCL)
+#if (defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)) && !defined(STARPU_SIMGRID)// || defined(STARPU_USE_OPENCL)
 static struct starpu_perfmodel malloc_pinned_model =
 {
 	.type = STARPU_HISTORY_BASED,
@@ -162,7 +162,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 				MSG_process_sleep((float) dim * 0.000650 / 1048576.);
 #else /* STARPU_SIMGRID */
 #ifdef STARPU_USE_CUDA
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 			cudaError_t cures;
 			cures = cudaHostAlloc(A, dim, cudaHostAllocPortable);
 			if (STARPU_UNLIKELY(cures))
@@ -198,7 +198,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 			push_res = _starpu_task_submit_internally(task);
 			STARPU_ASSERT(push_res != -ENODEV);
 			goto end;
-#endif /* HAVE_CUDA_MEMCPY_PEER */
+#endif /* STARPU_HAVE_CUDA_MEMCPY_PEER */
 #endif /* STARPU_USE_CUDA */
 //		}
 //		else if (_starpu_can_submit_opencl_task())
@@ -368,7 +368,7 @@ int starpu_malloc(void **A, size_t dim)
 	return starpu_malloc_flags(A, dim, STARPU_MALLOC_PINNED);
 }
 
-#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 static void free_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
 {
 	cudaError_t cures;
@@ -387,7 +387,7 @@ static void free_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, vo
 //}
 //#endif
 
-#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID) // || defined(STARPU_USE_OPENCL)
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID) // || defined(STARPU_USE_OPENCL)
 static struct starpu_perfmodel free_pinned_model =
 {
 	.type = STARPU_HISTORY_BASED,
@@ -420,7 +420,7 @@ int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags
 			/* TODO: simulate CUDA barrier */
 #else /* !STARPU_SIMGRID */
 #ifdef STARPU_USE_CUDA
-#ifndef HAVE_CUDA_MEMCPY_PEER
+#ifndef STARPU_HAVE_CUDA_MEMCPY_PEER
 			if (!starpu_is_initialized())
 			{
 #endif
@@ -431,7 +431,7 @@ int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags
 				if (STARPU_UNLIKELY(err))
 					STARPU_CUDA_REPORT_ERROR(err);
 				goto out;
-#ifndef HAVE_CUDA_MEMCPY_PEER
+#ifndef STARPU_HAVE_CUDA_MEMCPY_PEER
 			}
 			else
 			{
@@ -453,7 +453,7 @@ int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags
 				STARPU_ASSERT(push_res != -ENODEV);
 				goto out;
 			}
-#endif /* HAVE_CUDA_MEMCPY_PEER */
+#endif /* STARPU_HAVE_CUDA_MEMCPY_PEER */
 #endif /* STARPU_USE_CUDA */
 #endif /* STARPU_SIMGRID */
 		}
@@ -553,8 +553,8 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 	{
 		case STARPU_CPU_RAM:
 		{
-			_starpu_malloc_flags_on_node(dst_node, (void**) &addr, size,			
-#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+			_starpu_malloc_flags_on_node(dst_node, (void**) &addr, size,
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 					/* without memcpy_peer, we can not
 					 * allocated pinned memory, since it
 					 * requires waiting for a task, and we
@@ -587,7 +587,7 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 #else
 			unsigned devid = _starpu_memory_node_get_devid(dst_node);
-#if defined(HAVE_CUDA_MEMCPY_PEER)
+#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 			starpu_cuda_set_device(devid);
 #else
 			struct _starpu_worker *worker = _starpu_get_local_worker_key();
@@ -684,7 +684,7 @@ _starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int f
 	{
 		case STARPU_CPU_RAM:
 			_starpu_free_flags_on_node(dst_node, (void*)addr, size,
-#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 					flags & ~STARPU_MALLOC_PINNED
 #else
 					flags
@@ -705,7 +705,7 @@ _starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int f
 #else
 			cudaError_t err;
 			unsigned devid = _starpu_memory_node_get_devid(dst_node);
-#if defined(HAVE_CUDA_MEMCPY_PEER)
+#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 			starpu_cuda_set_device(devid);
 #else
 			struct _starpu_worker *worker = _starpu_get_local_worker_key();
@@ -781,7 +781,7 @@ starpu_memory_pin(void *addr STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBU
 {
 	if (STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
 	{
-#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 		if (cudaHostRegister(addr, size, cudaHostRegisterPortable) != cudaSuccess)
 			return -1;
 #endif
@@ -794,7 +794,7 @@ starpu_memory_unpin(void *addr STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRI
 {
 	if (STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
 	{
-#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
 		if (cudaHostUnregister(addr) != cudaSuccess)
 			return -1;
 #endif

+ 2 - 2
src/datawizard/memalloc.c

@@ -366,7 +366,7 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 		if (handle)
 			STARPU_ASSERT(replicate->allocated);
 
-#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 		if (starpu_node_get_kind(node) == STARPU_CUDA_RAM)
 		{
 			/* To facilitate the design of interface, we set the
@@ -1354,7 +1354,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	{
 		_STARPU_TRACE_START_ALLOC(dst_node, data_size);
 
-#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 		if (starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
 		{
 			/* To facilitate the design of interface, we set the

+ 2 - 2
src/datawizard/reduction.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014, 2016  Université de Bordeaux
- * Copyright (C) 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2011, 2012, 2013, 2016, 2017  CNRS
  * Copyright (C) 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -67,7 +67,7 @@ void _starpu_redux_init_data_replicate(starpu_data_handle_t handle, struct _star
 			break;
 		case STARPU_CUDA_WORKER:
 			init_func = _starpu_task_get_cuda_nth_implementation(init_cl, 0);
-#if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 			/* We make sure we do manipulate the proper device */
 			starpu_cuda_set_device(starpu_worker_get_devid(workerid));
 #endif

+ 10 - 10
src/drivers/cuda/driver_cuda.c

@@ -233,11 +233,11 @@ void starpu_cuda_set_device(unsigned devid STARPU_ATTRIBUTE_UNUSED)
 #else
 	cudaError_t cures;
 	struct starpu_conf *conf = &_starpu_get_machine_config()->conf;
-#if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
+#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
 	unsigned i;
 #endif
 
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	if (conf->n_cuda_opengl_interoperability)
 	{
 		_STARPU_MSG("OpenGL interoperability was requested, but StarPU was built with multithread GPU control support, please reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
@@ -262,7 +262,7 @@ void starpu_cuda_set_device(unsigned devid STARPU_ATTRIBUTE_UNUSED)
 
 	cures = cudaSetDevice(devid);
 
-#if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
+#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
 done:
 #endif
 	if (STARPU_UNLIKELY(cures
@@ -306,7 +306,7 @@ static void init_device_context(unsigned devid, unsigned memnode)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);
 
 #ifndef STARPU_SIMGRID
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
 	{
 		int nworkers = starpu_worker_get_count();
@@ -348,7 +348,7 @@ static void init_device_context(unsigned devid, unsigned memnode)
 	cures = cudaGetDeviceProperties(&props[devid], devid);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	if (props[devid].computeMode == cudaComputeModeExclusive)
 	{
 		_STARPU_MSG("CUDA is in EXCLUSIVE-THREAD mode, but StarPU was built with multithread GPU control support, please either ask your administrator to use EXCLUSIVE-PROCESS mode (which should really be fine), or reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
@@ -494,7 +494,7 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 		_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, profiling);
 	}
 
-#if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 	/* We make sure we do manipulate the proper device */
 	starpu_cuda_set_device(worker->devid);
 #endif
@@ -1074,7 +1074,7 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 			    size_t ssize, cudaStream_t stream,
 			    enum cudaMemcpyKind kind)
 {
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	int peer_copy = 0;
 	int src_dev = -1, dst_dev = -1;
 #endif
@@ -1082,7 +1082,7 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 
 	if (kind == cudaMemcpyDeviceToDevice && src_node != dst_node)
 	{
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		peer_copy = 1;
 		src_dev = _starpu_memory_node_get_devid(src_node);
 		dst_dev = _starpu_memory_node_get_devid(dst_node);
@@ -1094,7 +1094,7 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 	if (stream)
 	{
 		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		if (peer_copy)
 		{
 			cures = cudaMemcpyPeerAsync((char *) dst_ptr, dst_dev,
@@ -1113,7 +1113,7 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 	if (stream == NULL || cures)
 	{
 		/* do it in a synchronous fashion */
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		if (peer_copy)
 		{
 			cures = cudaMemcpyPeer((char *) dst_ptr, dst_dev,

+ 2 - 2
tests/datawizard/gpu_ptr_register.c

@@ -85,7 +85,7 @@ check_result(unsigned *t, size_t size)
 }
 
 #ifdef STARPU_USE_CUDA
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 static int
 test_cuda(void)
 {
@@ -265,7 +265,7 @@ int main(int argc, char **argv)
 #endif
 
 #ifdef STARPU_USE_CUDA
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	ret = test_cuda();
 	if (ret == 1)
 		goto fail;

+ 2 - 2
tests/datawizard/gpu_register.c

@@ -84,7 +84,7 @@ check_result(unsigned *t, size_t size)
 }
 
 #ifdef STARPU_USE_CUDA
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 static int
 test_cuda(void)
 {
@@ -287,7 +287,7 @@ int main(int argc, char **argv)
 #endif
 
 #ifdef STARPU_USE_CUDA
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	ret = test_cuda();
 	if (ret == 1)
 		goto fail;

+ 4 - 4
tests/datawizard/interfaces/test_interfaces.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011  INRIA
- * Copyright (C) 2013, 2014  CNRS
+ * Copyright (C) 2013, 2014, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -457,7 +457,7 @@ ram_to_cuda(void)
 	return current_config->copy_failed;
 }
 
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 static enum exit_code
 cuda_to_cuda(void)
 {
@@ -585,7 +585,7 @@ run_cuda(int async)
 	if (err != SUCCESS)
 		return;
 
-#ifdef HAVE_CUDA_MEMCPY_PEER
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 	if (starpu_cuda_worker_get_count() >= 2)
 	{
 		err = cuda_to_cuda();
@@ -599,7 +599,7 @@ run_cuda(int async)
 	}
 #else
 	summary.cuda_to_cuda_async = UNTESTED;
-#endif /* !HAVE_CUDA_MEMCPY_PEER */
+#endif /* !STARPU_HAVE_CUDA_MEMCPY_PEER */
 
 #ifdef STARPU_USE_CPU
 	err = cuda_to_ram();