Browse Source

Catch users who do not properly register their data to CUDA

Samuel Thibault 7 years ago
parent
commit
7b226ea5f6

+ 5 - 0
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -79,6 +79,11 @@ link to \ref StaticScheduling
 
 \section CUDA-specificOptimizations CUDA-specific Optimizations
 
+For proper overlapping of asynchronous GPU data transfers, data has to be pinned
+by CUDA. Data allocated with starpu_malloc() is always properly pinned. If the
+application is registering to StarPU some data which has not been allocated with
+starpu_malloc(), it should use starpu_memory_pin() to pin it.
+
 Due to CUDA limitations, StarPU will have a hard time overlapping its own
 communications and the codelet computations if the application does not use a
 dedicated CUDA stream for its computations instead of the default stream,

+ 16 - 1
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2011-2014,2017                           Inria
- * Copyright (C) 2009-2011,2014-2017                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2014-2018                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -1212,6 +1212,21 @@ the starpu_data_copy_methods::any_to_any copy method, which is provided with \p
 be passed to starpu_interface_copy(). this returns <c>-EAGAIN</c> if the
 transfer is still ongoing, or 0 if the transfer is already completed.
 
+\fn void starpu_interface_start_driver_copy_async(unsigned src_node, unsigned dst_node, double *start)
+\ingroup API_Data_Interfaces
+When an asynchonous implementation of the data transfer is implemented, the call
+to the underlying CUDA, OpenCL, etc. call should be surrounded
+by calls to starpu_interface_start_driver_copy_async() and
+starpu_interface_end_driver_copy_async(), so that it is recorded in offline
+execution traces, and the timing of the submission is checked. \p start must
+point to a variable whose value will be passed unchanged to
+starpu_interface_end_driver_copy_async().
+
+\fn void starpu_interface_end_driver_copy_async(unsigned src_node, unsigned dst_node, double start)
+\ingroup API_Data_Interfaces
+See starpu_interface_start_driver_copy_async().
+
+
 \fn uint32_t starpu_hash_crc32c_be_n(const void *input, size_t n, uint32_t inputcrc)
 \ingroup API_Data_Interfaces
 Compute the CRC of a byte buffer seeded by the \p inputcrc

+ 3 - 1
examples/basic_examples/vector_scal.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013                                Inria
- * Copyright (C) 2009-2016                                Université de Bordeaux
+ * Copyright (C) 2009-2016,2018                           Université de Bordeaux
  * Copyright (C) 2010-2013,2015,2017                      CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -151,6 +151,7 @@ int main(void)
 	 *  - the fifth argument is the size of each element.
 	 */
 	starpu_data_handle_t vector_handle;
+	starpu_memory_pin(vector, sizeof(vector));
 	starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
 
 	float factor = 3.14;
@@ -178,6 +179,7 @@ int main(void)
 	/* StarPU does not need to manipulate the array anymore so we can stop
  	 * monitoring it */
 	starpu_data_unregister(vector_handle);
+	starpu_memory_unpin(vector, sizeof(vector));
 
 #ifdef STARPU_USE_OPENCL
         ret = starpu_opencl_unload_opencl(&opencl_program);

+ 3 - 1
include/starpu_data_interfaces.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2014,2016-2017                      Inria
- * Copyright (C) 2009-2016                                Université de Bordeaux
+ * Copyright (C) 2009-2016,2018                           Université de Bordeaux
  * Copyright (C) 2010-2015,2017                           CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -94,6 +94,8 @@ struct starpu_data_copy_methods
 };
 
 int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data);
+void starpu_interface_start_driver_copy_async(unsigned src_node, unsigned dst_node, double *start);
+void starpu_interface_end_driver_copy_async(unsigned src_node, unsigned dst_node, double start);
 uintptr_t starpu_malloc_on_node_flags(unsigned dst_node, size_t size, int flags);
 uintptr_t starpu_malloc_on_node(unsigned dst_node, size_t size);
 void starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags);

+ 13 - 9
src/core/disk.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2015-2017                                CNRS
  * Copyright (C) 2013,2017                                Inria
- * Copyright (C) 2013-2015,2017                           Université de Bordeaux
+ * Copyright (C) 2013-2015,2017-2018                      Université de Bordeaux
  * Copyright (C) 2013                                     Corentin Salingue
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -181,11 +181,12 @@ int _starpu_disk_read(unsigned src_node, unsigned dst_node STARPU_ATTRIBUTE_UNUS
 			channel = NULL;
 		else
 		{
+			double start;
 			channel->event.disk_event.memory_node = src_node;
 
-			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 			event = disk_register_list[src_node]->functions->async_read(disk_register_list[src_node]->base, obj, buf, offset, size);
-			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 
                         add_async_event(channel, event);
 		}
@@ -210,11 +211,12 @@ int _starpu_disk_write(unsigned src_node STARPU_ATTRIBUTE_UNUSED, unsigned dst_n
 			channel = NULL;
 		else
                 {
+			double start;
 			channel->event.disk_event.memory_node = dst_node;
 
-			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 			event = disk_register_list[dst_node]->functions->async_write(disk_register_list[dst_node]->base, obj, buf, offset, size);
-        		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 
                         add_async_event(channel, event);
 		}
@@ -276,11 +278,12 @@ int _starpu_disk_full_read(unsigned src_node, unsigned dst_node, void *obj, void
 			channel = NULL;
 		else
 		{
+			double start;
 			channel->event.disk_event.memory_node = src_node;
 
-			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 			event = disk_register_list[src_node]->functions->async_full_read(disk_register_list[src_node]->base, obj, ptr, size, dst_node);
-			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 
                         add_async_event(channel, event);
 		}
@@ -304,11 +307,12 @@ int _starpu_disk_full_write(unsigned src_node STARPU_ATTRIBUTE_UNUSED, unsigned
 			channel = NULL;
 		else
 		{
+			double start;
 			channel->event.disk_event.memory_node = dst_node;
 
-			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 			event = disk_register_list[dst_node]->functions->async_full_write(disk_register_list[dst_node]->base, obj, ptr, size);
-			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 
                         add_async_event(channel, event);
 		}

+ 3 - 2
src/core/simgrid.c

@@ -912,6 +912,7 @@ int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node,
 	double *computation;
 	double *communication;
 	union _starpu_async_channel_event *event, myevent;
+	double start = 0.;
 
 	_STARPU_CALLOC(hosts, 2, sizeof(*hosts));
 	_STARPU_CALLOC(computation, 2, sizeof(*computation));
@@ -947,7 +948,7 @@ int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node,
 	transfer->next = NULL;
 
 	if (req)
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 
 	/* Sleep 10µs for the GPU transfer queueing */
 	if (_starpu_simgrid_queue_malloc_cost())
@@ -957,7 +958,7 @@ int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node,
 
 	if (req)
 	{
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 		_STARPU_TRACE_DATA_COPY(src_node, dst_node, size);
 		return -EAGAIN;
 	}

+ 27 - 1
src/datawizard/copy_driver.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2016-2017                      Inria
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2018                                Université de Bordeaux
  * Copyright (C) 2010-2011,2013,2015-2018                 CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -689,6 +689,32 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 	return 0;
 }
 
+void starpu_interface_start_driver_copy_async(unsigned src_node, unsigned dst_node, double *start)
+{
+	*start = starpu_timing_now();
+	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+}
+
+void starpu_interface_end_driver_copy_async(unsigned src_node, unsigned dst_node, double start)
+{
+	double end = starpu_timing_now();
+	double elapsed = end - start;
+	if (elapsed > 300)
+	{
+		static int warned = 0;
+		if (!warned)
+		{
+			char src_name[16], dst_name[16];
+			warned = 1;
+			_starpu_memory_node_get_name(src_node, src_name, sizeof(src_name));
+			_starpu_memory_node_get_name(dst_node, dst_name, sizeof(dst_name));
+
+			_STARPU_DISP("Warning: the submission of asynchronous transfer from %s to %s took a very long time (%f ms)\nFor proper asynchronous transfer overlapping, data registered to StarPU must be allocated with starpu_malloc() or pinned with starpu_memory_pin()\n", src_name, dst_name, elapsed / 1000.);
+		}
+	}
+	_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+}
+
 /* This can be used by interfaces to easily transfer a piece of data without
  * caring about the particular transfer methods.  */
 

+ 7 - 5
src/datawizard/interfaces/block_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2012,2017                           Inria
- * Copyright (C) 2009-2017                                Université de Bordeaux
+ * Copyright (C) 2009-2018                                Université de Bordeaux
  * Copyright (C) 2010-2017                                CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -528,12 +528,13 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 		}
 		else
 		{
+			double start;
 			/* Are all plans contiguous */
-			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
 					(char *)src_block->ptr, src_block->ldz*elemsize,
 					nx*ny*elemsize, nz, kind, stream);
-			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
@@ -559,12 +560,13 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 		{
 			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
 			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
+			double start;
 
-			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
                                                   (char *)src_ptr, src_block->ldy*elemsize,
                                                   nx*elemsize, ny, kind, stream);
-			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 
 			if (STARPU_UNLIKELY(cures))
 			{

+ 7 - 5
src/datawizard/interfaces/matrix_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2012,2017                           Inria
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2018                                Université de Bordeaux
  * Copyright (C) 2010-2017                                CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -418,11 +418,12 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 
 	if (is_async)
 	{
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		double start;
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 		cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 			(char *)src_matrix->ptr, src_matrix->ld*elemsize,
 			src_matrix->nx*elemsize, src_matrix->ny, kind, stream);
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 		if (!cures)
 			return -EAGAIN;
 	}
@@ -466,9 +467,10 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 
 	if (is_async)
 	{
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		double start;
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 		cures = cudaMemcpy3DPeerAsync(&p, stream);
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 		if (!cures)
 			return -EAGAIN;
 	}

+ 4 - 3
src/datawizard/interfaces/multiformat_interface.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011-2012                                Inria
  * Copyright (C) 2012-2017                                CNRS
- * Copyright (C) 2013,2015-2016                           Université de Bordeaux
+ * Copyright (C) 2013,2015-2016,2018                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -528,11 +528,12 @@ static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
 
 	if (stream)
 	{
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		double start;
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 		status = cudaMemcpyPeerAsync(dst_multiformat->cuda_ptr, dst_dev,
 					     src_multiformat->cuda_ptr, src_dev,
 					     size, stream);
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 		/* All good ! Still, returning -EAGAIN, because we will need to
                    check the transfert completion later */
 		if (status == cudaSuccess)

+ 3 - 2
src/drivers/cuda/driver_cuda.c

@@ -1140,7 +1140,8 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 
 	if (stream)
 	{
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		double start;
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 #ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		if (peer_copy)
 		{
@@ -1153,7 +1154,7 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 		{
 			cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
 		}
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 	}
 
 	/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */

+ 15 - 10
src/drivers/opencl/driver_opencl.c

@@ -297,15 +297,16 @@ cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTR
 {
 	cl_int err;
 	struct _starpu_worker *worker = _starpu_get_local_worker_key();
+	double start = 0.;
 
 	if (event)
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 
 	cl_event ev;
 	err = clEnqueueWriteBuffer(in_transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, &ev);
 
 	if (event)
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 
 	if (STARPU_LIKELY(err == CL_SUCCESS))
 	{
@@ -336,13 +337,14 @@ cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_
 {
 	cl_int err;
 	struct _starpu_worker *worker = _starpu_get_local_worker_key();
+	double start = 0.;
 
 	if (event)
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 	cl_event ev;
 	err = clEnqueueReadBuffer(out_transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, &ev);
 	if (event)
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 	if (STARPU_LIKELY(err == CL_SUCCESS))
 	{
 		if (event == NULL)
@@ -372,13 +374,14 @@ cl_int starpu_opencl_copy_opencl_to_opencl(cl_mem src, unsigned src_node STARPU_
 {
 	cl_int err;
 	struct _starpu_worker *worker = _starpu_get_local_worker_key();
+	double start = 0.;
 
 	if (event)
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 	cl_event ev;
 	err = clEnqueueCopyBuffer(peer_transfer_queues[worker->devid], src, dst, src_offset, dst_offset, size, 0, NULL, &ev);
 	if (event)
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 	if (STARPU_LIKELY(err == CL_SUCCESS))
 	{
 		if (event == NULL)
@@ -451,15 +454,16 @@ cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, unsigned src_node S
         cl_int err;
         struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
+	double start = 0.;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
         if (event)
-                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+                starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
         err = clEnqueueReadBufferRect(out_transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
                                       buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
 	clFlush(out_transfer_queues[worker->devid]);
         if (event)
-                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+                starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
 
         return CL_SUCCESS;
@@ -472,15 +476,16 @@ cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node STARP
         cl_int err;
         struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
+	double start = 0.;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
         if (event)
-                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+                starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
         err = clEnqueueWriteBufferRect(in_transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
                                        buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
 	clFlush(in_transfer_queues[worker->devid]);
         if (event)
-                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+                starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
 
         return CL_SUCCESS;