Просмотр исходного кода

Add missing synchronization after cudaMemcpy calls

Samuel Thibault лет назад: 6
Родитель
Сommit
38314bc464

+ 2 - 1
examples/sched_ctx/axpy_partition_gpu.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2016                                     Inria
  * Copyright (C) 2017                                     CNRS
- * Copyright (C) 2016                                     Université de Bordeaux
+ * Copyright (C) 2016,2019                                Université de Bordeaux
  * Copyright (C) 2016                                     Uppsala University
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -126,6 +126,7 @@ static void buildPartitionedBlockMapping(F cudaFun, int threads, int shmem, int
 
   cudaMemcpyAsync((void*)block_assignment_d,block_assignment,sizeof(block_assignment),cudaMemcpyHostToDevice, current_stream);
   //cudaMemcpy((void*)block_assignment_d,block_assignment,sizeof(block_assignment),cudaMemcpyHostToDevice);
+  //cudaThreadSynchronize();
 }
 
 

+ 9 - 1
src/datawizard/interfaces/block_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2018                                Université de Bordeaux
+ * Copyright (C) 2009-2019                                Université de Bordeaux
  * Copyright (C) 2011,2012,2017                           Inria
  * Copyright (C) 2010-2017                                CNRS
  *
@@ -484,6 +484,8 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
                         cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
                                              (char *)src_block->ptr, src_block->ldz*elemsize,
                                              nx*ny*elemsize, nz, kind);
+			if (!cures)
+				cures = cudaThreadSynchronize();
                         if (STARPU_UNLIKELY(cures))
                                 STARPU_CUDA_REPORT_ERROR(cures);
                 }
@@ -502,6 +504,8 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
                                              (char *)src_ptr, src_block->ldy*elemsize,
                                              nx*elemsize, ny, kind);
 
+			if (!cures)
+				cures = cudaThreadSynchronize();
 			if (STARPU_UNLIKELY(cures))
 				STARPU_CUDA_REPORT_ERROR(cures);
 		}
@@ -549,6 +553,8 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
 						(char *)src_block->ptr, src_block->ldz*elemsize,
 						nx*ny*elemsize, nz, kind);
+				if (!cures)
+					cures = cudaThreadSynchronize();
 				if (STARPU_UNLIKELY(cures))
 					STARPU_CUDA_REPORT_ERROR(cures);
 
@@ -606,6 +612,8 @@ no_async_default:
                                      (char *)src_ptr, src_block->ldy*elemsize,
                                      nx*elemsize, ny, kind);
 
+		if (!cures)
+			cures = cudaThreadSynchronize();
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 	}

+ 5 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2008-2018                                Université de Bordeaux
+ * Copyright (C) 2008-2019                                Université de Bordeaux
  * Copyright (C) 2011,2012,2017                           Inria
  * Copyright (C) 2010-2017                                CNRS
  *
@@ -440,6 +440,8 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 		(char *)src_matrix->ptr, src_matrix->ld*elemsize,
 		src_matrix->nx*elemsize, src_matrix->ny, kind);
+	if (!cures)
+		cures = cudaThreadSynchronize();
 	if (STARPU_UNLIKELY(cures))
 	{
 		int ret = copy_any_to_any(src_interface, src_node, dst_interface, dst_node, (void*)(uintptr_t)is_async);
@@ -485,6 +487,8 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 	}
 
 	cures = cudaMemcpy3DPeer(&p);
+	if (!cures)
+		cures = cudaThreadSynchronize();
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 

+ 11 - 3
src/datawizard/interfaces/multiformat_interface.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011,2012                                Inria
  * Copyright (C) 2011-2017                                CNRS
- * Copyright (C) 2011-2013,2015,2016,2018                 Université de Bordeaux
+ * Copyright (C) 2011-2013,2015,2016,2018-2019            Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -430,16 +430,18 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 					return -ENOMEM;
 			}
 			status = cudaMemcpy(dst_multiformat->cpu_ptr, src_multiformat->cpu_ptr, size, kind);
+			if (!status)
+				status = cudaThreadSynchronize();
 			if (STARPU_UNLIKELY(status))
-			{
 				STARPU_CUDA_REPORT_ERROR(status);
-			}
 			break;
 		}
 		case cudaMemcpyDeviceToHost:
 		{
 			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
 			status = cudaMemcpy(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind);
+			if (!status)
+				status = cudaThreadSynchronize();
 			if (STARPU_UNLIKELY(status))
 				STARPU_CUDA_REPORT_ERROR(status);
 
@@ -449,6 +451,8 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 		{
 			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
 			status = cudaMemcpy(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind);
+			if (!status)
+				status = cudaThreadSynchronize();
 			if (STARPU_UNLIKELY(status))
 				STARPU_CUDA_REPORT_ERROR(status);
 			break;
@@ -506,6 +510,8 @@ static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_
 		{
 			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
 			status = cudaMemcpy(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind);
+			if (!status)
+				status = cudaThreadSynchronize();
 			if (STARPU_UNLIKELY(status))
 				STARPU_CUDA_REPORT_ERROR(status);
 
@@ -575,6 +581,8 @@ static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
 	status = cudaMemcpyPeer(dst_multiformat->cuda_ptr, dst_dev,
 				src_multiformat->cuda_ptr, src_dev,
 				size);
+	if (!status)
+		status = cudaThreadSynchronize();
 	if (STARPU_UNLIKELY(status != cudaSuccess))
 		STARPU_CUDA_REPORT_ERROR(status);
 

+ 3 - 2
src/drivers/cuda/driver_cuda.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011,2012,2014,2016,2017                 Inria
- * Copyright (C) 2008-2018                                Université de Bordeaux
+ * Copyright (C) 2008-2019                                Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -1173,7 +1173,8 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 			cures = cudaMemcpy((char *)dst_ptr, (char *)src_ptr, ssize, kind);
 		}
 
-
+		if (!cures)
+			cures = cudaThreadSynchronize();
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 

+ 5 - 3
tests/datawizard/data_implicit_deps.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010,2011,2013-2016                      Université de Bordeaux
+ * Copyright (C) 2010,2011,2013-2016,2019                 Université de Bordeaux
  * Copyright (C) 2011-2013                                Inria
  * Copyright (C) 2010-2013,2015,2017                      CNRS
  *
@@ -77,7 +77,8 @@ void g_cuda(void *descr[], void *arg)
 	unsigned value = 42;
 
 	usleep(100000);
-	cudaMemcpy(val, &value, sizeof(value), cudaMemcpyHostToDevice);
+	cudaMemcpyAsync(val, &value, sizeof(value), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -114,7 +115,8 @@ void h_cuda(void *descr[], void *arg)
 	unsigned *val = (unsigned *) STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned value;
 
-	cudaMemcpy(&value, val, sizeof(value), cudaMemcpyDeviceToHost);
+	cudaMemcpyAsync(&value, val, sizeof(value), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	FPRINTF(stderr, "VAR %u (should be 42)\n", value);
 	STARPU_ASSERT(value == 42);
 }

+ 3 - 1
tests/datawizard/gpu_ptr_register.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012,2013,2015-2017                      CNRS
- * Copyright (C) 2011-2014,2016                           Université de Bordeaux
+ * Copyright (C) 2011-2014,2016,2019                      Université de Bordeaux
  * Copyright (C) 2012                                     Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -145,6 +145,8 @@ test_cuda(void)
 
 	starpu_cuda_set_device(devid);
 	cures = cudaMemcpy(foo, foo_gpu, size * sizeof(*foo_gpu), cudaMemcpyDeviceToHost);
+	if (!cures)
+		cures = cudaThreadSynchronize();
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 

+ 5 - 1
tests/datawizard/gpu_register.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2012                                     Inria
  * Copyright (C) 2012,2013,2015-2017                      CNRS
- * Copyright (C) 2011-2016                                Université de Bordeaux
+ * Copyright (C) 2011-2016,2019                           Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -114,6 +114,8 @@ test_cuda(void)
 		foo[i] = i;
 
 	cures = cudaMemcpy(foo_gpu, foo, size * sizeof(*foo_gpu), cudaMemcpyHostToDevice);
+	if (!cures)
+		cures = cudaThreadSynchronize();
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
@@ -150,6 +152,8 @@ test_cuda(void)
 
 	starpu_cuda_set_device(devid);
 	cures = cudaMemcpy(foo, foo_gpu, size * sizeof(*foo_gpu), cudaMemcpyDeviceToHost);
+	if (!cures)
+		cures = cudaThreadSynchronize();
 	if (STARPU_UNLIKELY(cures))
 	{
 		starpu_free_on_node(starpu_worker_get_memory_node(chosen), (uintptr_t) foo_gpu, size * sizeof(*foo_gpu));

+ 4 - 4
tests/datawizard/interfaces/bcsr/bcsr_cuda.cu

@@ -52,20 +52,20 @@ extern "C" void test_bcsr_cuda_func(void *buffers[], void *args)
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 
-	error = cudaMemcpy(ret,
+	error = cudaMemcpyAsync(ret,
 			   &bcsr_config.copy_failed,
 			   sizeof(int),
-			   cudaMemcpyHostToDevice);
+			   cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 
         bcsr_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>
 		(val, nnz, ret, factor);
 
-	error = cudaMemcpy(&bcsr_config.copy_failed,
+	error = cudaMemcpyAsync(&bcsr_config.copy_failed,
 			   ret,
 			   sizeof(int),
-			   cudaMemcpyDeviceToHost);
+			   cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 

+ 2 - 2
tests/datawizard/interfaces/block/block_cuda.cu

@@ -57,7 +57,7 @@ extern "C" void test_block_cuda_func(void *buffers[], void *args)
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 
-	error = cudaMemcpy(ret, &block_config.copy_failed, sizeof(int), cudaMemcpyHostToDevice);
+	error = cudaMemcpyAsync(ret, &block_config.copy_failed, sizeof(int), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 
@@ -71,7 +71,7 @@ extern "C" void test_block_cuda_func(void *buffers[], void *args)
 
         block_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>
 		(block, nx, ny, nz, ldy, ldz, factor, ret);
-	error = cudaMemcpy(&block_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost);
+	error = cudaMemcpyAsync(&block_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 

+ 4 - 4
tests/datawizard/interfaces/coo/coo_cuda.cu

@@ -49,20 +49,20 @@ extern "C" void test_coo_cuda_func(void *buffers[], void *args)
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 
-	error = cudaMemcpy(ret,
+	error = cudaMemcpyAsync(ret,
 			   &coo_config.copy_failed,
 			   sizeof(int),
-			   cudaMemcpyHostToDevice);
+			   cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 
         coo_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>
 		(val, nvalues, ret, factor);
 
-	error = cudaMemcpy(&coo_config.copy_failed,
+	error = cudaMemcpyAsync(&coo_config.copy_failed,
 			   ret,
 			   sizeof(int),
-			   cudaMemcpyDeviceToHost);
+			   cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 

+ 4 - 4
tests/datawizard/interfaces/csr/csr_cuda.cu

@@ -49,19 +49,19 @@ extern "C" void test_csr_cuda_func(void *buffers[], void *args)
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 
-	error = cudaMemcpy(ret,
+	error = cudaMemcpyAsync(ret,
 			   &csr_config.copy_failed,
 			   sizeof(int),
-			   cudaMemcpyHostToDevice);
+			   cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 
         csr_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>> (val, nnz, ret, factor);
 
-	error = cudaMemcpy(&csr_config.copy_failed,
+	error = cudaMemcpyAsync(&csr_config.copy_failed,
 			   ret,
 			   sizeof(int),
-			   cudaMemcpyDeviceToHost);
+			   cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 

+ 4 - 4
tests/datawizard/interfaces/matrix/matrix_cuda.cu

@@ -52,19 +52,19 @@ extern "C" void test_matrix_cuda_func(void *buffers[], void *args)
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 
-	error = cudaMemcpy(ret,
+	error = cudaMemcpyAsync(ret,
 			   &matrix_config.copy_failed,
 			   sizeof(int),
-			   cudaMemcpyHostToDevice);
+			   cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 
         matrix_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(val, n, ret, factor);
 
-	error = cudaMemcpy(&matrix_config.copy_failed,
+	error = cudaMemcpyAsync(&matrix_config.copy_failed,
 			   ret,
 			   sizeof(int),
-			   cudaMemcpyDeviceToHost);
+			   cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 

+ 4 - 4
tests/datawizard/interfaces/multiformat/multiformat_cuda.cu

@@ -58,19 +58,19 @@ extern "C" void test_multiformat_cuda_func(void *buffers[], void *args)
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 
-	error = cudaMemcpy(ret,
+	error = cudaMemcpyAsync(ret,
 			   &multiformat_config.copy_failed,
 			   sizeof(int),
-			   cudaMemcpyHostToDevice);
+			   cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 
         multiformat_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(soa, n, ret, factor);
 
-	error = cudaMemcpy(&multiformat_config.copy_failed,
+	error = cudaMemcpyAsync(&multiformat_config.copy_failed,
 			   ret,
 			   sizeof(int),
-			   cudaMemcpyDeviceToHost);
+			   cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 

+ 4 - 4
tests/datawizard/interfaces/variable/variable_cuda.cu

@@ -42,10 +42,10 @@ extern "C" void test_variable_cuda_func(void *buffers[], void *args)
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 
-	error = cudaMemcpy(ret,
+	error = cudaMemcpyAsync(ret,
 			   &variable_config.copy_failed,
 			   sizeof(int),
-			   cudaMemcpyHostToDevice);
+			   cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 
@@ -56,10 +56,10 @@ extern "C" void test_variable_cuda_func(void *buffers[], void *args)
 	unsigned nblocks = 1;
 
         variable_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, ret, factor);
-	error = cudaMemcpy(&variable_config.copy_failed,
+	error = cudaMemcpyAsync(&variable_config.copy_failed,
 			   ret,
 			   sizeof(int),
-			   cudaMemcpyDeviceToHost);
+			   cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);
 

+ 2 - 2
tests/datawizard/interfaces/vector/test_vector_cuda.cu

@@ -44,7 +44,7 @@ extern "C" void test_vector_cuda_func(void *buffers[], void *args)
 		return;
 	}
 
-	error = cudaMemcpy(ret, &vector_config.copy_failed, sizeof(int), cudaMemcpyHostToDevice);
+	error = cudaMemcpyAsync(ret, &vector_config.copy_failed, sizeof(int), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		return;
 
@@ -56,7 +56,7 @@ extern "C" void test_vector_cuda_func(void *buffers[], void *args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         framework_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n, ret, factor);
-	error = cudaMemcpy(&vector_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost);
+	error = cudaMemcpyAsync(&vector_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 	{
 		return;

+ 2 - 0
tests/datawizard/manual_reduction.c

@@ -75,6 +75,8 @@ static void initialize_per_worker_handle(void *arg)
 				STARPU_CUDA_REPORT_ERROR(status);
 			}
 			status = cudaMemsetAsync((void *)per_worker[workerid], 0, sizeof(variable), starpu_cuda_get_local_stream());
+			if (!status)
+				status = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			if (status)
 				STARPU_CUDA_REPORT_ERROR(status);
 			break;

+ 3 - 2
tests/openmp/cuda_task_01.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2015,2017                                CNRS
  * Copyright (C) 2014-2016                                Inria
- * Copyright (C) 2017                                     Université de Bordeaux
+ * Copyright (C) 2017,2019                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -67,8 +67,9 @@ void task_region_g(void *buffers[], void *args)
 	printf("depth 1 task, entry: f = %d\n", f);
 
 	fprintf(stderr, "cudaMemcpy: -->\n");
-	cudaMemcpy(v2,v1,nx1*sizeof(*_vector_1), cudaMemcpyDeviceToDevice);
+	cudaMemcpyAsync(v2,v1,nx1*sizeof(*_vector_1), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 	fprintf(stderr, "cudaMemcpy: <--\n");
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 void master_g1(void *arg)