hace 5 años · 7f136c4dd2
--- a/doc/doxygen/chapters/210_check_list_performance.doxy
+++ b/doc/doxygen/chapters/210_check_list_performance.doxy
@@ -102,7 +102,7 @@ to use a version that takes the a stream parameter.
 
				 Unfortunately, some CUDA libraries do not have stream variants of
			
 
				 kernels. This will seriously lower the potential for overlapping.
			
 
				 If some CUDA calls are made without specifying this local stream,
			
 
				-synchronization needs to be explicited with cudaThreadSynchronize() around these
			
 
				+synchronization needs to be explicited with cudaDeviceSynchronize() around these
			
 
				 calls, to make sure that they get properly synchronized with the calls using
			
 
				 the local stream. Notably, \c cudaMemcpy() and \c cudaMemset() are actually
			
 
				 asynchronous and need such explicit synchronization! Use \c cudaMemcpyAsync() and
			
--- a/examples/cholesky/cholesky_kernels.c
+++ b/examples/cholesky/cholesky_kernels.c
@@ -235,7 +235,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, void *_a
 
				 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
			
 
				 			cudaError_t cures = cudaStreamSynchronize(stream);
			
 
				 #else
			
 
				-			cudaError_t cures = cudaThreadSynchronize();
			
 
				+			cudaError_t cures = cudaDeviceSynchronize();
			
 
				 #endif
			
 
				 			STARPU_ASSERT(!cures);
			
 
				 			}
			
--- a/examples/sched_ctx/axpy_partition_gpu.h
+++ b/examples/sched_ctx/axpy_partition_gpu.h
@@ -126,7 +126,7 @@ static void buildPartitionedBlockMapping(F cudaFun, int threads, int shmem, int
 
				 
			
 
				   cudaMemcpyAsync((void*)block_assignment_d,block_assignment,sizeof(block_assignment),cudaMemcpyHostToDevice, current_stream);
			
 
				   //cudaMemcpy((void*)block_assignment_d,block_assignment,sizeof(block_assignment),cudaMemcpyHostToDevice);
			
 
				-  //cudaThreadSynchronize();
			
 
				+  //cudaDeviceSynchronize();
			
 
				 }
			
 
				 
			
 
				 
			
--- a/include/starpu_cuda.h
+++ b/include/starpu_cuda.h
@@ -64,7 +64,7 @@ void starpu_cuda_report_error(const char *func, const char *file, int line, cuda
 
				    stream by hand. Note that the application is not forced to use the
			
 
				    stream provided by starpu_cuda_get_local_stream() and may also
			
 
				    create its own streams. Synchronizing with
			
 
				-   <c>cudaThreadSynchronize()</c> is allowed, but will reduce the
			
 
				+   <c>cudaDeviceSynchronize()</c> is allowed, but will reduce the
			
 
				    likelihood of having all transfers overlapped.
			
 
				 */
			
 
				 cudaStream_t starpu_cuda_get_local_stream(void);
			
--- a/mpi/examples/matrix_decomposition/mpi_cholesky_kernels.c
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky_kernels.c
@@ -216,7 +216,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, void *_a
 
				 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
			
 
				 			cudaError_t cures = cudaStreamSynchronize(stream);
			
 
				 #else
			
 
				-				cudaError_t cures = cudaThreadSynchronize();
			
 
				+				cudaError_t cures = cudaDeviceSynchronize();
			
 
				 #endif
			
 
				 				STARPU_ASSERT(!cures);
			
 
				 			}
			
--- a/sc_hypervisor/examples/cholesky/cholesky_kernels.c
+++ b/sc_hypervisor/examples/cholesky/cholesky_kernels.c
@@ -197,7 +197,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, void *_a
 
				 				fprintf(stderr, "Error in Magma: %d\n", ret);
			
 
				 				STARPU_ABORT();
			
 
				 			}
			
 
				-			cudaError_t cures = cudaThreadSynchronize();
			
 
				+			cudaError_t cures = cudaDeviceSynchronize();
			
 
				 			STARPU_ASSERT(!cures);
			
 
				 			}
			
 
				 #else
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -216,7 +216,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 
				 	/* Fill them */
			
 
				 	memset(h_buffer, 0, size);
			
 
				 	cudaMemset(d_buffer, 0, size);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaDeviceSynchronize();
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
			
@@ -232,7 +232,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 
				 	for (iter = 0; iter < NITER; iter++)
			
 
				 	{
			
 
				 		cudaMemcpy(d_buffer, h_buffer, size, cudaMemcpyHostToDevice);
			
 
				-		cudaThreadSynchronize();
			
 
				+		cudaDeviceSynchronize();
			
 
				 	}
			
 
				 	end = starpu_timing_now();
			
 
				 	timing = end - start;
			
@@ -244,7 +244,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 
				 	for (iter = 0; iter < NITER; iter++)
			
 
				 	{
			
 
				 		cudaMemcpy(h_buffer, d_buffer, size, cudaMemcpyDeviceToHost);
			
 
				-		cudaThreadSynchronize();
			
 
				+		cudaDeviceSynchronize();
			
 
				 	}
			
 
				 	end = starpu_timing_now();
			
 
				 	timing = end - start;
			
@@ -256,7 +256,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 
				 	for (iter = 0; iter < NITER; iter++)
			
 
				 	{
			
 
				 		cudaMemcpy(d_buffer, h_buffer, 1, cudaMemcpyHostToDevice);
			
 
				-		cudaThreadSynchronize();
			
 
				+		cudaDeviceSynchronize();
			
 
				 	}
			
 
				 	end = starpu_timing_now();
			
 
				 	timing = end - start;
			
@@ -268,7 +268,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 
				 	for (iter = 0; iter < NITER; iter++)
			
 
				 	{
			
 
				 		cudaMemcpy(h_buffer, d_buffer, 1, cudaMemcpyDeviceToHost);
			
 
				-		cudaThreadSynchronize();
			
 
				+		cudaDeviceSynchronize();
			
 
				 	}
			
 
				 	end = starpu_timing_now();
			
 
				 	timing = end - start;
			
@@ -335,7 +335,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				 	cures = cudaMalloc((void **)&s_buffer, size);
			
 
				 	STARPU_ASSERT(cures == cudaSuccess);
			
 
				 	cudaMemset(s_buffer, 0, size);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaDeviceSynchronize();
			
 
				 
			
 
				 	/* Initialize CUDA context on the destination */
			
 
				 	/* We do not need to enable OpenGL interoperability at this point,
			
@@ -361,7 +361,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				 	cures = cudaMalloc((void **)&d_buffer, size);
			
 
				 	STARPU_ASSERT(cures == cudaSuccess);
			
 
				 	cudaMemset(d_buffer, 0, size);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaDeviceSynchronize();
			
 
				 
			
 
				 	unsigned iter;
			
 
				 	double timing;
			
@@ -373,7 +373,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				 	for (iter = 0; iter < NITER; iter++)
			
 
				 	{
			
 
				 		cudaMemcpyPeer(d_buffer, dst, s_buffer, src, size);
			
 
				-		cudaThreadSynchronize();
			
 
				+		cudaDeviceSynchronize();
			
 
				 	}
			
 
				 	end = starpu_timing_now();
			
 
				 	timing = end - start;
			
@@ -385,7 +385,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				 	for (iter = 0; iter < NITER; iter++)
			
 
				 	{
			
 
				 		cudaMemcpyPeer(d_buffer, dst, s_buffer, src, 1);
			
 
				-		cudaThreadSynchronize();
			
 
				+		cudaDeviceSynchronize();
			
 
				 	}
			
 
				 	end = starpu_timing_now();
			
 
				 	timing = end - start;
			
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -476,7 +476,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 
				                                              (char *)src_block->ptr, src_block->ldz*elemsize,
			
 
				                                              nx*ny*elemsize, nz, kind);
			
 
				 			if (!cures)
			
 
				-				cures = cudaThreadSynchronize();
			
 
				+				cures = cudaDeviceSynchronize();
			
 
				                         if (STARPU_UNLIKELY(cures))
			
 
				                                 STARPU_CUDA_REPORT_ERROR(cures);
			
 
				                 }
			
@@ -496,7 +496,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 
				                                              nx*elemsize, ny, kind);
			
 
				 
			
 
				 			if (!cures)
			
 
				-				cures = cudaThreadSynchronize();
			
 
				+				cures = cudaDeviceSynchronize();
			
 
				 			if (STARPU_UNLIKELY(cures))
			
 
				 				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 		}
			
@@ -545,7 +545,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 
				 						(char *)src_block->ptr, src_block->ldz*elemsize,
			
 
				 						nx*ny*elemsize, nz, kind);
			
 
				 				if (!cures)
			
 
				-					cures = cudaThreadSynchronize();
			
 
				+					cures = cudaDeviceSynchronize();
			
 
				 				if (STARPU_UNLIKELY(cures))
			
 
				 					STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
@@ -604,7 +604,7 @@ no_async_default:
 
				                                      nx*elemsize, ny, kind);
			
 
				 
			
 
				 		if (!cures)
			
 
				-			cures = cudaThreadSynchronize();
			
 
				+			cures = cudaDeviceSynchronize();
			
 
				 		if (STARPU_UNLIKELY(cures))
			
 
				 			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 	}
			
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -481,7 +481,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 
				 		(char *)src_matrix->ptr, src_matrix->ld*elemsize,
			
 
				 		src_matrix->nx*elemsize, src_matrix->ny, kind);
			
 
				 	if (!cures)
			
 
				-		cures = cudaThreadSynchronize();
			
 
				+		cures = cudaDeviceSynchronize();
			
 
				 	if (STARPU_UNLIKELY(cures))
			
 
				 	{
			
 
				 		int ret = copy_any_to_any(src_interface, src_node, dst_interface, dst_node, (void*)(uintptr_t)is_async);
			
@@ -528,7 +528,7 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 
				 
			
 
				 	cures = cudaMemcpy3DPeer(&p);
			
 
				 	if (!cures)
			
 
				-		cures = cudaThreadSynchronize();
			
 
				+		cures = cudaDeviceSynchronize();
			
 
				 	if (STARPU_UNLIKELY(cures))
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
--- a/src/datawizard/interfaces/multiformat_interface.c
+++ b/src/datawizard/interfaces/multiformat_interface.c
@@ -414,7 +414,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 
				 			}
			
 
				 			status = cudaMemcpy(dst_multiformat->cpu_ptr, src_multiformat->cpu_ptr, size, kind);
			
 
				 			if (!status)
			
 
				-				status = cudaThreadSynchronize();
			
 
				+				status = cudaDeviceSynchronize();
			
 
				 			if (STARPU_UNLIKELY(status))
			
 
				 				STARPU_CUDA_REPORT_ERROR(status);
			
 
				 			break;
			
@@ -424,7 +424,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 
				 			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
			
 
				 			status = cudaMemcpy(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind);
			
 
				 			if (!status)
			
 
				-				status = cudaThreadSynchronize();
			
 
				+				status = cudaDeviceSynchronize();
			
 
				 			if (STARPU_UNLIKELY(status))
			
 
				 				STARPU_CUDA_REPORT_ERROR(status);
			
 
				 
			
@@ -435,7 +435,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 
				 			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
			
 
				 			status = cudaMemcpy(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind);
			
 
				 			if (!status)
			
 
				-				status = cudaThreadSynchronize();
			
 
				+				status = cudaDeviceSynchronize();
			
 
				 			if (STARPU_UNLIKELY(status))
			
 
				 				STARPU_CUDA_REPORT_ERROR(status);
			
 
				 			break;
			
@@ -494,7 +494,7 @@ static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_
 
				 			size = src_multiformat->nx * src_multiformat->ops->cuda_elemsize;
			
 
				 			status = cudaMemcpy(dst_multiformat->cuda_ptr, src_multiformat->cuda_ptr, size, kind);
			
 
				 			if (!status)
			
 
				-				status = cudaThreadSynchronize();
			
 
				+				status = cudaDeviceSynchronize();
			
 
				 			if (STARPU_UNLIKELY(status))
			
 
				 				STARPU_CUDA_REPORT_ERROR(status);
			
 
				 
			
@@ -565,7 +565,7 @@ static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
 
				 				src_multiformat->cuda_ptr, src_dev,
			
 
				 				size);
			
 
				 	if (!status)
			
 
				-		status = cudaThreadSynchronize();
			
 
				+		status = cudaDeviceSynchronize();
			
 
				 	if (STARPU_UNLIKELY(status != cudaSuccess))
			
 
				 		STARPU_CUDA_REPORT_ERROR(status);
			
 
				 
			
--- a/src/datawizard/interfaces/tensor_interface.c
+++ b/src/datawizard/interfaces/tensor_interface.c
@@ -541,7 +541,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 
				                                              nx*elemsize, ny, kind);
			
 
				 
			
 
				 			if (!cures)
			
 
				-				cures = cudaThreadSynchronize();
			
 
				+				cures = cudaDeviceSynchronize();
			
 
				 			if (STARPU_UNLIKELY(cures))
			
 
				 				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 		    }
			
@@ -629,7 +629,7 @@ no_async_default:
 
				                                      nx*elemsize, ny, kind);
			
 
				 
			
 
				 		if (!cures)
			
 
				-			cures = cudaThreadSynchronize();
			
 
				+			cures = cudaDeviceSynchronize();
			
 
				 		if (STARPU_UNLIKELY(cures))
			
 
				 			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 	    }
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -1178,7 +1178,7 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 
				 		}
			
 
				 
			
 
				 		if (!cures)
			
 
				-			cures = cudaThreadSynchronize();
			
 
				+			cures = cudaDeviceSynchronize();
			
 
				 		if (STARPU_UNLIKELY(cures))
			
 
				 			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
--- a/starpufft/tests/testx.c
+++ b/starpufft/tests/testx.c
@@ -243,7 +243,7 @@ int main(int argc, char *argv[])
 
				 	gettimeofday(&begin, NULL);
			
 
				 	if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS)
			
 
				 		printf("erf2\n");
			
 
				-	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
			
 
				+	if ((cures = cudaDeviceSynchronize()) != cudaSuccess)
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 	gettimeofday(&end, NULL);
			
 
				 	cufftDestroy(cuda_plan);
			
--- a/tests/datawizard/gpu_ptr_register.c
+++ b/tests/datawizard/gpu_ptr_register.c
@@ -146,7 +146,7 @@ test_cuda(void)
 
				 	starpu_cuda_set_device(devid);
			
 
				 	cures = cudaMemcpy(foo, foo_gpu, size * sizeof(*foo_gpu), cudaMemcpyDeviceToHost);
			
 
				 	if (!cures)
			
 
				-		cures = cudaThreadSynchronize();
			
 
				+		cures = cudaDeviceSynchronize();
			
 
				 	if (STARPU_UNLIKELY(cures))
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
--- a/tests/datawizard/gpu_register.c
+++ b/tests/datawizard/gpu_register.c
@@ -115,7 +115,7 @@ test_cuda(void)
 
				 
			
 
				 	cures = cudaMemcpy(foo_gpu, foo, size * sizeof(*foo_gpu), cudaMemcpyHostToDevice);
			
 
				 	if (!cures)
			
 
				-		cures = cudaThreadSynchronize();
			
 
				+		cures = cudaDeviceSynchronize();
			
 
				 	if (STARPU_UNLIKELY(cures))
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
@@ -153,7 +153,7 @@ test_cuda(void)
 
				 	starpu_cuda_set_device(devid);
			
 
				 	cures = cudaMemcpy(foo, foo_gpu, size * sizeof(*foo_gpu), cudaMemcpyDeviceToHost);
			
 
				 	if (!cures)
			
 
				-		cures = cudaThreadSynchronize();
			
 
				+		cures = cudaDeviceSynchronize();
			
 
				 	if (STARPU_UNLIKELY(cures))
			
 
				 	{
			
 
				 		starpu_free_on_node(starpu_worker_get_memory_node(chosen), (uintptr_t) foo_gpu, size * sizeof(*foo_gpu));
			
--- a/tests/experiments/latency/cuda_latency.c
+++ b/tests/experiments/latency/cuda_latency.c
@@ -63,7 +63,7 @@ void send_data(unsigned src, unsigned dst)
 
				 	cures = cudaMemcpy(cpu_buffer, gpu_buffer[src], buffer_size, cudaMemcpyDeviceToHost);
			
 
				 	STARPU_ASSERT(!cures);
			
 
				 
			
 
				-	cures = cudaThreadSynchronize();
			
 
				+	cures = cudaDeviceSynchronize();
			
 
				 	STARPU_ASSERT(!cures);
			
 
				 #endif
			
 
				 #endif
			
@@ -101,7 +101,7 @@ void recv_data(unsigned src, unsigned dst)
 
				 	cures = cudaMemcpy(gpu_buffer[dst], cpu_buffer, buffer_size, cudaMemcpyHostToDevice);
			
 
				 	STARPU_ASSERT(!cures);
			
 
				 
			
 
				-	cures = cudaThreadSynchronize();
			
 
				+	cures = cudaDeviceSynchronize();
			
 
				 	STARPU_ASSERT(!cures);
			
 
				 #endif
			
 
				 #endif
			
@@ -127,7 +127,7 @@ void *launch_gpu_thread(void *arg)
 
				 		cudaError_t cures;
			
 
				 		cures = cudaHostAlloc(&cpu_buffer, buffer_size, cudaHostAllocPortable);
			
 
				 		STARPU_ASSERT(!cures);
			
 
				-		cudaThreadSynchronize();
			
 
				+		cudaDeviceSynchronize();
			
 
				 	}
			
 
				 
			
 
				 	nready_gpu++;