лет назад: 16 · d0142a949b
--- a/examples/starpufft/cudax_kernels.cu
+++ b/examples/starpufft/cudax_kernels.cu
@@ -23,18 +23,18 @@
 
																 	unsigned start = threadIdx.x + blockIdx.x * blockDim.x; \
															
 
																 	unsigned numthreads = blockDim.x * gridDim.x;
															
 
																-#define DISTRIB_1d(n, func,args,stream) \
															
 
																+#define DISTRIB_1d(n, func,args) \
															
 
																 	unsigned threads_per_block = 128; \
															
 
																 \
															
 
																 	if (n < threads_per_block) { \
															
 
																 		dim3 dimGrid(n); \
															
 
																-		func <<<dimGrid, 1, stream>>> args; \
															
 
																+		func <<<dimGrid, 1>>> args; \
															
 
																 	} else { \
															
 
																 		dim3 dimGrid(n / threads_per_block); \
															
 
																 		dim3 dimBlock(threads_per_block); \
															
 
																-		func <<<dimGrid, dimBlock, stream>>> args; \
															
 
																+		func <<<dimGrid, dimBlock>>> args; \
															
 
																 	} \
															
 
																-	cudaStreamSynchronize(stream); \
															
 
																+	cudaThreadSynchronize(); \
															
 
																 extern "C" __global__ void
															
 
																 STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
															
@@ -48,9 +48,9 @@ STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i
 
																 }
															
 
																 extern "C" void
															
 
																-STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2, cudaStream_t stream)
															
 
																+STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
															
 
																 {
															
 
																-	DISTRIB_1d(n2, STARPUFFT(cuda_twist1_1d), (in, twisted1, i, n1, n2), stream);
															
 
																+	DISTRIB_1d(n2, STARPUFFT(cuda_twist1_1d), (in, twisted1, i, n1, n2));
															
 
																 }
															
 
																 extern "C" __global__ void
															
@@ -66,9 +66,9 @@ STARPUFFT(cuda_twiddle_1d)(_cuComplex * out, const _cuComplex * roots, unsigned
 
																 }
															
 
																 extern "C" void
															
 
																-STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i, cudaStream_t stream)
															
 
																+STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i)
															
 
																 {
															
 
																-	DISTRIB_1d(n, STARPUFFT(cuda_twiddle_1d), (out, roots, n, i), stream);
															
 
																+	DISTRIB_1d(n, STARPUFFT(cuda_twiddle_1d), (out, roots, n, i));
															
 
																 }
															
 
																 #define VARS_2d \
															
@@ -78,29 +78,29 @@ STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsign
 
																 	unsigned numthreadsy = blockDim.y * gridDim.y;
															
 
																 /* FIXME: introduce threads_per_dim_n / m instead */
															
 
																-#define DISTRIB_2d(n, m, func, args, stream) \
															
 
																+#define DISTRIB_2d(n, m, func, args) \
															
 
																 	unsigned threads_per_dim = 16; \
															
 
																 	if (n < threads_per_dim) { \
															
 
																 		if (m < threads_per_dim) { \
															
 
																 			dim3 dimGrid(n, m); \
															
 
																-			func <<<dimGrid, 1, stream>>> args; \
															
 
																+			func <<<dimGrid, 1>>> args; \
															
 
																 		} else { \
															
 
																 			dim3 dimGrid(1, m / threads_per_dim); \
															
 
																 			dim3 dimBlock(n, threads_per_dim); \
															
 
																-			func <<<dimGrid, dimBlock, stream>>> args; \
															
 
																+			func <<<dimGrid, dimBlock>>> args; \
															
 
																 		} \
															
 
																 	} else {  \
															
 
																 		if (m < threads_per_dim) { \
															
 
																 			dim3 dimGrid(n / threads_per_dim, 1); \
															
 
																 			dim3 dimBlock(threads_per_dim, m); \
															
 
																-			func <<<dimGrid, dimBlock, stream>>> args; \
															
 
																+			func <<<dimGrid, dimBlock>>> args; \
															
 
																 		} else { \
															
 
																 			dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
															
 
																 			dim3 dimBlock(threads_per_dim, threads_per_dim); \
															
 
																-			func <<<dimGrid, dimBlock, stream>>> args; \
															
 
																+			func <<<dimGrid, dimBlock>>> args; \
															
 
																 		} \
															
 
																 	} \
															
 
																-	cudaStreamSynchronize(stream);
															
 
																+	cudaThreadSynchronize(); \
															
 
																 extern "C" __global__ void
															
 
																 STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
															
@@ -117,9 +117,9 @@ STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i
 
																 }
															
 
																 extern "C" void
															
 
																-STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2, cudaStream_t stream)
															
 
																+STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
															
 
																 {
															
 
																-	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twist1_2d), (in, twisted1, i, j, n1, n2, m1, m2), stream);
															
 
																+	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twist1_2d), (in, twisted1, i, j, n1, n2, m1, m2));
															
 
																 }
															
 
																 extern "C" __global__ void
															
@@ -137,7 +137,7 @@ STARPUFFT(cuda_twiddle_2d)(_cuComplex * out, const _cuComplex * roots0, const _c
 
																 }
															
 
																 extern "C" void
															
 
																-STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j, cudaStream_t stream)
															
 
																+STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j)
															
 
																 {
															
 
																-	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twiddle_2d), (out, roots0, roots1, n2, m2, i, j), stream);
															
 
																+	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twiddle_2d), (out, roots0, roots1, n2, m2, i, j));
															
 
																 }
															
--- a/examples/starpufft/cudax_kernels.h
+++ b/examples/starpufft/cudax_kernels.h
@@ -15,7 +15,7 @@
 
																  */
															
 
																 #include <cuComplex.h>
															
 
																-_externC void STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2, cudaStream_t stream);
															
 
																-_externC void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i, cudaStream_t stream);
															
 
																-_externC void STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2, cudaStream_t stream);
															
 
																-_externC void STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j, cudaStream_t stream);
															
 
																+_externC void STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2);
															
 
																+_externC void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i);
															
 
																+_externC void STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2);
															
 
																+_externC void STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j);
															
--- a/examples/starpufft/starpufftx.c
+++ b/examples/starpufft/starpufftx.c
@@ -85,11 +85,6 @@ struct STARPUFFT(plan) {
 
																 		cufftHandle plan1_cuda, plan2_cuda;
															
 
																 		/* Whether the plans above are initialized */
															
 
																 		int initialized1, initialized2;
															
 
																-		/* The stream used on that GPU: FIXME: is this really still
															
 
																-		 * needed? */
															
 
																-		cudaStream_t stream;
															
 
																-		/* Whether the stream above is initialized */
															
 
																-		int stream_is_initialized;
															
 
																 #endif
															
 
																 #ifdef STARPU_HAVE_FFTW
															
 
																 		/* FFTW plans */
															
@@ -119,21 +114,6 @@ struct STARPUFFT(args) {
 
																 	int i, j, jj, kk, ll, *iv, *kkv;
															
 
																 };
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-cudaStream_t
															
 
																-STARPUFFT(get_local_stream)(STARPUFFT(plan) plan, int workerid)
															
 
																-{
															
 
																-	if (!plan->plans[workerid].stream_is_initialized)
															
 
																-	{
															
 
																-		cudaStreamCreate(&plan->plans[workerid].stream);
															
 
																-
															
 
																-		plan->plans[workerid].stream_is_initialized = 1;
															
 
																-	}
															
 
																-
															
 
																-	return plan->plans[workerid].stream;
															
 
																-}
															
 
																-#endif
															
 
																-
															
 
																 static void
															
 
																 check_dims(STARPUFFT(plan) plan)
															
 
																 {
															
--- a/examples/starpufft/starpufftx1d.c
+++ b/examples/starpufft/starpufftx1d.c
@@ -69,11 +69,9 @@ STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
 
																 	_cufftComplex * restrict in = (_cufftComplex *)STARPU_GET_VECTOR_PTR(descr[0]);
															
 
																 	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_GET_VECTOR_PTR(descr[1]);
															
 
																-	cudaStream_t stream = STARPUFFT(get_local_stream)(plan, starpu_worker_get_id());
															
 
																+	STARPUFFT(cuda_twist1_1d_host)(in, twisted1, i, n1, n2);
															
 
																-	STARPUFFT(cuda_twist1_1d_host)(in, twisted1, i, n1, n2, stream);
															
 
																-
															
 
																-	cudaStreamSynchronize(stream);
															
 
																+	cudaThreadSynchronize();
															
 
																 }
															
 
																 /* fft1:
															
@@ -96,26 +94,19 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 
																 	task_per_worker[workerid]++;
															
 
																-	cudaStream_t stream;
															
 
																-
															
 
																 	if (!plan->plans[workerid].initialized1) {
															
 
																 		cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
															
 
																-		stream = STARPUFFT(get_local_stream)(plan, workerid);
															
 
																-		cufftSetStream(plan->plans[workerid].plan1_cuda, stream);
															
 
																-
															
 
																 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																 		plan->plans[workerid].initialized1 = 1;
															
 
																 	}
															
 
																-	stream = plan->plans[workerid].stream;
															
 
																-
															
 
																 	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
															
 
																 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																-	STARPUFFT(cuda_twiddle_1d_host)(out, roots, n2, i, stream);
															
 
																+	STARPUFFT(cuda_twiddle_1d_host)(out, roots, n2, i);
															
 
																-	cudaStreamSynchronize(plan->plans[workerid].stream);
															
 
																+	cudaThreadSynchronize();
															
 
																 }
															
 
																 /* fft2:
															
@@ -141,9 +132,6 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 
																 	if (!plan->plans[workerid].initialized2) {
															
 
																 		cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
															
 
																-		cudaStream_t stream = STARPUFFT(get_local_stream)(plan, workerid);
															
 
																-		cufftSetStream(plan->plans[workerid].plan2_cuda, stream);
															
 
																-
															
 
																 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																 		plan->plans[workerid].initialized2 = 1;
															
 
																 	}
															
@@ -152,7 +140,7 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 
																 	cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
															
 
																 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																-	cudaStreamSynchronize(plan->plans[workerid].stream);
															
 
																+	cudaThreadSynchronize();
															
 
																 }
															
 
																 #endif
															
--- a/examples/starpufft/starpufftx2d.c
+++ b/examples/starpufft/starpufftx2d.c
@@ -39,10 +39,8 @@ STARPUFFT(twist1_2d_kernel_gpu)(void *descr[], void *_args)
 
																 	_cufftComplex * restrict in = (_cufftComplex *)STARPU_GET_VECTOR_PTR(descr[0]);
															
 
																 	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_GET_VECTOR_PTR(descr[1]);
															
 
																-	cudaStream_t stream = STARPUFFT(get_local_stream)(plan, starpu_worker_get_id());
															
 
																-
															
 
																-	STARPUFFT(cuda_twist1_2d_host)(in, twisted1, i, j, n1, n2, m1, m2, stream);
															
 
																-	cudaStreamSynchronize(stream);
															
 
																+	STARPUFFT(cuda_twist1_2d_host)(in, twisted1, i, j, n1, n2, m1, m2);
															
 
																+	cudaThreadSynchronize();
															
 
																 }
															
 
																 /* Perform an n2,m2 fft */
															
@@ -66,27 +64,20 @@ STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 
																 	task_per_worker[workerid]++;
															
 
																-	cudaStream_t stream;
															
 
																-
															
 
																 	if (!plan->plans[workerid].initialized1) {
															
 
																 		cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
															
 
																-		stream = STARPUFFT(get_local_stream)(plan, workerid);
															
 
																-		cufftSetStream(plan->plans[workerid].plan1_cuda, stream);
															
 
																-
															
 
																 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																 		plan->plans[workerid].initialized1 = 1;
															
 
																 	}
															
 
																-	stream = plan->plans[workerid].stream;
															
 
																-
															
 
																 	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
															
 
																 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																 	/* synchronization is done after the twiddling */
															
 
																-	STARPUFFT(cuda_twiddle_2d_host)(out, roots0, roots1, n2, m2, i, j, stream);
															
 
																+	STARPUFFT(cuda_twiddle_2d_host)(out, roots0, roots1, n2, m2, i, j);
															
 
																-	cudaStreamSynchronize(stream);
															
 
																+	cudaThreadSynchronize();
															
 
																 }
															
 
																 static void
															
@@ -113,9 +104,6 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 
																 	if (!plan->plans[workerid].initialized2) {
															
 
																 		cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
															
 
																-		cudaStream_t stream = STARPUFFT(get_local_stream)(plan, workerid);
															
 
																-		cufftSetStream(plan->plans[workerid].plan2_cuda, stream);
															
 
																-
															
 
																 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																 		plan->plans[workerid].initialized2 = 1;
															
 
																 	}
															
@@ -125,7 +113,7 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 
																 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																 	}
															
 
																-	cudaStreamSynchronize(plan->plans[workerid].stream);
															
 
																+	cudaThreadSynchronize();
															
 
																 }
															
 
																 #endif