15 年之前 · d0142a949b
--- a/examples/starpufft/cudax_kernels.cu
+++ b/examples/starpufft/cudax_kernels.cu
@@ -23,18 +23,18 @@
 
				 	unsigned start = threadIdx.x + blockIdx.x * blockDim.x; \
			
 
				 	unsigned numthreads = blockDim.x * gridDim.x;
			
 
				 
			
 
				-#define DISTRIB_1d(n, func,args,stream) \
			
 
				+#define DISTRIB_1d(n, func,args) \
			
 
				 	unsigned threads_per_block = 128; \
			
 
				 \
			
 
				 	if (n < threads_per_block) { \
			
 
				 		dim3 dimGrid(n); \
			
 
				-		func <<<dimGrid, 1, stream>>> args; \
			
 
				+		func <<<dimGrid, 1>>> args; \
			
 
				 	} else { \
			
 
				 		dim3 dimGrid(n / threads_per_block); \
			
 
				 		dim3 dimBlock(threads_per_block); \
			
 
				-		func <<<dimGrid, dimBlock, stream>>> args; \
			
 
				+		func <<<dimGrid, dimBlock>>> args; \
			
 
				 	} \
			
 
				-	cudaStreamSynchronize(stream); \
			
 
				+	cudaThreadSynchronize(); \
			
 
				 
			
 
				 extern "C" __global__ void
			
 
				 STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
			
@@ -48,9 +48,9 @@ STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i
 
				 }
			
 
				 
			
 
				 extern "C" void
			
 
				-STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2, cudaStream_t stream)
			
 
				+STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
			
 
				 {
			
 
				-	DISTRIB_1d(n2, STARPUFFT(cuda_twist1_1d), (in, twisted1, i, n1, n2), stream);
			
 
				+	DISTRIB_1d(n2, STARPUFFT(cuda_twist1_1d), (in, twisted1, i, n1, n2));
			
 
				 }
			
 
				 
			
 
				 extern "C" __global__ void
			
@@ -66,9 +66,9 @@ STARPUFFT(cuda_twiddle_1d)(_cuComplex * out, const _cuComplex * roots, unsigned
 
				 }
			
 
				 
			
 
				 extern "C" void
			
 
				-STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i, cudaStream_t stream)
			
 
				+STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i)
			
 
				 {
			
 
				-	DISTRIB_1d(n, STARPUFFT(cuda_twiddle_1d), (out, roots, n, i), stream);
			
 
				+	DISTRIB_1d(n, STARPUFFT(cuda_twiddle_1d), (out, roots, n, i));
			
 
				 }
			
 
				 
			
 
				 #define VARS_2d \
			
@@ -78,29 +78,29 @@ STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsign
 
				 	unsigned numthreadsy = blockDim.y * gridDim.y;
			
 
				 
			
 
				 /* FIXME: introduce threads_per_dim_n / m instead */
			
 
				-#define DISTRIB_2d(n, m, func, args, stream) \
			
 
				+#define DISTRIB_2d(n, m, func, args) \
			
 
				 	unsigned threads_per_dim = 16; \
			
 
				 	if (n < threads_per_dim) { \
			
 
				 		if (m < threads_per_dim) { \
			
 
				 			dim3 dimGrid(n, m); \
			
 
				-			func <<<dimGrid, 1, stream>>> args; \
			
 
				+			func <<<dimGrid, 1>>> args; \
			
 
				 		} else { \
			
 
				 			dim3 dimGrid(1, m / threads_per_dim); \
			
 
				 			dim3 dimBlock(n, threads_per_dim); \
			
 
				-			func <<<dimGrid, dimBlock, stream>>> args; \
			
 
				+			func <<<dimGrid, dimBlock>>> args; \
			
 
				 		} \
			
 
				 	} else {  \
			
 
				 		if (m < threads_per_dim) { \
			
 
				 			dim3 dimGrid(n / threads_per_dim, 1); \
			
 
				 			dim3 dimBlock(threads_per_dim, m); \
			
 
				-			func <<<dimGrid, dimBlock, stream>>> args; \
			
 
				+			func <<<dimGrid, dimBlock>>> args; \
			
 
				 		} else { \
			
 
				 			dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
			
 
				 			dim3 dimBlock(threads_per_dim, threads_per_dim); \
			
 
				-			func <<<dimGrid, dimBlock, stream>>> args; \
			
 
				+			func <<<dimGrid, dimBlock>>> args; \
			
 
				 		} \
			
 
				 	} \
			
 
				-	cudaStreamSynchronize(stream);
			
 
				+	cudaThreadSynchronize(); \
			
 
				 
			
 
				 extern "C" __global__ void
			
 
				 STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
			
@@ -117,9 +117,9 @@ STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i
 
				 }
			
 
				 
			
 
				 extern "C" void
			
 
				-STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2, cudaStream_t stream)
			
 
				+STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
			
 
				 {
			
 
				-	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twist1_2d), (in, twisted1, i, j, n1, n2, m1, m2), stream);
			
 
				+	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twist1_2d), (in, twisted1, i, j, n1, n2, m1, m2));
			
 
				 }
			
 
				 
			
 
				 extern "C" __global__ void
			
@@ -137,7 +137,7 @@ STARPUFFT(cuda_twiddle_2d)(_cuComplex * out, const _cuComplex * roots0, const _c
 
				 }
			
 
				 
			
 
				 extern "C" void
			
 
				-STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j, cudaStream_t stream)
			
 
				+STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j)
			
 
				 {
			
 
				-	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twiddle_2d), (out, roots0, roots1, n2, m2, i, j), stream);
			
 
				+	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twiddle_2d), (out, roots0, roots1, n2, m2, i, j));
			
 
				 }
			
--- a/examples/starpufft/cudax_kernels.h
+++ b/examples/starpufft/cudax_kernels.h
@@ -15,7 +15,7 @@
 
				  */
			
 
				 
			
 
				 #include <cuComplex.h>
			
 
				-_externC void STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2, cudaStream_t stream);
			
 
				-_externC void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i, cudaStream_t stream);
			
 
				-_externC void STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2, cudaStream_t stream);
			
 
				-_externC void STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j, cudaStream_t stream);
			
 
				+_externC void STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2);
			
 
				+_externC void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i);
			
 
				+_externC void STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2);
			
 
				+_externC void STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j);
			
--- a/examples/starpufft/starpufftx.c
+++ b/examples/starpufft/starpufftx.c
@@ -85,11 +85,6 @@ struct STARPUFFT(plan) {
 
				 		cufftHandle plan1_cuda, plan2_cuda;
			
 
				 		/* Whether the plans above are initialized */
			
 
				 		int initialized1, initialized2;
			
 
				-		/* The stream used on that GPU: FIXME: is this really still
			
 
				-		 * needed? */
			
 
				-		cudaStream_t stream;
			
 
				-		/* Whether the stream above is initialized */
			
 
				-		int stream_is_initialized;
			
 
				 #endif
			
 
				 #ifdef STARPU_HAVE_FFTW
			
 
				 		/* FFTW plans */
			
@@ -119,21 +114,6 @@ struct STARPUFFT(args) {
 
				 	int i, j, jj, kk, ll, *iv, *kkv;
			
 
				 };
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-cudaStream_t
			
 
				-STARPUFFT(get_local_stream)(STARPUFFT(plan) plan, int workerid)
			
 
				-{
			
 
				-	if (!plan->plans[workerid].stream_is_initialized)
			
 
				-	{
			
 
				-		cudaStreamCreate(&plan->plans[workerid].stream);
			
 
				-
			
 
				-		plan->plans[workerid].stream_is_initialized = 1;
			
 
				-	}
			
 
				-
			
 
				-	return plan->plans[workerid].stream;
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				 static void
			
 
				 check_dims(STARPUFFT(plan) plan)
			
 
				 {
			
--- a/examples/starpufft/starpufftx1d.c
+++ b/examples/starpufft/starpufftx1d.c
@@ -69,11 +69,9 @@ STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
 
				 	_cufftComplex * restrict in = (_cufftComplex *)STARPU_GET_VECTOR_PTR(descr[0]);
			
 
				 	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_GET_VECTOR_PTR(descr[1]);
			
 
				 	
			
 
				-	cudaStream_t stream = STARPUFFT(get_local_stream)(plan, starpu_worker_get_id());
			
 
				+	STARPUFFT(cuda_twist1_1d_host)(in, twisted1, i, n1, n2);
			
 
				 
			
 
				-	STARPUFFT(cuda_twist1_1d_host)(in, twisted1, i, n1, n2, stream);
			
 
				-
			
 
				-	cudaStreamSynchronize(stream);
			
 
				+	cudaThreadSynchronize();
			
 
				 }
			
 
				 
			
 
				 /* fft1:
			
@@ -96,26 +94,19 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 
				 
			
 
				 	task_per_worker[workerid]++;
			
 
				 
			
 
				-	cudaStream_t stream;
			
 
				-
			
 
				 	if (!plan->plans[workerid].initialized1) {
			
 
				 		cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
			
 
				 
			
 
				-		stream = STARPUFFT(get_local_stream)(plan, workerid);
			
 
				-		cufftSetStream(plan->plans[workerid].plan1_cuda, stream);
			
 
				-
			
 
				 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				 		plan->plans[workerid].initialized1 = 1;
			
 
				 	}
			
 
				 
			
 
				-	stream = plan->plans[workerid].stream;
			
 
				-
			
 
				 	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
			
 
				 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				 
			
 
				-	STARPUFFT(cuda_twiddle_1d_host)(out, roots, n2, i, stream);
			
 
				+	STARPUFFT(cuda_twiddle_1d_host)(out, roots, n2, i);
			
 
				 
			
 
				-	cudaStreamSynchronize(plan->plans[workerid].stream);
			
 
				+	cudaThreadSynchronize();
			
 
				 }
			
 
				 
			
 
				 /* fft2:
			
@@ -141,9 +132,6 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 
				 	if (!plan->plans[workerid].initialized2) {
			
 
				 		cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
			
 
				 
			
 
				-		cudaStream_t stream = STARPUFFT(get_local_stream)(plan, workerid);
			
 
				-		cufftSetStream(plan->plans[workerid].plan2_cuda, stream);
			
 
				-
			
 
				 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				 		plan->plans[workerid].initialized2 = 1;
			
 
				 	}
			
@@ -152,7 +140,7 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 
				 	cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
			
 
				 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				 
			
 
				-	cudaStreamSynchronize(plan->plans[workerid].stream);
			
 
				+	cudaThreadSynchronize();
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/examples/starpufft/starpufftx2d.c
+++ b/examples/starpufft/starpufftx2d.c
@@ -39,10 +39,8 @@ STARPUFFT(twist1_2d_kernel_gpu)(void *descr[], void *_args)
 
				 	_cufftComplex * restrict in = (_cufftComplex *)STARPU_GET_VECTOR_PTR(descr[0]);
			
 
				 	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_GET_VECTOR_PTR(descr[1]);
			
 
				 
			
 
				-	cudaStream_t stream = STARPUFFT(get_local_stream)(plan, starpu_worker_get_id());
			
 
				-
			
 
				-	STARPUFFT(cuda_twist1_2d_host)(in, twisted1, i, j, n1, n2, m1, m2, stream);
			
 
				-	cudaStreamSynchronize(stream);
			
 
				+	STARPUFFT(cuda_twist1_2d_host)(in, twisted1, i, j, n1, n2, m1, m2);
			
 
				+	cudaThreadSynchronize();
			
 
				 }
			
 
				 
			
 
				 /* Perform an n2,m2 fft */
			
@@ -66,27 +64,20 @@ STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 
				 
			
 
				 	task_per_worker[workerid]++;
			
 
				 
			
 
				-	cudaStream_t stream;
			
 
				-
			
 
				 	if (!plan->plans[workerid].initialized1) {
			
 
				 		cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
			
 
				 
			
 
				-		stream = STARPUFFT(get_local_stream)(plan, workerid);
			
 
				-		cufftSetStream(plan->plans[workerid].plan1_cuda, stream);
			
 
				-
			
 
				 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				 		plan->plans[workerid].initialized1 = 1;
			
 
				 	}
			
 
				 
			
 
				-	stream = plan->plans[workerid].stream;
			
 
				-
			
 
				 	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
			
 
				 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				 
			
 
				 	/* synchronization is done after the twiddling */
			
 
				-	STARPUFFT(cuda_twiddle_2d_host)(out, roots0, roots1, n2, m2, i, j, stream);
			
 
				+	STARPUFFT(cuda_twiddle_2d_host)(out, roots0, roots1, n2, m2, i, j);
			
 
				 
			
 
				-	cudaStreamSynchronize(stream);
			
 
				+	cudaThreadSynchronize();
			
 
				 }
			
 
				 
			
 
				 static void
			
@@ -113,9 +104,6 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 
				 	if (!plan->plans[workerid].initialized2) {
			
 
				 		cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
			
 
				 
			
 
				-		cudaStream_t stream = STARPUFFT(get_local_stream)(plan, workerid);
			
 
				-		cufftSetStream(plan->plans[workerid].plan2_cuda, stream);
			
 
				-
			
 
				 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				 		plan->plans[workerid].initialized2 = 1;
			
 
				 	}
			
@@ -125,7 +113,7 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 
				 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				 	}
			
 
				 
			
 
				-	cudaStreamSynchronize(plan->plans[workerid].stream);
			
 
				+	cudaThreadSynchronize();
			
 
				 }
			
 
				 #endif