Quellcode durchsuchen

do not use a separate stream for computations. this is not needed any more

Samuel Thibault vor 15 Jahren
Ursprung
Commit
d0142a949b

+ 18 - 18
examples/starpufft/cudax_kernels.cu

@@ -23,18 +23,18 @@
 	unsigned start = threadIdx.x + blockIdx.x * blockDim.x; \
 	unsigned numthreads = blockDim.x * gridDim.x;
 
-#define DISTRIB_1d(n, func,args,stream) \
+#define DISTRIB_1d(n, func,args) \
 	unsigned threads_per_block = 128; \
 \
 	if (n < threads_per_block) { \
 		dim3 dimGrid(n); \
-		func <<<dimGrid, 1, stream>>> args; \
+		func <<<dimGrid, 1>>> args; \
 	} else { \
 		dim3 dimGrid(n / threads_per_block); \
 		dim3 dimBlock(threads_per_block); \
-		func <<<dimGrid, dimBlock, stream>>> args; \
+		func <<<dimGrid, dimBlock>>> args; \
 	} \
-	cudaStreamSynchronize(stream); \
+	cudaThreadSynchronize(); \
 
 extern "C" __global__ void
 STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
@@ -48,9 +48,9 @@ STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i
 }
 
 extern "C" void
-STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2, cudaStream_t stream)
+STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
 {
-	DISTRIB_1d(n2, STARPUFFT(cuda_twist1_1d), (in, twisted1, i, n1, n2), stream);
+	DISTRIB_1d(n2, STARPUFFT(cuda_twist1_1d), (in, twisted1, i, n1, n2));
 }
 
 extern "C" __global__ void
@@ -66,9 +66,9 @@ STARPUFFT(cuda_twiddle_1d)(_cuComplex * out, const _cuComplex * roots, unsigned
 }
 
 extern "C" void
-STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i, cudaStream_t stream)
+STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i)
 {
-	DISTRIB_1d(n, STARPUFFT(cuda_twiddle_1d), (out, roots, n, i), stream);
+	DISTRIB_1d(n, STARPUFFT(cuda_twiddle_1d), (out, roots, n, i));
 }
 
 #define VARS_2d \
@@ -78,29 +78,29 @@ STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsign
 	unsigned numthreadsy = blockDim.y * gridDim.y;
 
 /* FIXME: introduce threads_per_dim_n / m instead */
-#define DISTRIB_2d(n, m, func, args, stream) \
+#define DISTRIB_2d(n, m, func, args) \
 	unsigned threads_per_dim = 16; \
 	if (n < threads_per_dim) { \
 		if (m < threads_per_dim) { \
 			dim3 dimGrid(n, m); \
-			func <<<dimGrid, 1, stream>>> args; \
+			func <<<dimGrid, 1>>> args; \
 		} else { \
 			dim3 dimGrid(1, m / threads_per_dim); \
 			dim3 dimBlock(n, threads_per_dim); \
-			func <<<dimGrid, dimBlock, stream>>> args; \
+			func <<<dimGrid, dimBlock>>> args; \
 		} \
 	} else {  \
 		if (m < threads_per_dim) { \
 			dim3 dimGrid(n / threads_per_dim, 1); \
 			dim3 dimBlock(threads_per_dim, m); \
-			func <<<dimGrid, dimBlock, stream>>> args; \
+			func <<<dimGrid, dimBlock>>> args; \
 		} else { \
 			dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
 			dim3 dimBlock(threads_per_dim, threads_per_dim); \
-			func <<<dimGrid, dimBlock, stream>>> args; \
+			func <<<dimGrid, dimBlock>>> args; \
 		} \
 	} \
-	cudaStreamSynchronize(stream);
+	cudaThreadSynchronize(); \
 
 extern "C" __global__ void
 STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
@@ -117,9 +117,9 @@ STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i
 }
 
 extern "C" void
-STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2, cudaStream_t stream)
+STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
 {
-	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twist1_2d), (in, twisted1, i, j, n1, n2, m1, m2), stream);
+	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twist1_2d), (in, twisted1, i, j, n1, n2, m1, m2));
 }
 
 extern "C" __global__ void
@@ -137,7 +137,7 @@ STARPUFFT(cuda_twiddle_2d)(_cuComplex * out, const _cuComplex * roots0, const _c
 }
 
 extern "C" void
-STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j, cudaStream_t stream)
+STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j)
 {
-	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twiddle_2d), (out, roots0, roots1, n2, m2, i, j), stream);
+	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twiddle_2d), (out, roots0, roots1, n2, m2, i, j));
 }

+ 4 - 4
examples/starpufft/cudax_kernels.h

@@ -15,7 +15,7 @@
  */
 
 #include <cuComplex.h>
-_externC void STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2, cudaStream_t stream);
-_externC void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i, cudaStream_t stream);
-_externC void STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2, cudaStream_t stream);
-_externC void STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j, cudaStream_t stream);
+_externC void STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2);
+_externC void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i);
+_externC void STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2);
+_externC void STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j);

+ 0 - 20
examples/starpufft/starpufftx.c

@@ -85,11 +85,6 @@ struct STARPUFFT(plan) {
 		cufftHandle plan1_cuda, plan2_cuda;
 		/* Whether the plans above are initialized */
 		int initialized1, initialized2;
-		/* The stream used on that GPU: FIXME: is this really still
-		 * needed? */
-		cudaStream_t stream;
-		/* Whether the stream above is initialized */
-		int stream_is_initialized;
 #endif
 #ifdef STARPU_HAVE_FFTW
 		/* FFTW plans */
@@ -119,21 +114,6 @@ struct STARPUFFT(args) {
 	int i, j, jj, kk, ll, *iv, *kkv;
 };
 
-#ifdef STARPU_USE_CUDA
-cudaStream_t
-STARPUFFT(get_local_stream)(STARPUFFT(plan) plan, int workerid)
-{
-	if (!plan->plans[workerid].stream_is_initialized)
-	{
-		cudaStreamCreate(&plan->plans[workerid].stream);
-
-		plan->plans[workerid].stream_is_initialized = 1;
-	}
-
-	return plan->plans[workerid].stream;
-}
-#endif
-
 static void
 check_dims(STARPUFFT(plan) plan)
 {

+ 5 - 17
examples/starpufft/starpufftx1d.c

@@ -69,11 +69,9 @@ STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
 	_cufftComplex * restrict in = (_cufftComplex *)STARPU_GET_VECTOR_PTR(descr[0]);
 	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_GET_VECTOR_PTR(descr[1]);
 	
-	cudaStream_t stream = STARPUFFT(get_local_stream)(plan, starpu_worker_get_id());
+	STARPUFFT(cuda_twist1_1d_host)(in, twisted1, i, n1, n2);
 
-	STARPUFFT(cuda_twist1_1d_host)(in, twisted1, i, n1, n2, stream);
-
-	cudaStreamSynchronize(stream);
+	cudaThreadSynchronize();
 }
 
 /* fft1:
@@ -96,26 +94,19 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 
 	task_per_worker[workerid]++;
 
-	cudaStream_t stream;
-
 	if (!plan->plans[workerid].initialized1) {
 		cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
 
-		stream = STARPUFFT(get_local_stream)(plan, workerid);
-		cufftSetStream(plan->plans[workerid].plan1_cuda, stream);
-
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		plan->plans[workerid].initialized1 = 1;
 	}
 
-	stream = plan->plans[workerid].stream;
-
 	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 
-	STARPUFFT(cuda_twiddle_1d_host)(out, roots, n2, i, stream);
+	STARPUFFT(cuda_twiddle_1d_host)(out, roots, n2, i);
 
-	cudaStreamSynchronize(plan->plans[workerid].stream);
+	cudaThreadSynchronize();
 }
 
 /* fft2:
@@ -141,9 +132,6 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 	if (!plan->plans[workerid].initialized2) {
 		cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
 
-		cudaStream_t stream = STARPUFFT(get_local_stream)(plan, workerid);
-		cufftSetStream(plan->plans[workerid].plan2_cuda, stream);
-
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		plan->plans[workerid].initialized2 = 1;
 	}
@@ -152,7 +140,7 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 	cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 
-	cudaStreamSynchronize(plan->plans[workerid].stream);
+	cudaThreadSynchronize();
 }
 #endif
 

+ 5 - 17
examples/starpufft/starpufftx2d.c

@@ -39,10 +39,8 @@ STARPUFFT(twist1_2d_kernel_gpu)(void *descr[], void *_args)
 	_cufftComplex * restrict in = (_cufftComplex *)STARPU_GET_VECTOR_PTR(descr[0]);
 	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_GET_VECTOR_PTR(descr[1]);
 
-	cudaStream_t stream = STARPUFFT(get_local_stream)(plan, starpu_worker_get_id());
-
-	STARPUFFT(cuda_twist1_2d_host)(in, twisted1, i, j, n1, n2, m1, m2, stream);
-	cudaStreamSynchronize(stream);
+	STARPUFFT(cuda_twist1_2d_host)(in, twisted1, i, j, n1, n2, m1, m2);
+	cudaThreadSynchronize();
 }
 
 /* Perform an n2,m2 fft */
@@ -66,27 +64,20 @@ STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 
 	task_per_worker[workerid]++;
 
-	cudaStream_t stream;
-
 	if (!plan->plans[workerid].initialized1) {
 		cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
 
-		stream = STARPUFFT(get_local_stream)(plan, workerid);
-		cufftSetStream(plan->plans[workerid].plan1_cuda, stream);
-
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		plan->plans[workerid].initialized1 = 1;
 	}
 
-	stream = plan->plans[workerid].stream;
-
 	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 
 	/* synchronization is done after the twiddling */
-	STARPUFFT(cuda_twiddle_2d_host)(out, roots0, roots1, n2, m2, i, j, stream);
+	STARPUFFT(cuda_twiddle_2d_host)(out, roots0, roots1, n2, m2, i, j);
 
-	cudaStreamSynchronize(stream);
+	cudaThreadSynchronize();
 }
 
 static void
@@ -113,9 +104,6 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 	if (!plan->plans[workerid].initialized2) {
 		cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
 
-		cudaStream_t stream = STARPUFFT(get_local_stream)(plan, workerid);
-		cufftSetStream(plan->plans[workerid].plan2_cuda, stream);
-
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		plan->plans[workerid].initialized2 = 1;
 	}
@@ -125,7 +113,7 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 	}
 
-	cudaStreamSynchronize(plan->plans[workerid].stream);
+	cudaThreadSynchronize();
 }
 #endif