瀏覽代碼

Make cufft plans during starpufft plan, not lazily (since that'd break performance measurements)

Samuel Thibault 13 年之前
父節點
當前提交
ae6b9df97e
共有 3 個文件被更改,包括 91 次插入和 54 次刪除
  1. 5 2
      examples/starpufft/starpufftx.c
  2. 42 28
      examples/starpufft/starpufftx1d.c
  3. 44 24
      examples/starpufft/starpufftx2d.c

+ 5 - 2
examples/starpufft/starpufftx.c

@@ -31,6 +31,9 @@
 
 #define _FFTW_FLAGS FFTW_ESTIMATE
 
+#define PARALLEL
+#ifdef PARALLEL
+
 enum steps {
 	SPECIAL, TWIST1, FFT1, JOIN, TWIST2, FFT2, TWIST3, END
 };
@@ -45,6 +48,8 @@ enum steps {
 
 #define I_BITS STEP_SHIFT
 
+#endif /* PARALLEL */
+
 enum type {
 	R2C,
 	C2R,
@@ -84,8 +89,6 @@ struct STARPUFFT(plan) {
 #ifdef STARPU_USE_CUDA
 		/* CUFFT plans */
 		cufftHandle plan1_cuda, plan2_cuda;
-		/* Whether the plans above are initialized */
-		int initialized1, initialized2;
 #endif
 #ifdef STARPU_HAVE_FFTW
 		/* FFTW plans */

+ 42 - 28
examples/starpufft/starpufftx1d.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -15,6 +15,11 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#define PARALLEL
+#ifdef PARALLEL
+
+/* Dumb parallel version (enabled via the PARALLEL define above) */
+
 #define DIV_1D 64
 
   /*
@@ -79,6 +84,20 @@ STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
  *
  * Perform one fft of size n2 */
 static void
+STARPUFFT(fft1_1d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
+	int n2 = plan->n2[0];
+	int workerid = starpu_worker_get_id();
+
+	cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cures = cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
 STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 {
 	struct STARPUFFT(args) *args = _args;
@@ -95,15 +114,6 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 
 	task_per_worker[workerid]++;
 
-	if (!plan->plans[workerid].initialized1) {
-		cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
-		cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
-
-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
-		plan->plans[workerid].initialized1 = 1;
-	}
-
 	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 
@@ -116,13 +126,26 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
  *
  * Perform n3 = n2/DIV_1D ffts of size n1 */
 static void
-STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
+STARPUFFT(fft2_1d_plan_gpu)(void *args)
 {
-	struct STARPUFFT(args) *args = _args;
-	STARPUFFT(plan) plan = args->plan;
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
 	int n1 = plan->n1[0];
 	int n2 = plan->n2[0];
 	int n3 = n2/DIV_1D;
+	int workerid = starpu_worker_get_id();
+
+	cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cures = cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
 	cufftResult cures;
 
 	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
@@ -132,15 +155,6 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 
 	task_per_worker[workerid]++;
 
-	if (!plan->plans[workerid].initialized2) {
-		cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
-		cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
-
-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
-		plan->plans[workerid].initialized2 = 1;
-	}
-
 	/* NOTE using batch support */
 	cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
@@ -380,6 +394,8 @@ static struct starpu_codelet STARPUFFT(twist3_1d_codelet) = {
 	.nbuffers = 1
 };
 
+#endif /* PARALLEL */
+
 /* Planning:
  *
  * - For each CPU worker, we need to plan the two fftw stages.
@@ -480,18 +496,16 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 #endif
 			break;
 		case STARPU_CUDA_WORKER:
-#ifdef STARPU_USE_CUDA
-			/* Perform CUFFT planning lazily. */
-			plan->plans[workerid].initialized1 = 0;
-			plan->plans[workerid].initialized2 = 0;
-#endif
-
 			break;
 		default:
 			STARPU_ABORT();
 			break;
 		}
 	}
+#ifdef STARPU_USE_CUDA
+	starpu_execute_on_each_worker(STARPUFFT(fft1_1d_plan_gpu), plan, STARPU_CUDA);
+	starpu_execute_on_each_worker(STARPUFFT(fft2_1d_plan_gpu), plan, STARPU_CUDA);
+#endif
 
 	/* Allocate buffers. */
 	plan->twisted1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted1));

+ 44 - 24
examples/starpufft/starpufftx2d.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -15,6 +15,8 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#define PARALLEL
+#ifdef PARALLEL
 #define DIV_2D_N 8
 #define DIV_2D_M 8
 
@@ -44,7 +46,24 @@ STARPUFFT(twist1_2d_kernel_gpu)(void *descr[], void *_args)
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
-/* Perform an n2,m2 fft */
+/* fft1:
+ *
+ * Perform one fft of size n2,m2 */
+static void
+STARPUFFT(fft1_2d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	int n2 = plan->n2[0];
+	int m2 = plan->n2[1];
+	int workerid = starpu_worker_get_id();
+	cufftResult cures;
+
+	cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cures = cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
 static void
 STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 {
@@ -65,15 +84,6 @@ STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 
 	task_per_worker[workerid]++;
 
-	if (!plan->plans[workerid].initialized1) {
-		cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
-		cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
-
-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
-		plan->plans[workerid].initialized1 = 1;
-	}
-
 	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 
@@ -83,6 +93,24 @@ STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
+/* fft2:
+ *
+ * Perform n3*m3 ffts of size n1,m1 */
+static void
+STARPUFFT(fft2_2d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	int n1 = plan->n1[0];
+	int m1 = plan->n1[1];
+	cufftResult cures;
+	int workerid = starpu_worker_get_id();
+
+	cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cures = cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
 static void
 STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 {
@@ -104,15 +132,6 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 
 	task_per_worker[workerid]++;
 
-	if (!plan->plans[workerid].initialized2) {
-		cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
-		cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
-
-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
-		plan->plans[workerid].initialized2 = 1;
-	}
-
 	for (n = 0; n < n3*m3; n++) {
 		cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in + n * n1*m1, out + n * n1*m1, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
@@ -362,6 +381,7 @@ static struct starpu_codelet STARPUFFT(twist3_2d_codelet) = {
 	.model = &STARPUFFT(twist3_2d_model),
 	.nbuffers = 1
 };
+#endif /* PARALLEL */
 
 STARPUFFT(plan)
 STARPUFFT(plan_dft_2d)(int n, int m, int sign, unsigned flags)
@@ -477,16 +497,16 @@ STARPUFFT(plan_dft_2d)(int n, int m, int sign, unsigned flags)
 #endif
 			break;
 		case STARPU_CUDA_WORKER:
-#ifdef STARPU_USE_CUDA
-			plan->plans[workerid].initialized1 = 0;
-			plan->plans[workerid].initialized2 = 0;
-#endif
 			break;
 		default:
 			STARPU_ABORT();
 			break;
 		}
 	}
+#ifdef STARPU_USE_CUDA
+	starpu_execute_on_each_worker(STARPUFFT(fft1_2d_plan_gpu), plan, STARPU_CUDA);
+	starpu_execute_on_each_worker(STARPUFFT(fft2_2d_plan_gpu), plan, STARPU_CUDA);
+#endif
 
 	plan->twisted1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted1));
 	memset(plan->twisted1, 0, plan->totsize * sizeof(*plan->twisted1));