13 年之前 · ae6b9df97e
--- a/examples/starpufft/starpufftx.c
+++ b/examples/starpufft/starpufftx.c
@@ -31,6 +31,9 @@
 
																 #define _FFTW_FLAGS FFTW_ESTIMATE
															
 
																+#define PARALLEL
															
 
																+#ifdef PARALLEL
															
 
																+
															
 
																 enum steps {
															
 
																 	SPECIAL, TWIST1, FFT1, JOIN, TWIST2, FFT2, TWIST3, END
															
 
																 };
															
@@ -45,6 +48,8 @@ enum steps {
 
																 #define I_BITS STEP_SHIFT
															
 
																+#endif /* PARALLEL */
															
 
																+
															
 
																 enum type {
															
 
																 	R2C,
															
 
																 	C2R,
															
@@ -84,8 +89,6 @@ struct STARPUFFT(plan) {
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		/* CUFFT plans */
															
 
																 		cufftHandle plan1_cuda, plan2_cuda;
															
 
																-		/* Whether the plans above are initialized */
															
 
																-		int initialized1, initialized2;
															
 
																 #endif
															
 
																 #ifdef STARPU_HAVE_FFTW
															
 
																 		/* FFTW plans */
															
--- a/examples/starpufft/starpufftx1d.c
+++ b/examples/starpufft/starpufftx1d.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -15,6 +15,11 @@
 
																  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																  */
															
 
																+#define PARALLEL
															
 
																+#ifdef PARALLEL
															
 
																+
															
 
																+/* Dumb parallel version, disabled */
															
 
																+
															
 
																 #define DIV_1D 64
															
 
																   /*
															
@@ -79,6 +84,20 @@ STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
 
																  *
															
 
																  * Perform one fft of size n2 */
															
 
																 static void
															
 
																+STARPUFFT(fft1_1d_plan_gpu)(void *args)
															
 
																+{
															
 
																+	STARPUFFT(plan) plan = args;
															
 
																+	cufftResult cures;
															
 
																+	int n2 = plan->n2[0];
															
 
																+	int workerid = starpu_worker_get_id();
															
 
																+
															
 
																+	cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
															
 
																+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																+	cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
															
 
																+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																+}
															
 
																+
															
 
																+static void
															
 
																 STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
															
 
																 {
															
 
																 	struct STARPUFFT(args) *args = _args;
															
@@ -95,15 +114,6 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 
																 	task_per_worker[workerid]++;
															
 
																-	if (!plan->plans[workerid].initialized1) {
															
 
																-		cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
															
 
																-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																-		cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
															
 
																-
															
 
																-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																-		plan->plans[workerid].initialized1 = 1;
															
 
																-	}
															
 
																-
															
 
																 	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
															
 
																 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
@@ -116,13 +126,26 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 
																  *
															
 
																  * Perform n3 = n2/DIV_1D ffts of size n1 */
															
 
																 static void
															
 
																-STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
															
 
																+STARPUFFT(fft2_1d_plan_gpu)(void *args)
															
 
																 {
															
 
																-	struct STARPUFFT(args) *args = _args;
															
 
																-	STARPUFFT(plan) plan = args->plan;
															
 
																+	STARPUFFT(plan) plan = args;
															
 
																+	cufftResult cures;
															
 
																 	int n1 = plan->n1[0];
															
 
																 	int n2 = plan->n2[0];
															
 
																 	int n3 = n2/DIV_1D;
															
 
																+	int workerid = starpu_worker_get_id();
															
 
																+
															
 
																+	cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
															
 
																+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																+	cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
															
 
																+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																+}
															
 
																+
															
 
																+static void
															
 
																+STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
															
 
																+{
															
 
																+	struct STARPUFFT(args) *args = _args;
															
 
																+	STARPUFFT(plan) plan = args->plan;
															
 
																 	cufftResult cures;
															
 
																 	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
															
@@ -132,15 +155,6 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 
																 	task_per_worker[workerid]++;
															
 
																-	if (!plan->plans[workerid].initialized2) {
															
 
																-		cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
															
 
																-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																-		cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
															
 
																-
															
 
																-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																-		plan->plans[workerid].initialized2 = 1;
															
 
																-	}
															
 
																-
															
 
																 	/* NOTE using batch support */
															
 
																 	cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
															
 
																 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
@@ -380,6 +394,8 @@ static struct starpu_codelet STARPUFFT(twist3_1d_codelet) = {
 
																 	.nbuffers = 1
															
 
																 };
															
 
																+#endif /* PARALLEL */
															
 
																+
															
 
																 /* Planning:
															
 
																  *
															
 
																  * - For each CPU worker, we need to plan the two fftw stages.
															
@@ -480,18 +496,16 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 
																 #endif
															
 
																 			break;
															
 
																 		case STARPU_CUDA_WORKER:
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-			/* Perform CUFFT planning lazily. */
															
 
																-			plan->plans[workerid].initialized1 = 0;
															
 
																-			plan->plans[workerid].initialized2 = 0;
															
 
																-#endif
															
 
																-
															
 
																 			break;
															
 
																 		default:
															
 
																 			STARPU_ABORT();
															
 
																 			break;
															
 
																 		}
															
 
																 	}
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	starpu_execute_on_each_worker(STARPUFFT(fft1_1d_plan_gpu), plan, STARPU_CUDA);
															
 
																+	starpu_execute_on_each_worker(STARPUFFT(fft2_1d_plan_gpu), plan, STARPU_CUDA);
															
 
																+#endif
															
 
																 	/* Allocate buffers. */
															
 
																 	plan->twisted1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted1));
															
--- a/examples/starpufft/starpufftx2d.c
+++ b/examples/starpufft/starpufftx2d.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -15,6 +15,8 @@
 
																  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																  */
															
 
																+#define PARALLEL
															
 
																+#ifdef PARALLEL
															
 
																 #define DIV_2D_N 8
															
 
																 #define DIV_2D_M 8
															
@@ -44,7 +46,24 @@ STARPUFFT(twist1_2d_kernel_gpu)(void *descr[], void *_args)
 
																 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
 
																-/* Perform an n2,m2 fft */
															
 
																+/* fft1:
															
 
																+ *
															
 
																+ * Perform one fft of size n2,m2 */
															
 
																+static void
															
 
																+STARPUFFT(fft1_2d_plan_gpu)(void *args)
															
 
																+{
															
 
																+	STARPUFFT(plan) plan = args;
															
 
																+	int n2 = plan->n2[0];
															
 
																+	int m2 = plan->n2[1];
															
 
																+	int workerid = starpu_worker_get_id();
															
 
																+	cufftResult cures;
															
 
																+
															
 
																+	cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
															
 
																+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																+	cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
															
 
																+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																+}
															
 
																+
															
 
																 static void
															
 
																 STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
															
 
																 {
															
@@ -65,15 +84,6 @@ STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 
																 	task_per_worker[workerid]++;
															
 
																-	if (!plan->plans[workerid].initialized1) {
															
 
																-		cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
															
 
																-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																-		cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
															
 
																-
															
 
																-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																-		plan->plans[workerid].initialized1 = 1;
															
 
																-	}
															
 
																-
															
 
																 	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
															
 
																 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
@@ -83,6 +93,24 @@ STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 
																 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
 
																+/* fft2:
															
 
																+ *
															
 
																+ * Perform n3*m3 ffts of size n1,m1 */
															
 
																+static void
															
 
																+STARPUFFT(fft2_2d_plan_gpu(void *args))
															
 
																+{
															
 
																+	STARPUFFT(plan) plan = args;
															
 
																+	int n1 = plan->n1[0];
															
 
																+	int m1 = plan->n1[1];
															
 
																+	cufftResult cures;
															
 
																+	int workerid = starpu_worker_get_id();
															
 
																+
															
 
																+	cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
															
 
																+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																+	cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
															
 
																+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																+}
															
 
																+
															
 
																 static void
															
 
																 STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
															
 
																 {
															
@@ -104,15 +132,6 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 
																 	task_per_worker[workerid]++;
															
 
																-	if (!plan->plans[workerid].initialized2) {
															
 
																-		cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
															
 
																-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																-		cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
															
 
																-
															
 
																-		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																-		plan->plans[workerid].initialized2 = 1;
															
 
																-	}
															
 
																-
															
 
																 	for (n = 0; n < n3*m3; n++) {
															
 
																 		cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in + n * n1*m1, out + n * n1*m1, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
															
 
																 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
@@ -362,6 +381,7 @@ static struct starpu_codelet STARPUFFT(twist3_2d_codelet) = {
 
																 	.model = &STARPUFFT(twist3_2d_model),
															
 
																 	.nbuffers = 1
															
 
																 };
															
 
																+#endif
															
 
																 STARPUFFT(plan)
															
 
																 STARPUFFT(plan_dft_2d)(int n, int m, int sign, unsigned flags)
															
@@ -477,16 +497,16 @@ STARPUFFT(plan_dft_2d)(int n, int m, int sign, unsigned flags)
 
																 #endif
															
 
																 			break;
															
 
																 		case STARPU_CUDA_WORKER:
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-			plan->plans[workerid].initialized1 = 0;
															
 
																-			plan->plans[workerid].initialized2 = 0;
															
 
																-#endif
															
 
																 			break;
															
 
																 		default:
															
 
																 			STARPU_ABORT();
															
 
																 			break;
															
 
																 		}
															
 
																 	}
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	starpu_execute_on_each_worker(STARPUFFT(fft1_2d_plan_gpu), plan, STARPU_CUDA);
															
 
																+	starpu_execute_on_each_worker(STARPUFFT(fft2_2d_plan_gpu), plan, STARPU_CUDA);
															
 
																+#endif
															
 
																 	plan->twisted1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted1));
															
 
																 	memset(plan->twisted1, 0, plan->totsize * sizeof(*plan->twisted1));