Browse Source

add more comments to starpufft

Samuel Thibault 15 years ago
parent
commit
46ed34d75a

+ 1 - 0
examples/starpufft/starpufft_common.c

@@ -16,4 +16,5 @@
 
 
 #include "starpufft.h"
 #include "starpufft.h"
 
 
+/* Used as an identifier in starpu tags to let plans run concurrently */
 int starpufft_last_plan_number;
 int starpufft_last_plan_number;

+ 15 - 7
examples/starpufft/starpufftx.c

@@ -78,29 +78,39 @@ struct STARPUFFT(plan) {
 	STARPUFFT(complex) *roots[2];
 	STARPUFFT(complex) *roots[2];
 	starpu_data_handle roots_handle[2];
 	starpu_data_handle roots_handle[2];
 
 
+	/* For each worker, we need some data */
 	struct {
 	struct {
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
+		/* CUFFT plans */
 		cufftHandle plan1_cuda, plan2_cuda;
 		cufftHandle plan1_cuda, plan2_cuda;
+		/* Whether the plans above are initialized */
 		int initialized1, initialized2;
 		int initialized1, initialized2;
+		/* The stream used on that GPU: FIXME: is this really still
+		 * needed? */
 		cudaStream_t stream;
 		cudaStream_t stream;
+		/* Whether the stream above is initialized */
 		int stream_is_initialized;
 		int stream_is_initialized;
 #endif
 #endif
 #ifdef STARPU_HAVE_FFTW
 #ifdef STARPU_HAVE_FFTW
+		/* FFTW plans */
 		_fftw_plan plan1_cpu, plan2_cpu;
 		_fftw_plan plan1_cpu, plan2_cpu;
+		/* Buffers used by the plans above */
 		_fftw_complex *in1, *out1;
 		_fftw_complex *in1, *out1;
 		_fftw_complex *in2, *out2;
 		_fftw_complex *in2, *out2;
 #endif
 #endif
 	} plans[STARPU_NMAXWORKERS];
 	} plans[STARPU_NMAXWORKERS];
 
 
-#ifdef STARPU_HAVE_FFTW
-	_fftw_plan plan_gather;
-#endif
-
+	/* Buffers for codelets */
 	STARPUFFT(complex) *in, *twisted1, *fft1, *twisted2, *fft2, *out;
 	STARPUFFT(complex) *in, *twisted1, *fft1, *twisted2, *fft2, *out;
 
 
+	/* corresponding starpu DSM handles */
 	starpu_data_handle in_handle, *twisted1_handle, *fft1_handle, *twisted2_handle, *fft2_handle;
 	starpu_data_handle in_handle, *twisted1_handle, *fft1_handle, *twisted2_handle, *fft2_handle;
+
+	/* Tasks */
 	struct starpu_task **twist1_tasks, **fft1_tasks, **twist2_tasks, **fft2_tasks, **twist3_tasks;
 	struct starpu_task **twist1_tasks, **fft1_tasks, **twist2_tasks, **fft2_tasks, **twist3_tasks;
 	struct starpu_task *join_task, *end_task;
 	struct starpu_task *join_task, *end_task;
+
+	/* Arguments for tasks */
 	struct STARPUFFT(args) *fft1_args, *fft2_args;
 	struct STARPUFFT(args) *fft1_args, *fft2_args;
 };
 };
 
 
@@ -227,6 +237,7 @@ STARPUFFT(execute)(STARPUFFT(plan) plan, void *in, void *out)
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 }
 }
 
 
+/* Destroy FFTW plans, unregister and free buffers, and free tags */
 void
 void
 STARPUFFT(destroy_plan)(STARPUFFT(plan) plan)
 STARPUFFT(destroy_plan)(STARPUFFT(plan) plan)
 {
 {
@@ -310,9 +321,6 @@ STARPUFFT(destroy_plan)(STARPUFFT(plan) plan)
 	STARPUFFT(free)(plan->fft1);
 	STARPUFFT(free)(plan->fft1);
 	STARPUFFT(free)(plan->twisted2);
 	STARPUFFT(free)(plan->twisted2);
 	STARPUFFT(free)(plan->fft2);
 	STARPUFFT(free)(plan->fft2);
-#ifdef STARPU_HAVE_FFTW
-	_FFTW(destroy_plan)(plan->plan_gather);
-#endif
 	free(plan);
 	free(plan);
 }
 }
 
 

+ 114 - 35
examples/starpufft/starpufftx1d.c

@@ -16,10 +16,47 @@
 
 
 #define DIV_1D 64
 #define DIV_1D 64
 
 
+  /*
+   * Overall strategy for an fft of size n:
+   * - perform n1 ffts of size n2
+   * - twiddle
+   * - perform n2 ffts of size n1
+   *
+   * - n1 defaults to DIV_1D, thus n2 defaults to n / DIV_1D.
+   *
+   * Precise tasks:
+   *
+   * - twist1: twist the whole n-element input (called "in") into n1 chunks of
+   *           size n2, by using n1 tasks taking the whole n-element input as a
+   *           R parameter and one n2 output as a W parameter. The result is
+   *           called twisted1.
+   * - fft1:   perform n1 (n2) ffts, by using n1 tasks doing one fft each. Also
+   *           twiddle the result to prepare for the fft2. The result is called
+   *           fft1.
+   * - join:   depends on all the fft1s, to gather the n1 results of size n2 in
+   *           the fft1 vector.
+   * - twist2: twist the fft1 vector into n2 chunks of size n1, called twisted2.
+   *           since n2 is typically very large, this step is divided in DIV_1D
+   *           tasks, each of them performing n2/DIV_1D of them
+   * - fft2:   perform n2 ffts of size n1. This is divided in DIV_1D tasks of
+   *           n2/DIV_1D ffts, to be performed in batches. The result is called
+   *           fft2.
+   * - twist3: twist back the result of the fft2s above into the output buffer.
+   *           Only implemented on CPUs for simplicity of the gathering.
+   *
+   * The tag space thus uses 3 dimensions:
+   * - the number of the plan.
+   * - the step (TWIST1, FFT1, JOIN, TWIST2, FFT2, TWIST3, END)
+   * - an index i between 0 and DIV_1D-1.
+   */
+
 #define STEP_TAG_1D(plan, step, i) _STEP_TAG(plan, step, i)
 #define STEP_TAG_1D(plan, step, i) _STEP_TAG(plan, step, i)
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-/* Twist the full vector into a n2 chunk */
+/* twist1:
+ *
+ * Twist the full input vector (first parameter) into one chunk of size n2
+ * (second parameter) */
 static void
 static void
 STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
 STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
 {
 {
@@ -39,7 +76,9 @@ STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
 	cudaStreamSynchronize(stream);
 	cudaStreamSynchronize(stream);
 }
 }
 
 
-/* Perform an n2 fft */
+/* fft1:
+ *
+ * Perform one fft of size n2 */
 static void
 static void
 STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 {
 {
@@ -77,6 +116,9 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 	cudaStreamSynchronize(plan->plans[workerid].stream);
 	cudaStreamSynchronize(plan->plans[workerid].stream);
 }
 }
 
 
+/* fft2:
+ *
+ * Perform n3 = n2/DIV_1D ffts of size n1 */
 static void
 static void
 STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 {
 {
@@ -110,7 +152,10 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 }
 }
 #endif
 #endif
 
 
-/* Twist the full vector into a n2 chunk */
+/* twist1:
+ *
+ * Twist the full input vector (first parameter) into one chunk of size n2
+ * (second parameter) */
 static void
 static void
 STARPUFFT(twist1_1d_kernel_cpu)(void *descr[], void *_args)
 STARPUFFT(twist1_1d_kernel_cpu)(void *descr[], void *_args)
 {
 {
@@ -131,7 +176,9 @@ STARPUFFT(twist1_1d_kernel_cpu)(void *descr[], void *_args)
 }
 }
 
 
 #ifdef STARPU_HAVE_FFTW
 #ifdef STARPU_HAVE_FFTW
-/* Perform an n2 fft */
+/* fft1:
+ *
+ * Perform one fft of size n2 */
 static void
 static void
 STARPUFFT(fft1_1d_kernel_cpu)(void *descr[], void *_args)
 STARPUFFT(fft1_1d_kernel_cpu)(void *descr[], void *_args)
 {
 {
@@ -153,12 +200,16 @@ STARPUFFT(fft1_1d_kernel_cpu)(void *descr[], void *_args)
 	memcpy(worker_in1, twisted1, plan->totsize2 * sizeof(*worker_in1));
 	memcpy(worker_in1, twisted1, plan->totsize2 * sizeof(*worker_in1));
 	_FFTW(execute)(plan->plans[workerid].plan1_cpu);
 	_FFTW(execute)(plan->plans[workerid].plan1_cpu);
 
 
+	/* twiddle while copying from fftw output buffer to fft1 buffer */
 	for (j = 0; j < n2; j++)
 	for (j = 0; j < n2; j++)
 		fft1[j] = worker_out1[j] * plan->roots[0][i*j];
 		fft1[j] = worker_out1[j] * plan->roots[0][i*j];
 }
 }
 #endif
 #endif
 
 
-/* Twist the full vector into a package of n2/DIV_1D (n1) chunks */
+/* twist2:
+ *
+ * Twist the full vector (results of the fft1s) into one package of n2/DIV_1D
+ * chunks of size n1 */
 static void
 static void
 STARPUFFT(twist2_1d_kernel_cpu)(void *descr[], void *_args)
 STARPUFFT(twist2_1d_kernel_cpu)(void *descr[], void *_args)
 {
 {
@@ -183,7 +234,9 @@ STARPUFFT(twist2_1d_kernel_cpu)(void *descr[], void *_args)
 }
 }
 
 
 #ifdef STARPU_HAVE_FFTW
 #ifdef STARPU_HAVE_FFTW
-/* Perform n2/DIV_1D (n1) ffts */
+/* fft2:
+ *
+ * Perform n3 = n2/DIV_1D ffts of size n1 */
 static void
 static void
 STARPUFFT(fft2_1d_kernel_cpu)(void *descr[], void *_args)
 STARPUFFT(fft2_1d_kernel_cpu)(void *descr[], void *_args)
 {
 {
@@ -207,7 +260,9 @@ STARPUFFT(fft2_1d_kernel_cpu)(void *descr[], void *_args)
 }
 }
 #endif
 #endif
 
 
-/* Spread the package of n2/DIV_1D (n1) chunks into the full vector */
+/* twist3:
+ *
+ * Spread the package of n2/DIV_1D chunks of size n1 into the output vector */
 static void
 static void
 STARPUFFT(twist3_1d_kernel_cpu)(void *descr[], void *_args)
 STARPUFFT(twist3_1d_kernel_cpu)(void *descr[], void *_args)
 {
 {
@@ -231,6 +286,7 @@ STARPUFFT(twist3_1d_kernel_cpu)(void *descr[], void *_args)
 	}
 	}
 }
 }
 
 
+/* Performance models for the 5 kinds of tasks */
 static struct starpu_perfmodel_t STARPUFFT(twist1_1d_model) = {
 static struct starpu_perfmodel_t STARPUFFT(twist1_1d_model) = {
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
 	.symbol = TYPE"twist1_1d"
 	.symbol = TYPE"twist1_1d"
@@ -256,6 +312,7 @@ static struct starpu_perfmodel_t STARPUFFT(twist3_1d_model) = {
 	.symbol = TYPE"twist3_1d"
 	.symbol = TYPE"twist3_1d"
 };
 };
 
 
+/* codelet pointers for the 5 kinds of tasks */
 static starpu_codelet STARPUFFT(twist1_1d_codelet) = {
 static starpu_codelet STARPUFFT(twist1_1d_codelet) = {
 	.where =
 	.where =
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -322,6 +379,16 @@ static starpu_codelet STARPUFFT(twist3_1d_codelet) = {
 	.nbuffers = 1
 	.nbuffers = 1
 };
 };
 
 
+/* Planning:
+ *
+ * - For each CPU worker, we need to plan the two fftw stages.
+ * - For GPU workers, we need to do the planning in the CUDA context, so we do
+ *   this lazily through the initialised1 and initialised2 flags ; TODO: use
+ *   starpu_execute_on_each_worker instead (done in the omp branch).
+ * - We allocate all the temporary buffers and register them to starpu.
+ * - We create all the tasks, but do not submit them yet. It will be possible
+ *   to reuse them at will to perform several ffts with the same planning.
+ */
 STARPUFFT(plan)
 STARPUFFT(plan)
 STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 {
 {
@@ -332,17 +399,6 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 	int z;
 	int z;
 	struct starpu_task *task;
 	struct starpu_task *task;
 
 
-	/*
-	 * Simple strategy:
-	 *
-	 * - twist1: twist input in n1 (n2) chunks
-	 * - fft1:   perform n1 (n2) ffts
-	 * - twist2: twist into n2 (n1) chunks distributed in
-	 *           DIV_1D groups
-	 * - fft2:   perform DIV_1D times n3 (n1) ffts
-	 * - twist3: twist back into output
-	 */
-
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	/* cufft 1D limited to 8M elements */
 	/* cufft 1D limited to 8M elements */
 	while (n2 > 8 << 20) {
 	while (n2 > 8 << 20) {
@@ -365,9 +421,10 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 
 
 	plan->number = STARPU_ATOMIC_ADD(&starpufft_last_plan_number, 1) - 1;
 	plan->number = STARPU_ATOMIC_ADD(&starpufft_last_plan_number, 1) - 1;
 
 
-	/* 4bit limitation in the tag space */
+	/* The plan number has a limited size */
 	STARPU_ASSERT(plan->number < (1ULL << NUMBER_BITS));
 	STARPU_ASSERT(plan->number < (1ULL << NUMBER_BITS));
 
 
+	/* Just one dimension */
 	plan->dim = 1;
 	plan->dim = 1;
 	plan->n = malloc(plan->dim * sizeof(*plan->n));
 	plan->n = malloc(plan->dim * sizeof(*plan->n));
 	plan->n[0] = n;
 	plan->n[0] = n;
@@ -378,6 +435,8 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 	plan->n1[0] = n1;
 	plan->n1[0] = n1;
 	plan->n2 = malloc(plan->dim * sizeof(*plan->n2));
 	plan->n2 = malloc(plan->dim * sizeof(*plan->n2));
 	plan->n2[0] = n2;
 	plan->n2[0] = n2;
+
+	/* Note: this is for coherency with the 2D case */
 	plan->totsize = n;
 	plan->totsize = n;
 	plan->totsize1 = n1;
 	plan->totsize1 = n1;
 	plan->totsize2 = n2;
 	plan->totsize2 = n2;
@@ -386,6 +445,7 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 	plan->type = C2C;
 	plan->type = C2C;
 	plan->sign = sign;
 	plan->sign = sign;
 
 
+	/* Compute the w^k just once. */
 	compute_roots(plan);
 	compute_roots(plan);
 
 
 	/* Initialize per-worker working set */
 	/* Initialize per-worker working set */
@@ -393,7 +453,9 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 		switch (starpu_worker_get_type(workerid)) {
 		switch (starpu_worker_get_type(workerid)) {
 		case STARPU_CPU_WORKER:
 		case STARPU_CPU_WORKER:
 #ifdef STARPU_HAVE_FFTW
 #ifdef STARPU_HAVE_FFTW
-			/* first fft plan: one n2 fft */
+			/* first fft plan: one fft of size n2.
+			 * FFTW imposes that buffer pointers are known at
+			 * planning time. */
 			plan->plans[workerid].in1 = _FFTW(malloc)(plan->totsize2 * sizeof(_fftw_complex));
 			plan->plans[workerid].in1 = _FFTW(malloc)(plan->totsize2 * sizeof(_fftw_complex));
 			memset(plan->plans[workerid].in1, 0, plan->totsize2 * sizeof(_fftw_complex));
 			memset(plan->plans[workerid].in1, 0, plan->totsize2 * sizeof(_fftw_complex));
 			plan->plans[workerid].out1 = _FFTW(malloc)(plan->totsize2 * sizeof(_fftw_complex));
 			plan->plans[workerid].out1 = _FFTW(malloc)(plan->totsize2 * sizeof(_fftw_complex));
@@ -401,7 +463,7 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 			plan->plans[workerid].plan1_cpu = _FFTW(plan_dft_1d)(n2, plan->plans[workerid].in1, plan->plans[workerid].out1, sign, _FFTW_FLAGS);
 			plan->plans[workerid].plan1_cpu = _FFTW(plan_dft_1d)(n2, plan->plans[workerid].in1, plan->plans[workerid].out1, sign, _FFTW_FLAGS);
 			STARPU_ASSERT(plan->plans[workerid].plan1_cpu);
 			STARPU_ASSERT(plan->plans[workerid].plan1_cpu);
 
 
-			/* second fft plan: n3 n1 ffts */
+			/* second fft plan: n3 ffts of size n1 */
 			plan->plans[workerid].in2 = _FFTW(malloc)(plan->totsize4 * sizeof(_fftw_complex));
 			plan->plans[workerid].in2 = _FFTW(malloc)(plan->totsize4 * sizeof(_fftw_complex));
 			memset(plan->plans[workerid].in2, 0, plan->totsize4 * sizeof(_fftw_complex));
 			memset(plan->plans[workerid].in2, 0, plan->totsize4 * sizeof(_fftw_complex));
 			plan->plans[workerid].out2 = _FFTW(malloc)(plan->totsize4 * sizeof(_fftw_complex));
 			plan->plans[workerid].out2 = _FFTW(malloc)(plan->totsize4 * sizeof(_fftw_complex));
@@ -418,9 +480,11 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 			break;
 			break;
 		case STARPU_CUDA_WORKER:
 		case STARPU_CUDA_WORKER:
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
+			/* Perform CUFFT planning lazily. */
 			plan->plans[workerid].initialized1 = 0;
 			plan->plans[workerid].initialized1 = 0;
 			plan->plans[workerid].initialized2 = 0;
 			plan->plans[workerid].initialized2 = 0;
 #endif
 #endif
+
 			break;
 			break;
 		default:
 		default:
 			STARPU_ABORT();
 			STARPU_ABORT();
@@ -428,6 +492,7 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 		}
 		}
 	}
 	}
 
 
+	/* Allocate buffers. */
 	plan->twisted1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted1));
 	plan->twisted1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted1));
 	memset(plan->twisted1, 0, plan->totsize * sizeof(*plan->twisted1));
 	memset(plan->twisted1, 0, plan->totsize * sizeof(*plan->twisted1));
 	plan->fft1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft1));
 	plan->fft1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft1));
@@ -437,21 +502,24 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 	plan->fft2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft2));
 	plan->fft2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft2));
 	memset(plan->fft2, 0, plan->totsize * sizeof(*plan->fft2));
 	memset(plan->fft2, 0, plan->totsize * sizeof(*plan->fft2));
 
 
+	/* Allocate handle arrays */
 	plan->twisted1_handle = malloc(plan->totsize1 * sizeof(*plan->twisted1_handle));
 	plan->twisted1_handle = malloc(plan->totsize1 * sizeof(*plan->twisted1_handle));
 	plan->fft1_handle = malloc(plan->totsize1 * sizeof(*plan->fft1_handle));
 	plan->fft1_handle = malloc(plan->totsize1 * sizeof(*plan->fft1_handle));
 	plan->twisted2_handle = malloc(plan->totsize3 * sizeof(*plan->twisted2_handle));
 	plan->twisted2_handle = malloc(plan->totsize3 * sizeof(*plan->twisted2_handle));
 	plan->fft2_handle = malloc(plan->totsize3 * sizeof(*plan->fft2_handle));
 	plan->fft2_handle = malloc(plan->totsize3 * sizeof(*plan->fft2_handle));
 
 
+	/* Allocate task arrays */
 	plan->twist1_tasks = malloc(plan->totsize1 * sizeof(*plan->twist1_tasks));
 	plan->twist1_tasks = malloc(plan->totsize1 * sizeof(*plan->twist1_tasks));
 	plan->fft1_tasks = malloc(plan->totsize1 * sizeof(*plan->fft1_tasks));
 	plan->fft1_tasks = malloc(plan->totsize1 * sizeof(*plan->fft1_tasks));
 	plan->twist2_tasks = malloc(plan->totsize3 * sizeof(*plan->twist2_tasks));
 	plan->twist2_tasks = malloc(plan->totsize3 * sizeof(*plan->twist2_tasks));
 	plan->fft2_tasks = malloc(plan->totsize3 * sizeof(*plan->fft2_tasks));
 	plan->fft2_tasks = malloc(plan->totsize3 * sizeof(*plan->fft2_tasks));
 	plan->twist3_tasks = malloc(plan->totsize3 * sizeof(*plan->twist3_tasks));
 	plan->twist3_tasks = malloc(plan->totsize3 * sizeof(*plan->twist3_tasks));
 
 
+	/* Allocate codelet argument arrays */
 	plan->fft1_args = malloc(plan->totsize1 * sizeof(*plan->fft1_args));
 	plan->fft1_args = malloc(plan->totsize1 * sizeof(*plan->fft1_args));
 	plan->fft2_args = malloc(plan->totsize3 * sizeof(*plan->fft2_args));
 	plan->fft2_args = malloc(plan->totsize3 * sizeof(*plan->fft2_args));
 
 
-	/* Create first-round tasks */
+	/* Create first-round tasks: DIV_1D tasks of type twist1 and fft1 */
 	for (z = 0; z < plan->totsize1; z++) {
 	for (z = 0; z < plan->totsize1; z++) {
 		int i = z;
 		int i = z;
 #define STEP_TAG(step)	STEP_TAG_1D(plan, step, i)
 #define STEP_TAG(step)	STEP_TAG_1D(plan, step, i)
@@ -459,17 +527,21 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 		plan->fft1_args[z].plan = plan;
 		plan->fft1_args[z].plan = plan;
 		plan->fft1_args[z].i = i;
 		plan->fft1_args[z].i = i;
 
 
-		/* Register (n2) chunks */
+		/* Register the twisted1 buffer of size n2. */
 		starpu_vector_data_register(&plan->twisted1_handle[z], 0, (uintptr_t) &plan->twisted1[z*plan->totsize2], plan->totsize2, sizeof(*plan->twisted1));
 		starpu_vector_data_register(&plan->twisted1_handle[z], 0, (uintptr_t) &plan->twisted1[z*plan->totsize2], plan->totsize2, sizeof(*plan->twisted1));
+		/* Register the fft1 buffer of size n2. */
 		starpu_vector_data_register(&plan->fft1_handle[z], 0, (uintptr_t) &plan->fft1[z*plan->totsize2], plan->totsize2, sizeof(*plan->fft1));
 		starpu_vector_data_register(&plan->fft1_handle[z], 0, (uintptr_t) &plan->fft1[z*plan->totsize2], plan->totsize2, sizeof(*plan->fft1));
 
 
-		/* We'll need it on the CPU for the second twist anyway */
+		/* We'll need the result of fft1 on the CPU for the second
+		 * twist anyway, so tell starpu to not keep the fft1 buffer in
+		 * the GPU. */
 		starpu_data_set_wb_mask(plan->fft1_handle[z], 1<<0);
 		starpu_data_set_wb_mask(plan->fft1_handle[z], 1<<0);
 
 
 		/* Create twist1 task */
 		/* Create twist1 task */
 		plan->twist1_tasks[z] = task = starpu_task_create();
 		plan->twist1_tasks[z] = task = starpu_task_create();
 		task->cl = &STARPUFFT(twist1_1d_codelet);
 		task->cl = &STARPUFFT(twist1_1d_codelet);
-		//task->buffers[0].handle = to be filled at execution
+		//task->buffers[0].handle = to be filled at execution to point
+		//to the application input.
 		task->buffers[0].mode = STARPU_R;
 		task->buffers[0].mode = STARPU_R;
 		task->buffers[1].handle = plan->twisted1_handle[z];
 		task->buffers[1].handle = plan->twisted1_handle[z];
 		task->buffers[1].mode = STARPU_W;
 		task->buffers[1].mode = STARPU_W;
@@ -498,14 +570,14 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 		task->detach = 1;
 		task->detach = 1;
 		task->destroy = 0;
 		task->destroy = 0;
 
 
-		/* Tell that to be done with first step we need to have
-		 * finished this fft1 */
+		/* Tell that the join task will depend on the fft1 task. */
 		starpu_tag_declare_deps(STEP_TAG_1D(plan, JOIN, 0),
 		starpu_tag_declare_deps(STEP_TAG_1D(plan, JOIN, 0),
 				1, STEP_TAG(FFT1));
 				1, STEP_TAG(FFT1));
 #undef STEP_TAG
 #undef STEP_TAG
 	}
 	}
 
 
-	/* Create join task */
+	/* Create the join task, only serving as a dependency point between
+	 * fft1 and twist2 tasks */
 	plan->join_task = task = starpu_task_create();
 	plan->join_task = task = starpu_task_create();
 	task->cl = NULL;
 	task->cl = NULL;
 	task->tag_id = STEP_TAG_1D(plan, JOIN, 0);
 	task->tag_id = STEP_TAG_1D(plan, JOIN, 0);
@@ -513,7 +585,8 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 	task->detach = 1;
 	task->detach = 1;
 	task->destroy = 0;
 	task->destroy = 0;
 
 
-	/* Create second-round tasks */
+	/* Create second-round tasks: DIV_1D batches of n2/DIV_1D twist2, fft2,
+	 * and twist3 */
 	for (z = 0; z < plan->totsize3; z++) {
 	for (z = 0; z < plan->totsize3; z++) {
 		int jj = z;
 		int jj = z;
 #define STEP_TAG(step)	STEP_TAG_1D(plan, step, jj)
 #define STEP_TAG(step)	STEP_TAG_1D(plan, step, jj)
@@ -521,15 +594,16 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 		plan->fft2_args[z].plan = plan;
 		plan->fft2_args[z].plan = plan;
 		plan->fft2_args[z].jj = jj;
 		plan->fft2_args[z].jj = jj;
 
 
-		/* Register n3 (n1) chunks */
+		/* Register n3 twisted2 buffers of size n1 */
 		starpu_vector_data_register(&plan->twisted2_handle[z], 0, (uintptr_t) &plan->twisted2[z*plan->totsize4], plan->totsize4, sizeof(*plan->twisted2));
 		starpu_vector_data_register(&plan->twisted2_handle[z], 0, (uintptr_t) &plan->twisted2[z*plan->totsize4], plan->totsize4, sizeof(*plan->twisted2));
 		starpu_vector_data_register(&plan->fft2_handle[z], 0, (uintptr_t) &plan->fft2[z*plan->totsize4], plan->totsize4, sizeof(*plan->fft2));
 		starpu_vector_data_register(&plan->fft2_handle[z], 0, (uintptr_t) &plan->fft2[z*plan->totsize4], plan->totsize4, sizeof(*plan->fft2));
 
 
-		/* We'll need it on the CPU for the last twist anyway */
+		/* We'll need the result of fft2 on the CPU for the third
+		 * twist anyway, so tell starpu to not keep the fft2 buffer in
+		 * the GPU. */
 		starpu_data_set_wb_mask(plan->fft2_handle[z], 1<<0);
 		starpu_data_set_wb_mask(plan->fft2_handle[z], 1<<0);
 
 
-		/* Tell that twisted2 depends on the whole first step to be
-		 * done */
+		/* Tell that twisted2 depends on the join task */
 		starpu_tag_declare_deps(STEP_TAG(TWIST2),
 		starpu_tag_declare_deps(STEP_TAG(TWIST2),
 				1, STEP_TAG_1D(plan, JOIN, 0));
 				1, STEP_TAG_1D(plan, JOIN, 0));
 
 
@@ -566,6 +640,8 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 				1, STEP_TAG(FFT2));
 				1, STEP_TAG(FFT2));
 
 
 		/* Create twist3 tasks */
 		/* Create twist3 tasks */
+		/* These run only on CPUs and thus write directly into the
+		 * application output buffer. */
 		plan->twist3_tasks[z] = task = starpu_task_create();
 		plan->twist3_tasks[z] = task = starpu_task_create();
 		task->cl = &STARPUFFT(twist3_1d_codelet);
 		task->cl = &STARPUFFT(twist3_1d_codelet);
 		task->buffers[0].handle = plan->fft2_handle[z];
 		task->buffers[0].handle = plan->fft2_handle[z];
@@ -576,13 +652,14 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 		task->detach = 1;
 		task->detach = 1;
 		task->destroy = 0;
 		task->destroy = 0;
 
 
-		/* Tell that to be completely finished we need to have finished this twisted3 */
+		/* Tell that to be completely finished we need to have finished
+		 * this twisted3 */
 		starpu_tag_declare_deps(STEP_TAG_1D(plan, END, 0),
 		starpu_tag_declare_deps(STEP_TAG_1D(plan, END, 0),
 				1, STEP_TAG(TWIST3));
 				1, STEP_TAG(TWIST3));
 #undef STEP_TAG
 #undef STEP_TAG
 	}
 	}
 
 
-	/* Create end task */
+	/* Create end task, only serving as a join point. */
 	plan->end_task = task = starpu_task_create();
 	plan->end_task = task = starpu_task_create();
 	task->cl = NULL;
 	task->cl = NULL;
 	task->tag_id = STEP_TAG_1D(plan, END, 0);
 	task->tag_id = STEP_TAG_1D(plan, END, 0);
@@ -593,6 +670,7 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 	return plan;
 	return plan;
 }
 }
 
 
+/* Actually submit all the tasks. */
 static starpu_tag_t
 static starpu_tag_t
 STARPUFFT(start1dC2C)(STARPUFFT(plan) plan)
 STARPUFFT(start1dC2C)(STARPUFFT(plan) plan)
 {
 {
@@ -617,6 +695,7 @@ STARPUFFT(start1dC2C)(STARPUFFT(plan) plan)
 	return STEP_TAG_1D(plan, END, 0);
 	return STEP_TAG_1D(plan, END, 0);
 }
 }
 
 
+/* Free all the tags. The generic code handles freeing the buffers. */
 static void
 static void
 STARPUFFT(free_1d_tags)(STARPUFFT(plan) plan)
 STARPUFFT(free_1d_tags)(STARPUFFT(plan) plan)
 {
 {