浏览代码

Rework the interface: return a task instead of a tag

Samuel Thibault 13 年之前
父节点
当前提交
45e2dc0ac8

+ 5 - 2
examples/starpufft/starpufft.h

@@ -40,7 +40,10 @@ void *starpufft(malloc)(size_t n); \
 void starpufft(free)(void *p); \
 \
 void starpufft(execute)(starpufft(plan) p, void *in, void *out); \
-starpu_tag_t starpufft(start)(starpufft(plan) p, void *in, void *out); \
+struct starpu_task *starpufft(start)(starpufft(plan) p, void *in, void *out); \
+\
+void starpufft(execute_handle)(starpufft(plan) p, starpu_data_handle_t in, starpu_data_handle_t out); \
+struct starpu_task *starpufft(start_handle)(starpufft(plan) p, starpu_data_handle_t in, starpu_data_handle_t out); \
 \
 void starpufft(destroy_plan)(starpufft(plan) p); \
 \
@@ -52,5 +55,5 @@ __STARPUFFT_INTERFACE(__STARPUFFT, double)
 __STARPUFFT_INTERFACE(__STARPUFFTF, float)
 __STARPUFFT_INTERFACE(__STARPUFFTL, long double)
 
+/* Internal use */
 extern int starpufft_last_plan_number;
-extern int starpufft_last_tag;

+ 0 - 3
examples/starpufft/starpufft_common.c

@@ -19,6 +19,3 @@
 
 /* Used as an identifier in starpu tags to let plans run concurrently */
 int starpufft_last_plan_number;
-
-/* Used as an identifier in starpu tags to let sequential executions run concurrently */
-int starpufft_last_tag;

+ 30 - 12
examples/starpufft/starpufftx.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -33,6 +33,7 @@
 
 #define _FFTW_FLAGS FFTW_ESTIMATE
 
+/* Steps for the parallel variant */
 enum steps {
 	SPECIAL, TWIST1, FFT1, JOIN, TWIST2, FFT2, TWIST3, END
 };
@@ -42,6 +43,7 @@ enum steps {
 #define STEP_BITS 3
 #define STEP_SHIFT (NUMBER_SHIFT - STEP_BITS)
 
+/* Tags for the steps of the parallel variant */
 #define _STEP_TAG(plan, step, i) (((starpu_tag_t) plan->number << NUMBER_SHIFT) | ((starpu_tag_t)(step) << STEP_SHIFT) | (starpu_tag_t) (i))
 
 
@@ -106,8 +108,6 @@ struct STARPUFFT(plan) {
 	/* Tasks */
 	struct starpu_task **twist1_tasks, **fft1_tasks, **twist2_tasks, **fft2_tasks, **twist3_tasks;
 	struct starpu_task *join_task, *end_task;
-	/* Sequential version */
-	struct starpu_task *fft_task;
 
 	/* Arguments for tasks */
 	struct STARPUFFT(args) *fft1_args, *fft2_args;
@@ -161,10 +161,10 @@ compute_roots(STARPUFFT(plan) plan)
 #include "starpufftx1d.c"
 #include "starpufftx2d.c"
 
-starpu_tag_t
+struct starpu_task *
 STARPUFFT(start)(STARPUFFT(plan) plan, void *_in, void *_out)
 {
-	starpu_tag_t tag;
+	struct starpu_task *task;
 	int z;
 
 	plan->in = _in;
@@ -181,7 +181,7 @@ if (PARALLEL) {
 				for (z = 0; z < plan->totsize1; z++)
 					plan->twist1_tasks[z]->buffers[0].handle = plan->in_handle;
 }
-				tag = STARPUFFT(start1dC2C)(plan);
+				task = STARPUFFT(start1dC2C)(plan, plan->in_handle, plan->out_handle);
 				break;
 			default:
 				STARPU_ABORT();
@@ -197,19 +197,30 @@ if (PARALLEL) {
 			for (z = 0; z < plan->totsize1; z++)
 				plan->twist1_tasks[z]->buffers[0].handle = plan->in_handle;
 }
-			tag = STARPUFFT(start2dC2C)(plan);
+			task = STARPUFFT(start2dC2C)(plan, plan->in_handle, plan->out_handle);
 			break;
 		default:
 			STARPU_ABORT();
 			break;
 	}
-	return tag;
+	return task;
 }
 
 void
 STARPUFFT(cleanup)(STARPUFFT(plan) plan)
 {
-	starpu_data_unregister(plan->in_handle);
+	if (plan->in_handle)
+		starpu_data_unregister(plan->in_handle);
+if (!PARALLEL) {
+	if (plan->out_handle)
+		starpu_data_unregister(plan->out_handle);
+}
+}
+
+/* Submit the transform described by plan on data handles supplied by the
+ * caller and return the task produced by the dimension-specific starter.
+ * The starter is picked from plan->dim so that 2D plans are not run through
+ * the 1D code path; both starters assert that the plan type is C2C.
+ * NOTE(review): assumes plan->dim is 1 for 1D plans and 2 for 2D plans --
+ * confirm against the plan constructors. */
+struct starpu_task *
+STARPUFFT(start_handle)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
+{
+	switch (plan->dim) {
+		case 1:
+			return STARPUFFT(start1dC2C)(plan, in, out);
+		case 2:
+			return STARPUFFT(start2dC2C)(plan, in, out);
+		default:
+			/* Unsupported dimension. */
+			STARPU_ABORT();
+			return NULL;
+	}
 }
 
 void
@@ -220,15 +231,22 @@ STARPUFFT(execute)(STARPUFFT(plan) plan, void *in, void *out)
 
 	gettimeofday(&start, NULL);
 
-	starpu_tag_t tag = STARPUFFT(start)(plan, in, out);
+	struct starpu_task *task = STARPUFFT(start)(plan, in, out);
 	gettimeofday(&submit_tasks, NULL);
-	starpu_tag_wait(tag);
+	starpu_task_wait(task);
 
 	STARPUFFT(cleanup)(plan);
 
 	gettimeofday(&end, NULL);
 }
 
+/* Blocking counterpart of start_handle: submit the transform on the data
+ * handles provided by the caller, then wait on the task that start_handle
+ * returned.  Unlike execute(), this function does not call cleanup and does
+ * not unregister the handles. */
+void
+STARPUFFT(execute_handle)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
+{
+	struct starpu_task *task = STARPUFFT(start_handle)(plan, in, out);
+	/* NOTE(review): the status returned by starpu_task_wait is ignored --
+	 * confirm the task is actually waitable (not detached). */
+	starpu_task_wait(task);
+}
+
 /* Destroy FFTW plans, unregister and free buffers, and free tags */
 void
 STARPUFFT(destroy_plan)(STARPUFFT(plan) plan)
@@ -253,7 +271,7 @@ if (PARALLEL) {
 #endif
 			break;
 		default:
-			STARPU_ABORT();
+			/* Do not care, we won't be executing anything there. */
 			break;
 		}
 	}

+ 11 - 11
examples/starpufft/starpufftx1d.c

@@ -579,7 +579,7 @@ if (PARALLEL) {
 		case STARPU_CUDA_WORKER:
 			break;
 		default:
-			STARPU_ABORT();
+			/* Do not care, we won't be executing anything there. */
 			break;
 		}
 	}
@@ -625,6 +625,8 @@ if (PARALLEL) {
 		int i = z;
 #define STEP_TAG(step)	STEP_TAG_1D(plan, step, i)
 
+		/* TODO: get rid of tags */
+
 		plan->fft1_args[z].plan = plan;
 		plan->fft1_args[z].i = i;
 
@@ -767,8 +769,8 @@ if (PARALLEL) {
 }
 
 /* Actually submit all the tasks. */
-static starpu_tag_t
-STARPUFFT(start1dC2C)(STARPUFFT(plan) plan)
+static struct starpu_task *
+STARPUFFT(start1dC2C)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
 {
 	STARPU_ASSERT(plan->type == C2C);
 	int z;
@@ -789,23 +791,21 @@ if (PARALLEL) {
 
 	starpu_task_submit(plan->end_task);
 
-	return STEP_TAG_1D(plan, END, 0);
+	return plan->end_task;
 } else /* !PARALLEL */ {
 	struct starpu_task *task;
 
 	/* Create FFT task */
-	plan->fft_task = task = starpu_task_create();
+	task = starpu_task_create();
 	task->cl = &STARPUFFT(fft_1d_codelet);
-	task->buffers[0].handle = plan->in_handle;
+	task->buffers[0].handle = in;
 	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = plan->out_handle;
+	task->buffers[1].handle = out;
 	task->buffers[1].mode = STARPU_W;
 	task->cl_arg = plan;
-	task->tag_id = STARPU_ATOMIC_ADD(&starpufft_last_tag, 1);
-	task->use_tag = 1;
 
-	starpu_task_submit(plan->fft_task);
-	return task->tag_id;
+	starpu_task_submit(task);
+	return task;
 }
 }
 

+ 22 - 20
examples/starpufft/starpufftx2d.c

@@ -387,7 +387,7 @@ STARPUFFT(fft_2d_plan_gpu)(void *args)
 	int m = plan->n[1];
 	int workerid = starpu_worker_get_id();
 
-	cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n, m, _CUFFT_C2C);
+	cures = cufftPlan2d(&plan->plans[workerid].plan_cuda, n, m, _CUFFT_C2C);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 	cufftSetStream(plan->plans[workerid].plan_cuda, starpu_cuda_get_local_stream());
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
@@ -534,7 +534,11 @@ if (PARALLEL) {
 	plan->n2 = malloc(plan->dim * sizeof(*plan->n2));
 	plan->n2[0] = n2;
 	plan->n2[1] = m2;
+}
+
 	plan->totsize = n * m;
+
+if (PARALLEL) {
 	plan->totsize1 = n1 * m1;
 	plan->totsize2 = n2 * m2;
 	plan->totsize3 = DIV_2D_N * DIV_2D_M;
@@ -577,7 +581,7 @@ if (PARALLEL) {
 		case STARPU_CUDA_WORKER:
 			break;
 		default:
-			STARPU_ABORT();
+			/* Do not care, we won't be executing anything there. */
 			break;
 		}
 	}
@@ -591,6 +595,7 @@ if (PARALLEL) {
 #endif
 
 if (PARALLEL) {
+	/* Allocate buffers. */
 	plan->twisted1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted1));
 	memset(plan->twisted1, 0, plan->totsize * sizeof(*plan->twisted1));
 	plan->fft1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft1));
@@ -600,17 +605,20 @@ if (PARALLEL) {
 	plan->fft2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft2));
 	memset(plan->fft2, 0, plan->totsize * sizeof(*plan->fft2));
 
+	/* Allocate handle arrays */
 	plan->twisted1_handle = malloc(plan->totsize1 * sizeof(*plan->twisted1_handle));
 	plan->fft1_handle = malloc(plan->totsize1 * sizeof(*plan->fft1_handle));
 	plan->twisted2_handle = malloc(plan->totsize3 * sizeof(*plan->twisted2_handle));
 	plan->fft2_handle = malloc(plan->totsize3 * sizeof(*plan->fft2_handle));
 
+	/* Allocate task arrays */
 	plan->twist1_tasks = malloc(plan->totsize1 * sizeof(*plan->twist1_tasks));
 	plan->fft1_tasks = malloc(plan->totsize1 * sizeof(*plan->fft1_tasks));
 	plan->twist2_tasks = malloc(plan->totsize3 * sizeof(*plan->twist2_tasks));
 	plan->fft2_tasks = malloc(plan->totsize3 * sizeof(*plan->fft2_tasks));
 	plan->twist3_tasks = malloc(plan->totsize3 * sizeof(*plan->twist3_tasks));
 
+	/* Allocate codelet argument arrays */
 	plan->fft1_args = malloc(plan->totsize1 * sizeof(*plan->fft1_args));
 	plan->fft2_args = malloc(plan->totsize3 * sizeof(*plan->fft2_args));
 
@@ -619,6 +627,8 @@ if (PARALLEL) {
 		int i = z / m1, j = z % m1;
 #define STEP_TAG(step)	STEP_TAG_2D(plan, step, i, j)
 
+		/* TODO: get rid of tags */
+
 		plan->fft1_args[z].plan = plan;
 		plan->fft1_args[z].i = i;
 		plan->fft1_args[z].j = j;
@@ -640,7 +650,6 @@ if (PARALLEL) {
 		task->cl_arg = &plan->fft1_args[z];
 		task->tag_id = STEP_TAG(TWIST1);
 		task->use_tag = 1;
-		task->detach = 1;
 		task->destroy = 0;
 
 		/* Tell that fft1 depends on twisted1 */
@@ -661,7 +670,6 @@ if (PARALLEL) {
 		task->cl_arg = &plan->fft1_args[z];
 		task->tag_id = STEP_TAG(FFT1);
 		task->use_tag = 1;
-		task->detach = 1;
 		task->destroy = 0;
 
 		/* Tell that to be done with first step we need to have
@@ -676,7 +684,6 @@ if (PARALLEL) {
 	task->cl = NULL;
 	task->tag_id = STEP_TAG_2D(plan, JOIN, 0, 0);
 	task->use_tag = 1;
-	task->detach = 1;
 	task->destroy = 0;
 
 	/* Create second-round tasks */
@@ -708,7 +715,6 @@ if (PARALLEL) {
 		task->cl_arg = &plan->fft2_args[z];
 		task->tag_id = STEP_TAG(TWIST2);
 		task->use_tag = 1;
-		task->detach = 1;
 		task->destroy = 0;
 
 		/* Tell that fft2 depends on twisted2 */
@@ -725,7 +731,6 @@ if (PARALLEL) {
 		task->cl_arg = &plan->fft2_args[z];
 		task->tag_id = STEP_TAG(FFT2);
 		task->use_tag = 1;
-		task->detach = 1;
 		task->destroy = 0;
 
 		/* Tell that twist3 depends on fft2 */
@@ -733,6 +738,8 @@ if (PARALLEL) {
 				1, STEP_TAG(FFT2));
 
 		/* Create twist3 tasks */
+		/* These run only on CPUs and thus write directly into the
+		 * application output buffer. */
 		plan->twist3_tasks[z] = task = starpu_task_create();
 		task->cl = &STARPUFFT(twist3_2d_codelet);
 		task->buffers[0].handle = plan->fft2_handle[z];
@@ -740,7 +747,6 @@ if (PARALLEL) {
 		task->cl_arg = &plan->fft2_args[z];
 		task->tag_id = STEP_TAG(TWIST3);
 		task->use_tag = 1;
-		task->detach = 1;
 		task->destroy = 0;
 
 		/* Tell that to be completely finished we need to have finished this twisted3 */
@@ -754,7 +760,6 @@ if (PARALLEL) {
 	task->cl = NULL;
 	task->tag_id = STEP_TAG_2D(plan, END, 0, 0);
 	task->use_tag = 1;
-	task->detach = 1;
 	task->destroy = 0;
 
 }
@@ -763,8 +768,8 @@ if (PARALLEL) {
 }
 
 /* Actually submit all the tasks. */
-static starpu_tag_t
-STARPUFFT(start2dC2C)(STARPUFFT(plan) plan)
+static struct starpu_task *
+STARPUFFT(start2dC2C)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
 {
 	STARPU_ASSERT(plan->type == C2C);
 	int z;
@@ -785,24 +790,21 @@ if (PARALLEL) {
 
 	starpu_task_submit(plan->end_task);
 
-	return STEP_TAG_2D(plan, END, 0, 0);
+	return plan->end_task;
 } else /* !PARALLEL */ {
 	struct starpu_task *task;
 
-	/* FIXME: rather return the task? */
 	/* Create FFT task */
-	plan->fft_task = task = starpu_task_create();
+	task = starpu_task_create();
 	task->cl = &STARPUFFT(fft_2d_codelet);
-	task->buffers[0].handle = plan->in_handle;
+	task->buffers[0].handle = in;
 	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = plan->out_handle;
+	task->buffers[1].handle = out;
 	task->buffers[1].mode = STARPU_W;
 	task->cl_arg = plan;
-	task->tag_id = STARPU_ATOMIC_ADD(&starpufft_last_tag, 1);
-	task->use_tag = 1;
 
-	starpu_task_submit(plan->fft_task);
-	return task->tag_id;
+	starpu_task_submit(task);
+	return task;
 }
 }
 

+ 90 - 60
examples/starpufft/testx.c

@@ -39,6 +39,66 @@
 #define SIGN (-1)
 /* #define SIGN (1) */
 
+#ifdef STARPU_HAVE_FFTW
+/* Compare the StarPUFFT result against the FFTW reference result.
+ *
+ * out:      size elements computed by StarPUFFT.
+ * out_fftw: size elements computed by FFTW, used as the reference.
+ *
+ * Prints to stderr the maximum and average absolute differences, the
+ * 2-norm of the difference, and the maximum/average differences relative
+ * to the 2-norm of the reference.  Exits the process with EXIT_FAILURE when
+ * a relative difference exceeds 1e-8 (TYPE "f") or 1e-16 (TYPE ""). */
+static void check_fftw(STARPUFFT(complex) *out, STARPUFFT(complex) *out_fftw, int size)
+{
+	double max_err = 0., sum_err = 0., ref_sq = 0., err_sq = 0.;
+	int k;
+
+	for (k = 0; k < size; k++) {
+		double err = cabs(out[k]-out_fftw[k]);
+		double mag = cabs(out_fftw[k]);
+
+		if (err > max_err)
+			max_err = err;
+		sum_err += err;
+		err_sq += err * err;
+		ref_sq += mag * mag;
+	}
+
+	/* Shared by both relative figures below. */
+	double ref_norm = sqrt(ref_sq);
+	double avg_err = sum_err / size;
+	double relmaxdiff = max_err / ref_norm;
+	double relavgdiff = avg_err / ref_norm;
+
+	fprintf(stderr, "\nmaximum difference %g\n", max_err);
+	fprintf(stderr, "average difference %g\n", avg_err);
+	fprintf(stderr, "difference norm %g\n", sqrt(err_sq));
+	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
+	fprintf(stderr, "relative average difference %g\n", relavgdiff);
+
+	/* The tolerance depends on the precision the test was built for. */
+	int single_prec = !strcmp(TYPE, "f");
+	int double_prec = !strcmp(TYPE, "");
+	if ((single_prec && (relmaxdiff > 1e-8 || relavgdiff > 1e-8)) ||
+	    (double_prec && (relmaxdiff > 1e-16 || relavgdiff > 1e-16)))
+		exit(EXIT_FAILURE);
+}
+#endif
+
+#ifdef STARPU_USE_CUDA
+/* Compare the StarPUFFT result against the result computed with CUFFT.
+ *
+ * out:      size elements computed by StarPUFFT.
+ * out_cuda: size elements computed by CUFFT, used as the reference.
+ *
+ * The parameters are named after what the callers pass
+ * (check_cuda(out, out_cuda, size)); the body only reads its own
+ * parameters and does not rely on any variable local to main().
+ *
+ * Prints to stderr the maximum and average absolute differences, the
+ * 2-norm of the difference, and the maximum/average differences relative
+ * to the 2-norm of the reference.  Exits the process with EXIT_FAILURE when
+ * a relative difference exceeds 1e-8 (TYPE "f") or 1e-16 (TYPE ""). */
+static void check_cuda(STARPUFFT(complex) *out, STARPUFFT(complex) *out_cuda, int size)
+{
+	int i;
+	double max = 0., tot = 0., norm = 0., normdiff = 0.;
+	for (i = 0; i < size; i++) {
+		double diff = cabs(out[i]-out_cuda[i]);
+		double diff2 = diff * diff;
+		/* Magnitude of the reference sample; named so that it does not
+		 * shadow the size parameter. */
+		double mag = cabs(out_cuda[i]);
+		double mag2 = mag * mag;
+		if (diff > max)
+			max = diff;
+		tot += diff;
+		normdiff += diff2;
+		norm += mag2;
+	}
+	fprintf(stderr, "\nmaximum difference %g\n", max);
+	fprintf(stderr, "average difference %g\n", tot / size);
+	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
+	double relmaxdiff = max / sqrt(norm);
+	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
+	double relavgdiff = (tot / size) / sqrt(norm);
+	fprintf(stderr, "relative average difference %g\n", relavgdiff);
+	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
+		exit(EXIT_FAILURE);
+	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
+		exit(EXIT_FAILURE);
+}
+#endif
+
 int main(int argc, char *argv[]) {
 	int i;
 	struct timeval begin, end;
@@ -46,6 +106,7 @@ int main(int argc, char *argv[]) {
 	size_t bytes;
 	int n = 0, m = 0;
 	STARPUFFT(plan) plan;
+	starpu_data_handle_t in_handle, out_handle;
 #ifdef STARPU_HAVE_FFTW
 	_FFTW(plan) fftw_plan;
 #endif
@@ -91,13 +152,13 @@ int main(int argc, char *argv[]) {
 #endif
 
 #ifdef STARPU_USE_CUDA
-	STARPUFFT(complex) *out_cuda = malloc(size * sizeof(*out_cuda));
+	STARPUFFT(complex) *out_cuda = STARPUFFT(malloc)(size * sizeof(*out_cuda));
 #endif
 
 	if (argc == 2) {
 		plan = STARPUFFT(plan_dft_1d)(n, SIGN, 0);
 #ifdef STARPU_HAVE_FFTW
-		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);
+		fftw_plan = _FFTW(plan_dft_1d)(n, NULL, NULL, SIGN, FFTW_ESTIMATE);
 #endif
 #ifdef STARPU_USE_CUDA
 		if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS)
@@ -107,7 +168,7 @@ int main(int argc, char *argv[]) {
 	} else if (argc == 3) {
 		plan = STARPUFFT(plan_dft_2d)(n, m, SIGN, 0);
 #ifdef STARPU_HAVE_FFTW
-		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
+		fftw_plan = _FFTW(plan_dft_2d)(n, m, NULL, NULL, SIGN, FFTW_ESTIMATE);
 #endif
 #ifdef STARPU_USE_CUDA
 		STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS);
@@ -118,7 +179,7 @@ int main(int argc, char *argv[]) {
 
 #ifdef STARPU_HAVE_FFTW
 	gettimeofday(&begin, NULL);
-	_FFTW(execute)(fftw_plan);
+	_FFTW(execute_dft)(fftw_plan, in, out_fftw);
 	gettimeofday(&end, NULL);
 	_FFTW(destroy_plan)(fftw_plan);
 	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
@@ -137,6 +198,31 @@ int main(int argc, char *argv[]) {
 #endif
 
 	STARPUFFT(execute)(plan, in, out);
+	STARPUFFT(showstats)(stdout);
+
+#ifdef STARPU_HAVE_FFTW
+	check_fftw(out, out_fftw, size);
+#endif
+#ifdef STARPU_USE_CUDA
+	check_cuda(out, out_cuda, size);
+#endif
+
+#if 1
+	starpu_vector_data_register(&in_handle, 0, (uintptr_t) in, size, sizeof(*in));
+	starpu_vector_data_register(&out_handle, 0, (uintptr_t) out, size, sizeof(*out));
+
+	STARPUFFT(execute_handle)(plan, in_handle, out_handle);
+
+	starpu_data_unregister(in_handle);
+	starpu_data_unregister(out_handle);
+
+#ifdef STARPU_HAVE_FFTW
+	check_fftw(out, out_fftw, size);
+#endif
+#ifdef STARPU_USE_CUDA
+	check_cuda(out, out_cuda, size);
+#endif
+#endif
 
 	STARPUFFT(showstats)(stdout);
 	STARPUFFT(destroy_plan)(plan);
@@ -156,62 +242,6 @@ int main(int argc, char *argv[]) {
 #endif
 #endif
 
-#ifdef STARPU_HAVE_FFTW
-{
-	double max = 0., tot = 0., norm = 0., normdiff = 0.;
-	for (i = 0; i < size; i++) {
-		double diff = cabs(out[i]-out_fftw[i]);
-		double diff2 = diff * diff;
-		double size = cabs(out_fftw[i]);
-		double size2 = size * size;
-		if (diff > max)
-			max = diff;
-		tot += diff;
-		normdiff += diff2;
-		norm += size2;
-	}
-	fprintf(stderr, "\nmaximum difference %g\n", max);
-	fprintf(stderr, "average difference %g\n", tot / size);
-	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
-	double relmaxdiff = max / sqrt(norm);
-	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
-	double relavgdiff = (tot / size) / sqrt(norm);
-	fprintf(stderr, "relative average difference %g\n", relavgdiff);
-	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
-		return EXIT_FAILURE;
-	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
-		return EXIT_FAILURE;
-}
-#endif
-
-#ifdef STARPU_USE_CUDA
-{
-	double max = 0., tot = 0., norm = 0., normdiff = 0.;
-	for (i = 0; i < size; i++) {
-		double diff = cabs(out_cuda[i]-out_fftw[i]);
-		double diff2 = diff * diff;
-		double size = cabs(out_fftw[i]);
-		double size2 = size * size;
-		if (diff > max)
-			max = diff;
-		tot += diff;
-		normdiff += diff2;
-		norm += size2;
-	}
-	fprintf(stderr, "\nmaximum difference %g\n", max);
-	fprintf(stderr, "average difference %g\n", tot / size);
-	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
-	double relmaxdiff = max / sqrt(norm);
-	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
-	double relavgdiff = (tot / size) / sqrt(norm);
-	fprintf(stderr, "relative average difference %g\n", relavgdiff);
-	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
-		return EXIT_FAILURE;
-	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
-		return EXIT_FAILURE;
-}
-#endif
-
 	STARPUFFT(free)(in);
 	STARPUFFT(free)(out);