| 
					
				 | 
			
			
				@@ -202,20 +202,16 @@ STARPUFFT(fft1_1d_kernel_cpu)(void *descr[], void *_args) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	task_per_worker[workerid]++; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	const STARPUFFT(complex) * restrict twisted1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	STARPUFFT(complex) * restrict twisted1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	STARPUFFT(complex) * restrict fft1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	_fftw_complex * restrict worker_in1 = (STARPUFFT(complex) *)plan->plans[workerid].in1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	_fftw_complex * restrict worker_out1 = (STARPUFFT(complex) *)plan->plans[workerid].out1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	/* printf("fft1 %d %g\n", i, (double) cabs(twisted1[0])); */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	memcpy(worker_in1, twisted1, plan->totsize2 * sizeof(*worker_in1)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	_FFTW(execute)(plan->plans[workerid].plan1_cpu); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	_FFTW(execute_dft)(plan->plans[workerid].plan1_cpu, twisted1, fft1); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	/* twiddle while copying from fftw output buffer to fft1 buffer */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	/* twiddle fft1 buffer */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	for (j = 0; j < n2; j++) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		fft1[j] = worker_out1[j] * plan->roots[0][i*j]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		fft1[j] = fft1[j] * plan->roots[0][i*j]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 #endif 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -260,18 +256,12 @@ STARPUFFT(fft2_1d_kernel_cpu)(void *descr[], void *_args) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	task_per_worker[workerid]++; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	const STARPUFFT(complex) * restrict twisted2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	STARPUFFT(complex) * restrict twisted2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	STARPUFFT(complex) * restrict fft2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	/* printf("fft2 %d %g\n", jj, (double) cabs(twisted2[plan->totsize4-1])); */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	_fftw_complex * restrict worker_in2 = (STARPUFFT(complex) *)plan->plans[workerid].in2; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	_fftw_complex * restrict worker_out2 = (STARPUFFT(complex) *)plan->plans[workerid].out2; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	memcpy(worker_in2, twisted2, plan->totsize4 * sizeof(*worker_in2)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	_FFTW(execute)(plan->plans[workerid].plan2_cpu); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	/* no twiddle */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	memcpy(fft2, worker_out2, plan->totsize4 * sizeof(*worker_out2)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	_FFTW(execute_dft)(plan->plans[workerid].plan2_cpu, twisted2, fft2); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 #endif 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -473,22 +463,14 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 			/* first fft plan: one fft of size n2. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 			 * FFTW imposes that buffer pointers are known at 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 			 * planning time. */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			plan->plans[workerid].in1 = _FFTW(malloc)(plan->totsize2 * sizeof(_fftw_complex)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			memset(plan->plans[workerid].in1, 0, plan->totsize2 * sizeof(_fftw_complex)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			plan->plans[workerid].out1 = _FFTW(malloc)(plan->totsize2 * sizeof(_fftw_complex)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			memset(plan->plans[workerid].out1, 0, plan->totsize2 * sizeof(_fftw_complex)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			plan->plans[workerid].plan1_cpu = _FFTW(plan_dft_1d)(n2, plan->plans[workerid].in1, plan->plans[workerid].out1, sign, _FFTW_FLAGS); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			plan->plans[workerid].plan1_cpu = _FFTW(plan_dft_1d)(n2, NULL, NULL, sign, _FFTW_FLAGS); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 			STARPU_ASSERT(plan->plans[workerid].plan1_cpu); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 			/* second fft plan: n3 ffts of size n1 */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			plan->plans[workerid].in2 = _FFTW(malloc)(plan->totsize4 * sizeof(_fftw_complex)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			memset(plan->plans[workerid].in2, 0, plan->totsize4 * sizeof(_fftw_complex)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			plan->plans[workerid].out2 = _FFTW(malloc)(plan->totsize4 * sizeof(_fftw_complex)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			memset(plan->plans[workerid].out2, 0, plan->totsize4 * sizeof(_fftw_complex)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 			plan->plans[workerid].plan2_cpu = _FFTW(plan_many_dft)(plan->dim, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 					plan->n1, n3, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-					/* input */ plan->plans[workerid].in2, NULL, 1, plan->totsize1, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-					/* output */ plan->plans[workerid].out2, NULL, 1, plan->totsize1, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+					NULL, NULL, 1, plan->totsize1, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+					NULL, NULL, 1, plan->totsize1, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 					sign, _FFTW_FLAGS); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 			STARPU_ASSERT(plan->plans[workerid].plan2_cpu); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 #else 
			 |