#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <starpu.h>
#include <starpu_cuda.h>

/* Abort with a readable message if a CUDA runtime call failed. */
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(1);                                                         \
        }                                                                    \
    } while (0)

/*
 * Per-block tree reduction: each block sums its blockDim.x-wide slice of
 * `window` in place and writes the partial sum to dev_data[blockIdx.x].
 *
 * Preconditions:
 *  - blockDim.x is a power of two (the host picks 32..1024);
 *  - gridDim.x == ceil(n / blockDim.x).
 *
 * NOTE: there is deliberately no early return for out-of-range threads —
 * every thread of the block must reach __syncthreads(), otherwise the
 * barrier is divergent (undefined behavior). Out-of-range lanes simply
 * skip the accumulation, and the tail bound `index + stride < n` keeps
 * the in-place reduction from reading past the end of `window`.
 */
static __global__ void output_thread_aggregation_kernel(uint32_t *window,
                                                        uint32_t *dev_data,
                                                        int n)
{
    unsigned int tid   = threadIdx.x;
    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;

    /* This block's slice of the global input. */
    uint32_t *idata = window + blockIdx.x * blockDim.x;

    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride && index + stride < (unsigned int)n)
            idata[tid] += idata[tid + stride];
        __syncthreads();
    }

    /* Thread 0 of each launched block always has index < n (grid is
     * ceil-divided), so the partial sum is valid. */
    if (tid == 0)
        dev_data[blockIdx.x] = idata[0];
}

/*
 * StarPU CUDA codelet: sum the uint32 vector in buffers[0] on the GPU and
 * store the total into *(uint64_t *)_args.
 *
 * buffers[0] — STARPU_VECTOR of uint32_t (device-resident in a CUDA codelet);
 * _args      — host pointer to a uint64_t receiving the aggregated sum.
 *
 * All device work is issued on the StarPU per-worker stream and synchronized
 * before the partial sums are read back on the host.
 */
extern "C" void output_thread_aggregation(void *buffers[], void *_args)
{
    printf("cuda function\n");

    uint64_t *aggregated_value = (uint64_t *)_args;

    /* Length of the vector and its data pointer. */
    int n = STARPU_VECTOR_GET_NX(buffers[0]);
    uint32_t *window = (uint32_t *)STARPU_VECTOR_GET_PTR(buffers[0]);

    cudaStream_t stream = starpu_cuda_get_local_stream();

    /* Pick a power-of-two block size from the vector size (original
     * heuristic: smallest power of two >= sqrt(n), clamped to [32, 1024]).
     * Compute sqrt once instead of once per comparison. */
    double s = sqrt((double)n);
    int n_threads;
    if      (s <=  32) n_threads =   32;
    else if (s <=  64) n_threads =   64;
    else if (s <= 128) n_threads =  128;
    else if (s <= 256) n_threads =  256;
    else if (s <= 512) n_threads =  512;
    else               n_threads = 1024;
    printf("n_threads = %d\n", n_threads);   /* %d: n_threads is an int */

    /* Kernel launch geometry: ceil-divide so the whole vector is covered. */
    dim3 threads_per_block(n_threads);
    dim3 blocks((n + threads_per_block.x - 1) / threads_per_block.x);

    /* Host-side buffer for the per-block partial sums. */
    uint32_t *h_data = (uint32_t *)malloc(blocks.x * sizeof(uint32_t));
    if (!h_data) {
        printf("Allocation error for h_data - aborting.\n");
        exit(1);
    }

    /* Device scratch: a private copy of the input (the kernel reduces in
     * place and would otherwise clobber the StarPU-managed buffer) and the
     * per-block result array. */
    uint32_t *dev_window = NULL;
    uint32_t *dev_data = NULL;
    CUDA_CHECK(cudaMalloc((void **)&dev_window, n * sizeof(uint32_t)));
    CUDA_CHECK(cudaMalloc((void **)&dev_data, blocks.x * sizeof(uint32_t)));

    /* In a CUDA codelet STARPU_VECTOR_GET_PTR is a device pointer, so this
     * is a device-to-device copy; cudaMemcpyDefault lets the runtime infer
     * the direction from the pointers. Issue it on the StarPU stream. */
    CUDA_CHECK(cudaMemcpyAsync(dev_window, window, n * sizeof(uint32_t),
                               cudaMemcpyDefault, stream));

    /* Launch the per-block reduction on the StarPU per-worker stream. */
    output_thread_aggregation_kernel<<<blocks, threads_per_block, 0, stream>>>(
        dev_window, dev_data, n);
    CUDA_CHECK(cudaGetLastError());

    /* Copy the partial sums back, then wait for the copy to land BEFORE
     * reading h_data (syncing before the copy would race). */
    CUDA_CHECK(cudaMemcpyAsync(h_data, dev_data, blocks.x * sizeof(uint32_t),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    /* Final accumulation of the per-block sums on the CPU, in 64 bits to
     * avoid overflowing uint32_t on large vectors. */
    uint64_t gpu_sum = 0;
    for (unsigned int i = 0; i < blocks.x; i++)
        gpu_sum += h_data[i];

    *aggregated_value = gpu_sum;

    free(h_data);
    CUDA_CHECK(cudaFree(dev_window));
    CUDA_CHECK(cudaFree(dev_data));
}