#include <stdio.h>
#include<stdlib.h>
#include <sys/time.h>
// platform independent data types:
#include <stdint.h>

// Number of iterations
#define NUM_ITERATIONS          10
// The window height is equal to the number of streams
#define WINDOW_HEIGHT           4
#define NUM_INPUT_STREAMS       WINDOW_HEIGHT
// The window width is equal to the number of tuples required to fill in a window
#define WINDOW_WIDTH            100

// The total number of elements is equal to the window_height times window_width.
// The expansion is fully parenthesized so the product stays intact when the
// macro is used inside a larger expression (e.g. `x / elements`, `x % elements`).
#define elements                ((WINDOW_HEIGHT)*(WINDOW_WIDTH))

// Wall-clock timestamps used to measure elapsed time:
struct timeval start;
struct timeval end;


// output streams (aggregator):
//
// Block-wise sum reduction over the first `n` entries of `window`.
//
// Launch layout: 1D grid of 1D blocks. Block b reduces the slice
// window[b*blockDim.x .. min((b+1)*blockDim.x, n) - 1] and thread 0 stores the
// slice total in dev_data[b]. Any block size works (it does not have to be a
// power of two) and the last block may be only partially filled.
//
// Parameters:
//   window   - input values; reduced IN PLACE, so its contents are clobbered.
//   dev_data - one partial sum per block; must hold at least as many entries
//              as there are blocks that cover part of [0, n).
//   n        - number of valid entries in `window`.
//
// Blocks whose slice starts at or beyond `n` exit without writing anything.
// Sums are accumulated in uint32_t and wrap modulo 2^32.
__global__ void output_thread_aggregation(uint32_t *window, uint32_t *dev_data, int n) {

	const unsigned int tid = threadIdx.x;

	// Slice offset computed in size_t so blockIdx.x*blockDim.x cannot wrap.
	const size_t block_start = (size_t)blockIdx.x * blockDim.x;

	// This test is identical for every thread of the block, so either the whole
	// block leaves here or the whole block reaches every __syncthreads() below.
	if (n <= 0 || block_start >= (size_t)n) return;

	uint32_t *idata = window + block_start;

	// Number of valid elements in this block's slice (shorter for the tail block).
	const size_t remaining = (size_t)n - block_start;
	unsigned int active = (remaining < blockDim.x) ? (unsigned int)remaining : blockDim.x;

	// Fold the upper part of the live range onto the lower part until a single
	// value is left. Using ceil(active/2) as the fold distance keeps the middle
	// element when `active` is odd, so no element is dropped for any size.
	// Each step reads [half, active) and writes [0, active-half), which never
	// overlap, and `active` is block-uniform so the barrier is non-divergent.
	while (active > 1) {
		const unsigned int half = (active + 1) / 2;
		if (tid < active - half) {
			idata[tid] += idata[tid + half];
		}
		__syncthreads();
		active = half;
	}

	if (tid == 0) {
		dev_data[blockIdx.x] = idata[0];
	}

}

// Abort with a readable message when a CUDA runtime call reports failure.
static void check_cuda(cudaError_t status, const char *what) {
	if (status != cudaSuccess) {
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(1);
	}
}

// Benchmark driver: for NUM_ITERATIONS rounds, fill a window with random
// values, sum it on the host (printed as "TEST"), reduce it on the GPU into
// per-block partial sums, add those partials on the host, and print the
// result. The total wall-clock time in microseconds is printed at the end.
// Returns 0 on success; exits with status 1 on any allocation or CUDA error.
int main(void) {

	// Taken once into a variable so later arithmetic does not depend on how
	// the `elements` macro happens to expand inside a larger expression.
	const int n = elements;

	//define number of blocks and number of threads per block (kernel parameters)
	// NOTE: one thread per block means every block just forwards one element;
	// the actual summation happens in the host loop below.
	dim3 threads_per_block (1);
	dim3 blocks ((n+threads_per_block.x-1)/threads_per_block.x);

	const size_t window_bytes  = (size_t)n * sizeof(uint32_t);
	const size_t partial_bytes = (size_t)blocks.x * sizeof(uint32_t);

	gettimeofday(&start, NULL);

	// Host and device buffers are allocated once and reused by every
	// iteration; each iteration overwrites them completely.
	uint32_t *window = (uint32_t*)calloc((size_t)n, sizeof(uint32_t));

	// check if there's enough space for the allocation
	if(!window){
		printf("Allocation error for window - aborting.\n");
		exit(1);
	}

	// host copy of the per-block partial sums
	uint32_t *h_data = (uint32_t *)malloc(partial_bytes);

	if(!h_data){
		printf("Allocation error for h_data - aborting.\n");
		exit(1);
	}

	//GPU memory pointers
	uint32_t *dev_window = NULL;
	uint32_t *dev_data = NULL;

	//allocate the memory on the GPU
	check_cuda(cudaMalloc((void**)&dev_window, window_bytes), "cudaMalloc(dev_window)");
	check_cuda(cudaMalloc((void**)&dev_data, partial_bytes), "cudaMalloc(dev_data)");

	// create input streams:
	for (uint32_t iterations_id=0; iterations_id<NUM_ITERATIONS; iterations_id++) {

		uint64_t ag_val = 0;

		// initialization - fill in the window with random numbers:
		for (int i = 0; i < n; i++) {
			window[i] = (rand()%1000);
			ag_val += window[i];
		}
		// host-side reference sum; cast so the argument matches %llu exactly
		printf("TEST %llu\n", (unsigned long long)ag_val);

		//copy the array 'window' to the GPU
		check_cuda(cudaMemcpy(dev_window, window, window_bytes, cudaMemcpyHostToDevice),
		           "cudaMemcpy(host -> device)");

		//launch kernel
		output_thread_aggregation<<<blocks,threads_per_block>>>(dev_window, dev_data, n);
		// launch-configuration errors only show up through cudaGetLastError()
		check_cuda(cudaGetLastError(), "kernel launch");
		// faults raised while the kernel runs are reported by the next sync
		check_cuda(cudaDeviceSynchronize(), "kernel execution");

		//copy back the result to the CPU
		check_cuda(cudaMemcpy(h_data, dev_data, partial_bytes, cudaMemcpyDeviceToHost),
		           "cudaMemcpy(device -> host)");

		// add up the per-block partial sums
		uint64_t aggregated_value = 0;

		for(unsigned int i=0; i<blocks.x; i++){
			aggregated_value += h_data[i];
		}

		printf("iter: %u - aggregated value: %llu\n",
		       (unsigned int)iterations_id, (unsigned long long)aggregated_value);

	}

	//free the memory allocated on the GPU
	check_cuda(cudaFree(dev_window), "cudaFree(dev_window)");
	check_cuda(cudaFree(dev_data), "cudaFree(dev_data)");

	// release the host buffers as well
	free(h_data);
	free(window);

	gettimeofday(&end, NULL);

	// Subtract the seconds first so the multiplication cannot overflow a
	// 32-bit long the way multiplying an absolute timestamp would.
	const long elapsed_usec = (long)(end.tv_sec - start.tv_sec) * 1000000L
	                        + (long)(end.tv_usec - start.tv_usec);
	printf("usec: %ld\n", elapsed_usec);

	return 0;

}