// NOTE(review): the header name after each #include below is missing from
// this copy of the file; restore them from the original source.
#include
#include
#include
// platform independent data types:
#include

// Number of iterations
#define NUM_ITERATIONS 10

// The window height is equal to the number of streams
#define NUM_INPUT_STREAMS WINDOW_HEIGHT
#define WINDOW_HEIGHT 4

// The window width is equal to number of tuples required to fill in a window
#define WINDOW_WIDTH 100

// The total number of elements is equal to the window_height times window_width
#define elements WINDOW_HEIGHT*WINDOW_WIDTH

// measure time:
struct timeval start, end;

// output streams (aggregator):
//
// Per-block sum of `window`: block b adds up the elements
// window[b*blockDim.x .. min((b+1)*blockDim.x, n) - 1] and thread 0 stores the
// total in dev_data[b].
//
// Launch layout: 1D grid of 1D blocks. Any block size is accepted (it does not
// have to be a power of two) and the last block may be only partly filled.
// A block that lies entirely past element n-1 writes nothing to dev_data.
//
// Parameters:
//   window   - input values; the reduction is done in place, so the contents
//              of window are overwritten with partial sums.
//   dev_data - output, one uint32_t per block (indexed by blockIdx.x).
//   n        - number of valid elements in window.
//
// Sums are accumulated in uint32_t and therefore wrap modulo 2^32.
__global__ void output_thread_aggregation(uint32_t *window, uint32_t *dev_data, int n)
{
    const unsigned int tid = threadIdx.x;

    // 64-bit arithmetic so the block offset cannot wrap for large grids.
    const size_t block_start = (size_t)blockIdx.x * blockDim.x;

    // This test depends only on blockIdx and n, so either every thread of the
    // block leaves here or none does; no thread is left waiting at a barrier.
    if (n <= 0 || block_start >= (size_t)n) {
        return;
    }

    // Number of valid elements owned by this block (identical for all of its
    // threads). Smaller than blockDim.x only in a partly filled last block.
    const size_t remaining = (size_t)n - block_start;
    unsigned int active = remaining < blockDim.x ? (unsigned int)remaining
                                                 : blockDim.x;

    uint32_t *idata = window + block_start;

    // Fold the upper part of the active range onto the lower part until one
    // element is left. Rounding the half up keeps the middle element of an
    // odd-sized range in place so it is picked up by a later pass.
    while (active > 1) {
        const unsigned int half = (active + 1) / 2;

        // Writers touch indices below `half`, readers touch indices at or
        // above `half`, so a single pass has no read/write overlap.
        if (tid + half < active) {
            idata[tid] += idata[tid + half];
        }

        // Reached by every thread of the block: the loop bounds are derived
        // from block-uniform values only.
        __syncthreads();

        active = half;
    }

    if (tid == 0) {
        dev_data[blockIdx.x] = idata[0];
    }
}

// NOTE(review): main() below is reproduced exactly as found. Part of its text is missing (inside the first for loop) and the file ends mid-statement, so it has not been reformatted or changed.
int main(void) { uint32_t iterations_id; uint64_t aggregated_value; int n = elements; //define number of blocks and number of threads per block (kernel parameters) dim3 threads_per_block (1); dim3 blocks ((elements+threads_per_block.x-1)/threads_per_block.x); gettimeofday(&start, NULL); // create input streams: for (iterations_id=0; iterations_id>>(dev_window, dev_data, n); cudaDeviceSynchronize(); //copy back the result to the CPU cudaMemcpy(h_data, dev_data, blocks.x*sizeof(uint32_t), cudaMemcpyDeviceToHost); aggregated_value = 0; for(int i=0; i