|
@@ -0,0 +1,128 @@
|
|
|
|
+#include <stdio.h>
|
|
|
|
+#include<stdlib.h>
|
|
|
|
+#include <sys/time.h>
|
|
|
|
+// platform independent data types:
|
|
|
|
+#include <stdint.h>
|
|
|
|
+
|
|
|
|
// Number of benchmark iterations
#define NUM_ITERATIONS 10

// The window height is equal to the number of input streams
#define WINDOW_HEIGHT 4
#define NUM_INPUT_STREAMS WINDOW_HEIGHT

// The window width is equal to the number of tuples required to fill a window
#define WINDOW_WIDTH 100

// The total number of elements is equal to window_height times window_width.
// Parenthesized so the macro behaves as a single value inside any expression
// (e.g. `x / elements`), not as `x / WINDOW_HEIGHT * WINDOW_WIDTH`.
#define elements (WINDOW_HEIGHT * WINDOW_WIDTH)

// Wall-clock timestamps used to measure elapsed time
struct timeval start, end;
|
|
|
|
+
|
|
|
|
+
|
|
|
|
/*
 * Output stream aggregator: per-block sum reduction.
 *
 * Each block sums the slice of `window` it covers -- blockDim.x consecutive
 * elements starting at blockIdx.x * blockDim.x, clipped to `n` -- and thread 0
 * stores that total in dev_data[blockIdx.x].
 *
 * Parameters:
 *   window   - input values; reduced IN PLACE, so its contents are overwritten
 *              with partial sums.
 *   dev_data - receives one partial sum per block that covers at least one
 *              element; blocks lying entirely past `n` write nothing.
 *   n        - number of valid elements in `window`.
 *
 * Launch layout: 1D grid of 1D blocks. Any block size works (it does not have
 * to be a power of two) and `n` does not have to be a multiple of the block
 * size. Sums are 32-bit and wrap modulo 2^32.
 */
__global__ void output_thread_aggregation(uint32_t *window, uint32_t *dev_data, int n) {

    const unsigned int tid = threadIdx.x;
    // widen before multiplying so large grids cannot overflow 32 bits
    const size_t block_start = (size_t)blockIdx.x * blockDim.x;

    // This test is uniform across the block: either every thread leaves here
    // or none does, so no thread is left waiting at the barrier below.
    if (n <= 0 || block_start >= (size_t)n) return;

    uint32_t *idata = window + block_start;

    // number of valid elements owned by this block (smaller for the tail block)
    const size_t remaining = (size_t)n - block_start;
    unsigned int active = remaining < blockDim.x ? (unsigned int)remaining : blockDim.x;

    // Fold the upper part of the active range onto the lower part until one
    // element is left. Rounding the half up keeps the middle element when the
    // count is odd. `active` is identical in all threads, so every thread of
    // the block reaches __syncthreads() the same number of times.
    while (active > 1) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active) {
            idata[tid] += idata[tid + half];
        }
        __syncthreads();
        active = half;
    }

    if (tid == 0) {
        dev_data[blockIdx.x] = idata[0];
    }

}
|
|
|
|
+
|
|
|
|
// Abort with a diagnostic when a CUDA runtime call reports failure.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

/*
 * Benchmark driver.
 *
 * For NUM_ITERATIONS rounds: fill a host window with random values below 1000,
 * print the host-side reference sum ("TEST"), copy the window to the GPU, run
 * output_thread_aggregation, copy the per-block partial sums back, add them up
 * on the host and print the result. Finally print the elapsed wall-clock time
 * in microseconds.
 *
 * Returns 0 on success; exits with status 1 on any allocation or CUDA error.
 */
int main(void) {

    uint32_t iterations_id;
    uint64_t aggregated_value;

    const int n = elements;
    const size_t window_bytes = (size_t)n * sizeof(uint32_t);

    //define number of blocks and number of threads per block (kernel parameters)
    // NOTE(review): with a single thread per block the kernel's reduction loop
    // never runs; each block just forwards one element and all of the summing
    // happens in the host loop below. Consider a larger block size.
    dim3 threads_per_block (1);
    dim3 blocks ((n+threads_per_block.x-1)/threads_per_block.x);
    const size_t partial_bytes = (size_t)blocks.x * sizeof(uint32_t);

    gettimeofday(&start, NULL);

    // Buffers have the same size in every iteration, so they are allocated
    // once here and released once after the loop.

    // dynamic allocation of the memory needed for all the elements
    uint32_t *window = (uint32_t*)calloc(n, sizeof(uint32_t));

    // check if there's enough space for the allocation
    if(!window){
        printf("Allocation error for window - aborting.\n");
        exit(1);
    }

    // dynamic allocation of the reduced data matrix
    uint32_t *h_data = (uint32_t *)malloc(partial_bytes);

    if(!h_data){
        printf("Allocation error for h_data - aborting.\n");
        exit(1);
    }

    //GPU memory pointers
    uint32_t *dev_window = NULL;
    uint32_t *dev_data = NULL;

    //allocate the memory on the GPU
    check_cuda(cudaMalloc((void**)&dev_window, window_bytes), "cudaMalloc(dev_window)");
    check_cuda(cudaMalloc((void**)&dev_data, partial_bytes), "cudaMalloc(dev_data)");

    // create input streams:
    for (iterations_id=0; iterations_id<NUM_ITERATIONS; iterations_id++) {

        // host-side reference sum of the values generated for this iteration
        uint64_t ag_val = 0;

        // initialization - fill in the window with random numbers:
        for (int i = 0; i < n; i++) {
            window[i] = (rand()%1000);
            ag_val += window[i];
        }
        printf("TEST %llu\n", (unsigned long long)ag_val);

        //copy the array 'window' to the GPU
        check_cuda(cudaMemcpy(dev_window, window, window_bytes, cudaMemcpyHostToDevice),
                   "cudaMemcpy(host to device)");

        //launch kernel
        output_thread_aggregation<<<blocks,threads_per_block>>>(dev_window, dev_data, n);
        // launch-configuration errors are only visible through cudaGetLastError
        check_cuda(cudaGetLastError(), "kernel launch");
        // errors raised while the kernel runs surface at the next synchronization
        check_cuda(cudaDeviceSynchronize(), "kernel execution");

        //copy back the result to the CPU
        check_cuda(cudaMemcpy(h_data, dev_data, partial_bytes, cudaMemcpyDeviceToHost),
                   "cudaMemcpy(device to host)");

        // finish the reduction on the host: add up the per-block partial sums
        aggregated_value = 0;

        for(unsigned int i=0; i<blocks.x; i++){
            aggregated_value += h_data[i];
        }

        printf("iter: %u - aggregated value: %llu\n",
               (unsigned int)iterations_id, (unsigned long long)aggregated_value);

    }

    //free the memory allocated on the GPU
    check_cuda(cudaFree(dev_window), "cudaFree(dev_window)");
    check_cuda(cudaFree(dev_data), "cudaFree(dev_data)");

    //free the memory allocated on the host
    free(h_data);
    free(window);

    gettimeofday(&end, NULL);

    const long elapsed_usec = (long)(end.tv_sec - start.tv_sec) * 1000000L
                            + (long)(end.tv_usec - start.tv_usec);
    printf("usec: %ld\n", elapsed_usec);

    return 0;

}
|
|
|
|
+
|