123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129 |
- #include <stdio.h>
- #include<stdlib.h>
- #include <sys/time.h>
- // platform independent data types:
- #include <stdint.h>
// Number of iterations
#define NUM_ITERATIONS 10
// The window height is equal to the number of input streams
#define WINDOW_HEIGHT 4
#define NUM_INPUT_STREAMS WINDOW_HEIGHT
// The window width is equal to the number of tuples required to fill a window
#define WINDOW_WIDTH 100
// The total number of elements is the window height times the window width.
// The expansion is parenthesized so the macro keeps its value when it is used
// inside a larger expression (e.g. `x / elements` or `x % elements`).
#define elements (WINDOW_HEIGHT * WINDOW_WIDTH)
// Timestamps used to measure the elapsed wall-clock time:
struct timeval start, end;
// Output stream (aggregator): per-block sum reduction.
//
// Every block sums its own slice of `window` -- up to blockDim.x consecutive
// elements starting at blockIdx.x * blockDim.x -- and thread 0 stores that
// partial sum in dev_data[blockIdx.x]. The caller adds up the partial sums.
//
// Launch layout : 1D grid of 1D blocks; any block size is accepted (it does
//                 not have to be a power of two, and n does not have to be a
//                 multiple of it). No shared memory is used.
// window        : device array of n values. The reduction is done in place,
//                 so its contents are overwritten with intermediate sums.
// dev_data      : device array indexed by blockIdx.x; one entry is written
//                 per block that contains at least one valid element. Blocks
//                 that start at or beyond n write nothing.
// n             : number of valid elements in `window`.
//
// Sums are accumulated in uint32_t and therefore wrap modulo 2^32.
__global__ void output_thread_aggregation(uint32_t *window, uint32_t *dev_data, int n) {
    const unsigned int tid = threadIdx.x;
    const size_t block_start = (size_t)blockIdx.x * blockDim.x;
    const size_t total = (n > 0) ? (size_t)n : 0;

    // This condition is identical for every thread of the block, so either
    // the whole block leaves here or the whole block reaches the barriers
    // below. (A per-thread `index >= n` exit would leave some threads of a
    // partially filled block out of __syncthreads(), which is undefined.)
    if (block_start >= total) {
        return;
    }

    uint32_t *idata = window + block_start;

    // Number of valid elements owned by this block (smaller than blockDim.x
    // only for the last, partially filled block).
    const size_t remaining = total - block_start;
    unsigned int count = (remaining < blockDim.x) ? (unsigned int)remaining : blockDim.x;

    // Tree reduction over `count` elements. Each step folds the upper part
    // [half, count) onto the lower part [0, count / 2); for an odd count the
    // middle element is carried over unchanged. Reads (index >= half) and
    // writes (index < count / 2 <= half) never overlap within a step, and no
    // access goes beyond the block's valid elements.
    while (count > 1) {
        const unsigned int half = (count + 1) / 2;
        if (tid < count / 2) {
            idata[tid] += idata[tid + half];
        }
        // `count` is block-uniform, so every thread executes this barrier the
        // same number of times.
        __syncthreads();
        count = half;
    }

    if (tid == 0) {
        dev_data[blockIdx.x] = idata[0];
    }
}
// Abort with a diagnostic if a CUDA runtime call (or kernel launch) failed.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

// Benchmark driver: for NUM_ITERATIONS rounds, fills a window of `elements`
// random values, lets the GPU compute per-block partial sums, adds those up on
// the host and prints the result next to the CPU reference sum ("TEST" line).
// Finally prints the elapsed wall-clock time in microseconds.
// Returns 0 on success, 1 if any GPU result differed from its CPU reference;
// allocation and CUDA failures terminate the program with exit status 1.
int main(void) {
    uint32_t iterations_id;
    uint64_t aggregated_value;
    int exit_status = 0;

    int n = elements;

    // Kernel launch parameters. With one thread per block every block simply
    // forwards its single element, so the summation itself happens on the host.
    dim3 threads_per_block (1);
    dim3 blocks ((n + threads_per_block.x - 1) / threads_per_block.x);

    const size_t window_bytes  = (size_t)n * sizeof(uint32_t);
    const size_t partial_bytes = (size_t)blocks.x * sizeof(uint32_t);

    gettimeofday(&start, NULL);

    // Host and device buffers are allocated once and reused by every
    // iteration; each iteration overwrites them completely.
    uint32_t *window = (uint32_t *)malloc(window_bytes);
    if (!window) {
        printf("Allocation error for window - aborting.\n");
        exit(1);
    }

    // Per-block partial sums copied back from the GPU.
    uint32_t *h_data = (uint32_t *)malloc(partial_bytes);
    if (!h_data) {
        printf("Allocation error for h_data - aborting.\n");
        exit(1);
    }

    // GPU memory pointers
    uint32_t *dev_window = NULL;
    uint32_t *dev_data = NULL;
    check_cuda(cudaMalloc((void**)&dev_window, window_bytes), "cudaMalloc(dev_window)");
    check_cuda(cudaMalloc((void**)&dev_data, partial_bytes), "cudaMalloc(dev_data)");

    // create input streams:
    for (iterations_id = 0; iterations_id < NUM_ITERATIONS; iterations_id++) {
        // CPU reference sum of this window.
        uint64_t ag_val = 0;

        // initialization - fill in the window with random numbers:
        for (int i = 0; i < n; i++) {
            window[i] = (uint32_t)(rand() % 1000);
            ag_val += window[i];
        }
        // uint64_t is printed through unsigned long long so that the format
        // specifier matches the argument on every platform.
        printf("TEST %llu\n", (unsigned long long)ag_val);

        // copy the array 'window' to the GPU
        check_cuda(cudaMemcpy(dev_window, window, window_bytes, cudaMemcpyHostToDevice),
                   "cudaMemcpy(host to device)");

        // launch kernel; launch-configuration errors show up in
        // cudaGetLastError(), execution errors at the synchronization.
        output_thread_aggregation<<<blocks,threads_per_block>>>(dev_window, dev_data, n);
        check_cuda(cudaGetLastError(), "kernel launch");
        check_cuda(cudaDeviceSynchronize(), "kernel execution");

        // copy back the result to the CPU (blocking copy, no further sync needed)
        check_cuda(cudaMemcpy(h_data, dev_data, partial_bytes, cudaMemcpyDeviceToHost),
                   "cudaMemcpy(device to host)");

        // final reduction of the per-block partial sums on the host
        aggregated_value = 0;
        for (unsigned int i = 0; i < blocks.x; i++) {
            aggregated_value += h_data[i];
        }

        printf("iter: %u - aggregated value: %llu\n",
               iterations_id, (unsigned long long)aggregated_value);

        if (aggregated_value != ag_val) {
            fprintf(stderr, "iter: %u - GPU result does not match the CPU reference\n",
                    iterations_id);
            exit_status = 1;
        }
    }

    // free the memory allocated on the GPU and on the host
    check_cuda(cudaFree(dev_window), "cudaFree(dev_window)");
    check_cuda(cudaFree(dev_data), "cudaFree(dev_data)");
    free(h_data);
    free(window);

    gettimeofday(&end, NULL);
    long long elapsed_usec = (long long)(end.tv_sec - start.tv_sec) * 1000000LL
                           + (long long)(end.tv_usec - start.tv_usec);
    printf("usec: %lld\n", elapsed_usec);
    return exit_status;
}
|