// reduction.cu (3.2 KB)
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/time.h>
  // platform independent data types:
  #include <stdint.h>
  // printf format macros for the fixed-width types (PRIu32, PRIu64):
  #include <inttypes.h>
  // Number of times the fill / reduce / report cycle is repeated.
  #define NUM_ITERATIONS 10
  // The window height; it is equal to the number of input streams.
  #define WINDOW_HEIGHT 4
  #define NUM_INPUT_STREAMS WINDOW_HEIGHT
  // The window width: the number of tuples per stream required to fill a window.
  #define WINDOW_WIDTH 100
  // Total number of elements in a window (height times width).
  // Parenthesized so the macro expands safely inside larger expressions
  // such as `x / elements` or `x % elements`.
  #define elements ((WINDOW_HEIGHT) * (WINDOW_WIDTH))
  // Wall-clock timestamps used to measure the run time.
  struct timeval start, end;
  // Output streams (aggregator): per-block sum reduction.
  //
  // Block b sums the slice of `window` that starts at b * blockDim.x and holds
  // up to blockDim.x elements (fewer for the last block when n is not a
  // multiple of blockDim.x), then stores that partial sum in dev_data[b].
  //
  // Parameters:
  //   window   - device array of n values; it is used as scratch space and is
  //              overwritten by the reduction.
  //   dev_data - device array receiving one partial sum per block that covers
  //              at least one element; blocks that start at or beyond n write
  //              nothing.
  //   n        - number of valid elements in `window`.
  //
  // Launch layout: indexing uses only the x dimension of the grid and block.
  // Any blockDim.x works (it does not have to be a power of two). No shared
  // memory is used. Sums are accumulated in uint32_t and wrap on overflow.
  __global__ void output_thread_aggregation(uint32_t *window, uint32_t *dev_data, int n) {
    const unsigned int tid = threadIdx.x;
    // 64-bit arithmetic so the block offset cannot overflow for large grids.
    const size_t first = (size_t)blockIdx.x * blockDim.x;

    // This test depends only on blockIdx, so either every thread of the block
    // leaves here or none does; no thread is left waiting at a barrier below.
    if (n <= 0 || first >= (size_t)n) {
      return;
    }

    // Number of valid elements in this block's slice.
    const size_t remaining = (size_t)n - first;
    unsigned int live = remaining < blockDim.x ? (unsigned int)remaining : blockDim.x;
    uint32_t *idata = window + first;

    // Fold the upper part of the live range onto the lower part until a single
    // value is left. With an odd count the middle element is carried over to
    // the next round untouched, so no element is ever dropped.
    while (live > 1) {
      const unsigned int half = (live + 1) / 2;
      if (tid < live / 2) {
        idata[tid] += idata[tid + half];
      }
      // Reached by every thread of the block: `live` is identical across them.
      __syncthreads();
      live = half;
    }

    if (tid == 0) {
      dev_data[blockIdx.x] = idata[0];
    }
  }
  // Abort with a readable message when a CUDA runtime call reports an error.
  static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
      fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
      exit(1);
    }
  }

  // Fills a window with random values NUM_ITERATIONS times, reduces each window
  // on the GPU into one partial sum per block, adds the partial sums on the
  // host, and prints the CPU reference ("TEST"), the GPU result, and the total
  // elapsed wall-clock time in microseconds.
  int main(void) {
    const unsigned int total = (elements);
    const int n = (int)total;

    // Kernel launch parameters. The block size is the largest power of two that
    // divides the element count (capped at 256), so every block is full and the
    // pairwise reduction tree is balanced.
    unsigned int block_size = total & (~total + 1u);  // lowest set bit of total
    if (block_size == 0) {
      block_size = 1;
    }
    if (block_size > 256) {
      block_size = 256;
    }
    dim3 threads_per_block(block_size);
    dim3 blocks((total + threads_per_block.x - 1) / threads_per_block.x);

    const size_t window_bytes = (size_t)total * sizeof(uint32_t);
    const size_t partial_bytes = (size_t)blocks.x * sizeof(uint32_t);

    gettimeofday(&start, NULL);

    // Host and device buffers are allocated once and reused by every iteration.
    uint32_t *window = (uint32_t *)calloc(total, sizeof(uint32_t));
    if (!window) {
      printf("Allocation error for window - aborting.\n");
      exit(1);
    }
    // Per-block partial sums copied back from the GPU.
    uint32_t *h_data = (uint32_t *)malloc(partial_bytes);
    if (!h_data) {
      printf("Allocation error for h_data - aborting.\n");
      exit(1);
    }

    // GPU memory pointers.
    uint32_t *dev_window = NULL;
    uint32_t *dev_data = NULL;
    check_cuda(cudaMalloc((void **)&dev_window, window_bytes), "cudaMalloc(dev_window)");
    check_cuda(cudaMalloc((void **)&dev_data, partial_bytes), "cudaMalloc(dev_data)");

    for (uint32_t iterations_id = 0; iterations_id < NUM_ITERATIONS; iterations_id++) {
      // Fill the window with random numbers and keep a CPU reference sum.
      uint64_t ag_val = 0;
      for (unsigned int i = 0; i < total; i++) {
        window[i] = (uint32_t)(rand() % 1000);
        ag_val += window[i];
      }
      printf("TEST %" PRIu64 "\n", ag_val);

      // Copy the window to the GPU; the kernel overwrites the device copy, so
      // it has to be refreshed on every iteration.
      check_cuda(cudaMemcpy(dev_window, window, window_bytes, cudaMemcpyHostToDevice),
                 "cudaMemcpy(host to device)");

      output_thread_aggregation<<<blocks, threads_per_block>>>(dev_window, dev_data, n);
      // A launch returns no status itself: configuration errors are reported by
      // cudaGetLastError(), execution errors by the following synchronization.
      check_cuda(cudaGetLastError(), "kernel launch");
      check_cuda(cudaDeviceSynchronize(), "kernel execution");

      // Copy the per-block partial sums back and finish the reduction on the host.
      check_cuda(cudaMemcpy(h_data, dev_data, partial_bytes, cudaMemcpyDeviceToHost),
                 "cudaMemcpy(device to host)");
      uint64_t aggregated_value = 0;
      for (unsigned int i = 0; i < blocks.x; i++) {
        aggregated_value += h_data[i];
      }
      printf("iter: %" PRIu32 " - aggregated value: %" PRIu64 "\n", iterations_id, aggregated_value);

      if (aggregated_value != ag_val) {
        fprintf(stderr, "warning: GPU result differs from the CPU reference in iteration %" PRIu32 "\n",
                iterations_id);
      }
    }

    // Release the GPU and host memory.
    check_cuda(cudaFree(dev_window), "cudaFree(dev_window)");
    check_cuda(cudaFree(dev_data), "cudaFree(dev_data)");
    free(h_data);
    free(window);

    gettimeofday(&end, NULL);
    // 64-bit arithmetic so the microsecond count cannot overflow a 32-bit long.
    const long long elapsed_usec =
        ((long long)end.tv_sec - (long long)start.tv_sec) * 1000000LL +
        ((long long)end.tv_usec - (long long)start.tv_usec);
    printf("usec: %lld\n", elapsed_usec);
    return 0;
  }