output_thread_aggregation.cu 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. #include<stdio.h>
  2. #include <starpu.h>
  3. #include <math.h>
  4. __global__ void output_thread_aggregation(uint32_t *window, uint32_t *dev_data, int n)
  5. {
  6. uint tid = threadIdx.x;
  7. uint index = blockIdx.x*blockDim.x + threadIdx.x;
  8. // convert global data pointer to the local pointer of the block
  9. uint32_t *idata = window + blockIdx.x * blockDim.x;
  10. if (index >= n) return;
  11. // reduction algorithm
  12. for (int stride = blockDim.x/2; stride > 0; stride>>=1){
  13. if (tid < stride){
  14. idata[tid] += idata[tid+stride];
  15. }
  16. __syncthreads();
  17. }
  18. // write result for this block to global memory
  19. if(tid == 0){
  20. dev_data[blockIdx.x] = idata[0];
  21. }
  22. }
  23. extern "C" void output_thread_aggregation(void *buffers[], void *_args)
  24. {
  25. printf("cuda function\n");
  26. uint64_t *aggregated_value = (uint64_t *)_args;
  27. /* length of the vector */
  28. int n = STARPU_VECTOR_GET_NX(buffers[0]);
  29. uint32_t *window = (uint32_t *)STARPU_VECTOR_GET_PTR(buffers[0]);
  30. /* define the number of threads per block accordingly to the vector's size */
  31. int n_threads;
  32. if (sqrt(n) <= 32)
  33. {n_threads = 32;}
  34. else if (sqrt(n) <= 64)
  35. {n_threads = 64;}
  36. else if (sqrt(n) <= 128)
  37. {n_threads = 128;}
  38. else if (sqrt(n) <= 256)
  39. {n_threads = 256;}
  40. else if (sqrt(n) <= 512)
  41. {n_threads = 512;}
  42. else
  43. {n_threads = 1024;}
  44. printf("n_threads = %lu\n",n_threads);
  45. // define number of blocks and number of threads per block (kernel parameters)
  46. dim3 threads_per_block (n_threads);
  47. dim3 blocks ((n+threads_per_block.x-1)/threads_per_block.x);
  48. // dynamic allocation of the reduced data matrix
  49. uint32_t *h_data = (uint32_t *)malloc(blocks.x*sizeof(uint32_t));
  50. if(!h_data){
  51. printf("Allocation error for h_data - aborting.\n");
  52. exit(1);
  53. }
  54. // GPU memory pointers
  55. uint32_t *dev_window;
  56. uint32_t *dev_data;
  57. // allocate the memory on the GPU
  58. cudaMalloc((void**)&dev_window, n*sizeof(uint32_t));
  59. cudaMalloc((void**)&dev_data, blocks.x*sizeof(uint32_t));
  60. // copy the array 'window' to the GPU
  61. cudaMemcpyAsync(dev_window, window, n*sizeof(uint32_t), cudaMemcpyHostToDevice);
  62. // launch kernel
  63. output_thread_aggregation<<<blocks,threads_per_block>>>(dev_window, dev_data, n);
  64. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  65. // copy back the result to the CPU
  66. cudaMemcpyAsync(h_data, dev_data, blocks.x*sizeof(uint32_t), cudaMemcpyDeviceToHost);
  67. uint64_t gpu_sum = 0;
  68. // compute the total sum from gpu
  69. for(int i=0; i<blocks.x; i++){
  70. gpu_sum += h_data[i];
  71. }
  72. // copy the sum to the main program
  73. *aggregated_value = gpu_sum;
  74. //free the memory allocated on the GPU
  75. cudaFree(dev_window);
  76. cudaFree(dev_data);
  77. }