// axpy_partition_gpu.h

#pragma once

// Returns the hardware SM id of the multiprocessor executing the calling thread,
// read from the PTX special register %smid. Host-side compilation falls back to 0.
__device__ static uint get_smid(void) {
#if defined(__CUDACC__)
    uint ret;
    asm("mov.u32 %0, %%smid;" : "=r"(ret));
    return ret;
#else
    return 0;
#endif
}
// Extra arguments threaded through a partitioned launch:
//   __P_HKARGS - host-side actual arguments appended to the kernel call
//   __P_KARGS  - device-side formal parameters matching __P_HKARGS
//   __P_DARGS  - arguments forwarded to device helper functions
#define __P_HKARGS dimGrid, active_blocks, occupancy, block_assignment_d, mapping_start
#define __P_KARGS  dim3 blocks, int active_blocks, int occupancy, unsigned int* block_assignment, int mapping_start
#define __P_DARGS  blocks, blockid

// Kernel prologue: thread 0 of each physical block claims a per-SM slot by
// atomically decrementing its SM's counter (initialised to occupancy-1 on
// assigned SMs, (unsigned)-1 elsewhere). atomicDec wraps to 0xDEADBEEF once the
// counter is exhausted, so surplus blocks and blocks on unassigned SMs read a
// value larger than active_blocks and exit immediately.
#define __P_BEGIN \
    __shared__ unsigned int block_start; \
    int smid = get_smid(); \
    if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) \
    { \
        block_start = atomicDec(&block_assignment[smid], 0xDEADBEEF); \
    } \
    __syncthreads(); \
    \
    if(block_start > active_blocks) \
    { \
        return; \
    }
// 2D grid iteration: each surviving block walks the logical (blocks.x x blocks.y)
// grid with a stride of active_blocks, starting from its claimed slot.
#define __P_LOOPXY \
    dim3 blockid; \
    blockid.z = 0; \
    \
    int gridDim_sum = blocks.x*blocks.y; \
    int startBlock = block_start + (smid - mapping_start) * occupancy; \
    \
    for(int blockid_sum = startBlock; blockid_sum < gridDim_sum; blockid_sum += active_blocks) \
    { \
        blockid.x = blockid_sum % blocks.x; \
        blockid.y = blockid_sum / blocks.x;

#define __P_LOOPEND }
// Needed if shared memory is used
#define __P_LOOPEND_SAFE __syncthreads(); }
// 1D grid iteration: same idea as __P_LOOPXY for a (blocks.x x 1 x 1) logical grid.
#define __P_LOOPX \
    dim3 blockid; \
    blockid.z = 0; \
    blockid.y = 0; \
    int gridDim_sum = blocks.x; \
    int startBlock = (smid - mapping_start) + block_start*(active_blocks/occupancy); \
    \
    for(int blockid_sum = startBlock; blockid_sum < gridDim_sum; blockid_sum += active_blocks) \
    { \
        blockid.x = blockid_sum;

// Alternative slot computation (unused):
// int startBlock = block_start + (smid - mapping_start) * occupancy;
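// Usage sketch (not part of the original header; my assumption of how the device-side
// macros compose): a 1D axpy kernel whose logical grid is iterated only by blocks
// resident on the assigned SMs. The names partitioned_axpy, n, a, x, y are
// illustrative only. Kept under #if 0 as an example.
#if 0
__global__ void partitioned_axpy(__P_KARGS, int n, double a, const double* x, double* y)
{
    __P_BEGIN                                        // claim a per-SM slot or exit
    __P_LOOPX                                        // iterate logical block indices
        int i = blockid.x * blockDim.x + threadIdx.x;
        if (i < n)
            y[i] = a * x[i] + y[i];
    __P_LOOPEND
}
#endif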
//////////// HOST side functions

// Computes the per-SM block assignment table and the physical launch width for a
// kernel restricted to SMs [mapping_start, mapping_start + allocation).
template <typename F>
static void buildPartitionedBlockMapping(F cudaFun, int threads, int shmem, int mapping_start, int allocation,
    int &width, int &active_blocks, unsigned int *block_assignment_d, cudaStream_t current_stream = cudaStreamPerThread)
{
    int occupancy;
    int nb_SM = 13; // TODO: replace with a device-property query (see sketch below)
    int mapping_end = mapping_start + allocation - 1; // inclusive: last SM index in the allocation
    unsigned int block_assignment[15]; // must hold at least nb_SM entries

    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, cudaFun, threads, shmem);
    //occupancy = 4;
    width = occupancy * nb_SM;              // Physical wrapper grid size; fits the GPU exactly
    active_blocks = occupancy * allocation; // Total number of blocks doing work

    // SMs outside the allocation get (unsigned)-1 so their blocks exit in __P_BEGIN;
    // assigned SMs start their counter at occupancy-1.
    for(int i = 0; i < mapping_start; i++)
        block_assignment[i] = (unsigned) -1;
    for(int i = mapping_start; i <= mapping_end; i++)
    {
        block_assignment[i] = occupancy - 1;
    }
    for(int i = mapping_end+1; i < nb_SM; i++)
        block_assignment[i] = (unsigned) -1;

    cudaMemcpyAsync((void*)block_assignment_d, block_assignment, sizeof(block_assignment), cudaMemcpyHostToDevice, current_stream);
    //cudaMemcpy((void*)block_assignment_d,block_assignment,sizeof(block_assignment),cudaMemcpyHostToDevice);
}
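// Sketch for the TODO above (an assumption, not in the original code): the hard-coded
// nb_SM = 13 could be replaced inside buildPartitionedBlockMapping by querying the
// current device's multiprocessor count. Kept under #if 0 as an example.
#if 0
int nb_SM = 0, device = 0;
cudaGetDevice(&device);
cudaDeviceGetAttribute(&nb_SM, cudaDevAttrMultiProcessorCount, device);
#endif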
// Host-side setup: allocates the device-side assignment table, builds the mapping,
// and defines width / active_blocks / occupancy / dimGrid / mapping_start for the launch.
#define __P_HOSTSETUP(KERNEL,GRIDDIM,BLOCKSIZE,SHMEMSIZE,MAPPING_START,MAPPING_END,STREAM) \
    unsigned int* block_assignment_d; cudaMalloc((void**) &block_assignment_d, 15*sizeof(unsigned int)); \
    int width = 0; \
    int active_blocks = 0; \
    buildPartitionedBlockMapping(KERNEL,BLOCKSIZE,SHMEMSIZE,(MAPPING_START),(MAPPING_END)-(MAPPING_START), \
        width, active_blocks, block_assignment_d, STREAM); \
    int occupancy = active_blocks/((MAPPING_END)-(MAPPING_START)); \
    dim3 dimGrid = (GRIDDIM); \
    int mapping_start = (MAPPING_START);
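// Host-side usage sketch (an assumption, not from the original file): set up the
// mapping for SMs [2, 8) and launch the partitioned_axpy kernel sketched above with
// the physical wrapper grid of `width` blocks; the logical grid travels in __P_HKARGS.
// launch_partitioned_axpy, n, a, x_d, y_d and stream are illustrative names.
#if 0
static void launch_partitioned_axpy(int n, double a, const double* x_d, double* y_d, cudaStream_t stream)
{
    __P_HOSTSETUP(partitioned_axpy, dim3((n + 255) / 256), 256, 0, 2, 8, stream);
    partitioned_axpy<<<width, 256, 0, stream>>>(__P_HKARGS, n, a, x_d, y_d);
    cudaFree(block_assignment_d); // allocated by __P_HOSTSETUP
}
#endif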