axpy_partition_gpu.h

/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2016 Inria
 * Copyright (C) 2017 CNRS
 * Copyright (C) 2016,2019 Université de Bordeaux
 * Copyright (C) 2016 Uppsala University
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */
/*
 * This example creates two dumb vectors, splits them into chunks, and for
 * each pair of chunks runs axpy on them.
 */
#pragma once

/* Return the ID of the streaming multiprocessor (SM) the calling thread
 * block is currently running on. */
__device__ static uint get_smid(void)
{
#if defined(__CUDACC__)
	uint ret;
	asm("mov.u32 %0, %%smid;" : "=r"(ret));
	return ret;
#else
	return 0;
#endif
}
/* Extra arguments passed from the host-side launch to a partitioned kernel:
 * __P_HKARGS is the actual argument list, __P_KARGS the matching formal
 * parameter list. */
#define __P_HKARGS dimGrid, active_blocks, occupancy, block_assignment_d, mapping_start
#define __P_KARGS dim3 blocks, int active_blocks, int occupancy, unsigned int* block_assignment, int mapping_start

#define __P_DARGS blocks, blockid

/* Kernel prologue: each physical thread block draws a ticket on its SM by
 * atomically decrementing the per-SM counter. Blocks whose ticket exceeds
 * active_blocks (surplus blocks on a mapped SM, or any block on an unmapped
 * SM) return immediately. */
#define __P_BEGIN \
	__shared__ unsigned int block_start; \
	int smid = get_smid(); \
	if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) \
	{ \
		block_start = atomicDec(&block_assignment[smid], 0xDEADBEEF); \
	} \
	__syncthreads(); \
	\
	if(block_start > active_blocks) \
	{ \
		return; \
	}
/* Iterate over a logical 2D grid of blocks.x * blocks.y blocks: every
 * surviving physical block starts at its own unique index and walks the
 * logical grid with stride active_blocks. */
#define __P_LOOPXY \
	dim3 blockid; \
	blockid.z = 0; \
	\
	int gridDim_sum = blocks.x*blocks.y; \
	int startBlock = block_start + (smid - mapping_start) * occupancy; \
	\
	for(int blockid_sum = startBlock; blockid_sum < gridDim_sum; blockid_sum += active_blocks) \
	{ \
		blockid.x = blockid_sum % blocks.x; \
		blockid.y = blockid_sum / blocks.x;

#define __P_LOOPEND }
// Needed if shared memory is used
#define __P_LOOPEND_SAFE __syncthreads(); }
/* Same as __P_LOOPXY, but for a 1D logical grid (blocks.x only). */
#define __P_LOOPX \
	dim3 blockid; \
	blockid.z = 0; \
	blockid.y = 0; \
	int gridDim_sum = blocks.x; \
	int startBlock = (smid - mapping_start) + block_start*(active_blocks/occupancy); \
	\
	for(int blockid_sum = startBlock; blockid_sum < gridDim_sum; blockid_sum += active_blocks) \
	{ \
		blockid.x = blockid_sum;

// Alternative start-block computation, kept for reference:
// int startBlock = block_start + (smid - mapping_start) * occupancy;
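/*
 * Illustrative device-side usage (a sketch, not part of this header): a kernel
 * built with these macros takes its usual arguments plus __P_KARGS, opens with
 * __P_BEGIN, and wraps the per-block work in __P_LOOPX/__P_LOOPEND. The kernel
 * name and the axpy body below are hypothetical.
 *
 *   static __global__ void axpy_partitioned(int n, float a, float *x, float *y, __P_KARGS)
 *   {
 *       __P_BEGIN
 *       __P_LOOPX
 *           int i = blockid.x * blockDim.x + threadIdx.x;
 *           if (i < n)
 *               y[i] = a * x[i] + y[i];
 *       __P_LOOPEND
 *   }
 */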
//////////// HOST side functions

/* Fill the per-SM block_assignment table on the host and upload it to the
 * device. SMs in [mapping_start, mapping_end] each receive 'occupancy' worker
 * blocks; all other SMs are marked unused ((unsigned) -1). */
template <typename F>
static void buildPartitionedBlockMapping(F cudaFun, int threads, int shmem, int mapping_start, int allocation,
		int &width, int &active_blocks, unsigned int *block_assignment_d, cudaStream_t current_stream =
#ifdef cudaStreamPerThread
		cudaStreamPerThread
#else
		NULL
#endif
		)
{
	int occupancy;
	int nb_SM = 13; //TODO: replace with call
	int mapping_end = mapping_start + allocation - 1; // inclusive
	unsigned int block_assignment[15];

#if CUDART_VERSION >= 6050
	cudaOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, cudaFun, threads, shmem);
#else
	occupancy = 4;
#endif
	width = occupancy * nb_SM; // Physical wrapper grid size. Fits the GPU exactly.
	active_blocks = occupancy*allocation; // The total number of blocks doing work

	for(int i = 0; i < mapping_start; i++)
		block_assignment[i] = (unsigned) -1;

	for(int i = mapping_start; i <= mapping_end; i++)
	{
		block_assignment[i] = occupancy - 1;
	}

	for(int i = mapping_end+1; i < nb_SM; i++)
		block_assignment[i] = (unsigned) -1;

	cudaMemcpyAsync((void*)block_assignment_d, block_assignment, sizeof(block_assignment), cudaMemcpyHostToDevice, current_stream);
	//cudaMemcpy((void*)block_assignment_d,block_assignment,sizeof(block_assignment),cudaMemcpyHostToDevice);
	//cudaThreadSynchronize();
}
/* Host-side setup: allocate and fill the per-SM assignment table, derive the
 * occupancy, and define the variables expected by __P_HKARGS. The physical
 * launch grid is 'width' blocks; the logical grid is GRIDDIM. */
#define __P_HOSTSETUP(KERNEL,GRIDDIM,BLOCKSIZE,SHMEMSIZE,MAPPING_START,MAPPING_END,STREAM) \
	unsigned int* block_assignment_d; cudaMalloc((void**) &block_assignment_d, 15*sizeof(unsigned int)); \
	int width = 0; \
	int active_blocks = 0; \
	buildPartitionedBlockMapping(KERNEL, BLOCKSIZE, SHMEMSIZE, (MAPPING_START), (MAPPING_END)-(MAPPING_START), \
			width, active_blocks, block_assignment_d, STREAM); \
	int occupancy = active_blocks/((MAPPING_END)-(MAPPING_START)); \
	dim3 dimGrid = (GRIDDIM); \
	int mapping_start = (MAPPING_START);
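
/*
 * Illustrative host-side usage (a sketch; the kernel name, stream, and sizes
 * are hypothetical): __P_HOSTSETUP defines width, active_blocks, occupancy,
 * block_assignment_d, dimGrid and mapping_start, which __P_HKARGS then passes
 * to the kernel. Note that the kernel is launched on the physical grid 'width'
 * while the logical grid GRIDDIM is walked inside the kernel by __P_LOOPX.
 *
 *   __P_HOSTSETUP(axpy_partitioned, dim3(nblocks,1,1), threads_per_block, 0,
 *                 sm_mapping_start, sm_mapping_end, stream);
 *   axpy_partitioned<<<width, threads_per_block, 0, stream>>>(n, a, x, y, __P_HKARGS);
 */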