gpu_nbody.cu 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. #include <stdio.h>
  2. #include <stdint.h>
  3. #include <math.h>
  4. #include <starpu.h>
  5. struct Params
  6. {
  7. unsigned taskx;
  8. unsigned epsilon;
  9. };
  10. __global__ void gpuNbodyKernel(double *P, double *subA, double *M,
  11. uint32_t nxP, uint32_t nxA, uint32_t nxM,
  12. uint32_t ldP, uint32_t ldA,
  13. struct Params params)
  14. {
  15. uint32_t id, i, j, k;
  16. double dx, dy, modul;
  17. id = blockIdx.x * blockDim.x + threadIdx.x;
  18. i = id % nxA;
  19. j = id / nxA;
  20. if (j >= 1){
  21. return;
  22. }
  23. double sumaccx;
  24. double sumaccy;
  25. for (k = 0; k < nxP; k++){
  26. if (k != id + nxA*params.taskx){
  27. dx = P[k] - P[id + nxA*params.taskx];
  28. dy = P[k + ldP] - P[id + nxA*params.taskx + ldP];
  29. modul = dx * dx + dy * dy;
  30. sumaccx = 6.674e-11 * M[k] * dx / pow(modul + params.epsilon, 3);
  31. sumaccy = 6.674e-11 * M[k] * dy / pow(modul + params.epsilon, 3);
  32. }
  33. }
  34. subA[i] = sumaccx;
  35. subA[i + ldA] = sumaccy;
  36. // P[id + nxA * params.taskx] = subA[i];
  37. // subA[i] = 0;
  38. // subA[i + ldA] = 1;
  39. }
  40. #define THREADS_PER_BLOCK 64
  41. extern "C" void gpu_nbody(void * descr[], void * args)
  42. {
  43. double *d_P, *d_subA, *d_M;
  44. uint32_t nxP, nxA, nxM;
  45. uint32_t ldA, ldP;
  46. uint32_t nblocks;
  47. struct Params *params = (struct Params *) args;
  48. d_P = (double *) STARPU_MATRIX_GET_PTR(descr[0]);
  49. d_subA = (double *) STARPU_MATRIX_GET_PTR(descr[1]);
  50. d_M = (double *) STARPU_MATRIX_GET_PTR(descr[2]);
  51. nxP = STARPU_MATRIX_GET_NX(descr[0]);
  52. nxA = STARPU_MATRIX_GET_NX(descr[1]);
  53. nxM = STARPU_MATRIX_GET_NX(descr[2]);
  54. ldP = STARPU_MATRIX_GET_LD(descr[0]);
  55. ldA = STARPU_MATRIX_GET_LD(descr[1]);
  56. nblocks = (nxA + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
  57. gpuNbodyKernel
  58. <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
  59. >>> (d_P, d_subA, d_M, nxP, nxA, nxM, ldP, ldA, *params);
  60. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  61. }
  62. __global__ void gpuNbody2Kernel(double *d_subP, double *d_subV, double *d_subA,
  63. uint32_t nxP, uint32_t nxV, uint32_t nxA,
  64. uint32_t ldP, uint32_t ldV, uint32_t ldA,
  65. struct Params params)
  66. {
  67. uint32_t id, i, j;
  68. id = blockIdx.x * blockDim.x + threadIdx.x;
  69. i = id % nxP;
  70. j = id / nxP;
  71. if (j >= 1){
  72. return;
  73. }
  74. d_subV[i] = d_subV[i] + 3600*d_subA[i];
  75. d_subV[i + ldV] = d_subV[i + ldV] + 3600*d_subA[i + ldA];
  76. d_subP[i] = d_subP[i] + 3600*d_subV[i];
  77. d_subP[i + ldP] = d_subP[i + ldP] + 3600*d_subV[i + ldV];
  78. }
  79. extern "C" void gpu_nbody2(void * descr[], void *args)
  80. {
  81. double *d_subP, *d_subV, *d_subA;
  82. uint32_t nxP, nxV, nxA;
  83. uint32_t ldP, ldV, ldA;
  84. uint32_t nblocks;
  85. struct Params *params = (struct Params *) args;
  86. d_subP = (double *) STARPU_MATRIX_GET_PTR(descr[0]);
  87. d_subV = (double *) STARPU_MATRIX_GET_PTR(descr[1]);
  88. d_subA = (double *) STARPU_MATRIX_GET_PTR(descr[2]);
  89. nxP = STARPU_MATRIX_GET_NX(descr[0]);
  90. nxV = STARPU_MATRIX_GET_NX(descr[1]);
  91. nxA = STARPU_MATRIX_GET_NX(descr[2]);
  92. ldP = STARPU_MATRIX_GET_LD(descr[0]);
  93. ldV = STARPU_MATRIX_GET_LD(descr[1]);
  94. ldA = STARPU_MATRIX_GET_LD(descr[2]);
  95. nblocks = (nxA + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
  96. gpuNbody2Kernel
  97. <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
  98. >>> (d_subP, d_subV, d_subA, nxP, nxV, nxA, ldP, ldV, ldA, *params);
  99. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  100. }