nf_mm_task_build.f90 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. ! StarPU --- Runtime system for heterogeneous multicore architectures.
  2. !
  3. ! Copyright (C) 2016-2020 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. !
  5. ! StarPU is free software; you can redistribute it and/or modify
  6. ! it under the terms of the GNU Lesser General Public License as published by
  7. ! the Free Software Foundation; either version 2.1 of the License, or (at
  8. ! your option) any later version.
  9. !
  10. ! StarPU is distributed in the hope that it will be useful, but
  11. ! WITHOUT ANY WARRANTY; without even the implied warranty of
  12. ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. !
  14. ! See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. !
  16. program nf_mm
  17. use iso_c_binding ! C interfacing module
  18. use fstarpu_mod ! StarPU interfacing module
  19. use fstarpu_mpi_mod ! StarPU-MPI interfacing module
  20. use nf_mm_cl
  21. implicit none
  22. logical, parameter :: verbose = .false.
  23. integer(c_int) :: comm_size, comm_rank
  24. integer(c_int), target :: comm_world
  25. integer(c_int) :: N = 16, BS = 4, NB
  26. real(kind=c_double),allocatable,target :: A(:,:), B(:,:), C(:,:)
  27. type(c_ptr),allocatable :: dh_A(:), dh_B(:), dh_C(:,:)
  28. type(c_ptr) :: cl_mm
  29. type(c_ptr) :: task
  30. integer(c_int) :: ncpu
  31. integer(c_int) :: ret
  32. integer(c_int) :: row, col
  33. integer(c_int) :: b_row, b_col
  34. integer(c_int) :: mr, rank
  35. integer(c_int64_t) :: tag
  36. ret = fstarpu_init(C_NULL_PTR)
  37. if (ret == -19) then
  38. stop 77
  39. else if (ret /= 0) then
  40. stop 1
  41. end if
  42. ret = fstarpu_mpi_init(1)
  43. print *,"fstarpu_mpi_init status:", ret
  44. if (ret /= 0) then
  45. stop 1
  46. end if
  47. ! stop there if no CPU worker available
  48. ncpu = fstarpu_cpu_worker_get_count()
  49. if (ncpu == 0) then
  50. call fstarpu_shutdown()
  51. stop 77
  52. end if
  53. comm_world = fstarpu_mpi_world_comm()
  54. comm_size = fstarpu_mpi_world_size()
  55. comm_rank = fstarpu_mpi_world_rank()
  56. if (comm_size < 2) then
  57. call fstarpu_shutdown()
  58. ret = fstarpu_mpi_shutdown()
  59. stop 77
  60. end if
  61. ! TODO: process app's argc/argv
  62. NB = N/BS
  63. ! allocate and initialize codelet
  64. cl_mm = fstarpu_codelet_allocate()
  65. call fstarpu_codelet_set_name(cl_mm, c_char_"nf_mm_cl"//c_null_char)
  66. call fstarpu_codelet_add_cpu_func(cl_mm, C_FUNLOC(cl_cpu_mult))
  67. call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
  68. call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
  69. call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_RW)
  70. ! allocate matrices
  71. if (comm_rank == 0) then
  72. allocate(A(N,N))
  73. allocate(B(N,N))
  74. allocate(C(N,N))
  75. end if
  76. ! init matrices
  77. if (comm_rank == 0) then
  78. do col=1,N
  79. do row=1,N
  80. if (row == col) then
  81. A(row,col) = 2
  82. else
  83. A(row,col) = 0
  84. end if
  85. B(row,col) = row*N+col
  86. C(row,col) = 0
  87. end do
  88. end do
  89. if (verbose) then
  90. print *,"A"
  91. call mat_disp(A)
  92. print *,"B"
  93. call mat_disp(B)
  94. print *,"C"
  95. call mat_disp(C)
  96. end if
  97. end if
  98. ! allocate data handles
  99. allocate(dh_A(NB))
  100. allocate(dh_B(NB))
  101. allocate(dh_C(NB,NB))
  102. ! register matrices
  103. if (comm_rank == 0) then
  104. mr = 0 ! TODO: use STARPU_MAIN_RAM constant
  105. else
  106. mr = -1
  107. end if
  108. tag = 0
  109. do b_row=1,NB
  110. if (comm_rank == 0) then
  111. call fstarpu_matrix_data_register(dh_A(b_row), mr, &
  112. c_loc( A(1+(b_row-1)*BS,1) ), N, BS, N, c_sizeof(A(1,1)))
  113. else
  114. call fstarpu_matrix_data_register(dh_A(b_row), mr, &
  115. c_null_ptr, N, BS, N, c_sizeof(A(1,1)))
  116. end if
  117. call fstarpu_mpi_data_register(dh_A(b_row), tag, 0)
  118. tag = tag+1
  119. end do
  120. do b_col=1,NB
  121. if (comm_rank == 0) then
  122. call fstarpu_matrix_data_register(dh_B(b_col), mr, &
  123. c_loc( B(1,1+(b_col-1)*BS) ), N, N, BS, c_sizeof(B(1,1)))
  124. else
  125. call fstarpu_matrix_data_register(dh_B(b_col), mr, &
  126. c_null_ptr, N, N, BS, c_sizeof(B(1,1)))
  127. end if
  128. call fstarpu_mpi_data_register(dh_B(b_col), tag, 0)
  129. tag = tag+1
  130. end do
  131. do b_col=1,NB
  132. do b_row=1,NB
  133. if (comm_rank == 0) then
  134. call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
  135. c_loc( C(1+(b_row-1)*BS,1+(b_col-1)*BS) ), N, BS, BS, c_sizeof(C(1,1)))
  136. else
  137. call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
  138. c_null_ptr, N, BS, BS, c_sizeof(C(1,1)))
  139. end if
  140. call fstarpu_mpi_data_register(dh_C(b_row,b_col), tag, 0)
  141. tag = tag+1
  142. end do
  143. end do
  144. ! distribute matrix C
  145. do b_col=1,NB
  146. do b_row=1,NB
  147. rank = modulo(b_row+b_col, comm_size)
  148. call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), rank)
  149. end do
  150. end do
  151. do b_col=1,NB
  152. do b_row=1,NB
  153. task = fstarpu_mpi_task_build((/ c_loc(comm_world), cl_mm, &
  154. FSTARPU_R, dh_A(b_row), &
  155. FSTARPU_R, dh_B(b_col), &
  156. FSTARPU_RW, dh_C(b_row,b_col), &
  157. C_NULL_PTR /))
  158. if (c_associated(task)) then
  159. ret = fstarpu_task_submit(task)
  160. endif
  161. call fstarpu_mpi_task_post_build((/ c_loc(comm_world), cl_mm, &
  162. FSTARPU_R, dh_A(b_row), &
  163. FSTARPU_R, dh_B(b_col), &
  164. FSTARPU_RW, dh_C(b_row,b_col), &
  165. C_NULL_PTR /))
  166. end do
  167. end do
  168. call fstarpu_task_wait_for_all()
  169. ! undistribute matrix C
  170. do b_col=1,NB
  171. do b_row=1,NB
  172. call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), 0)
  173. end do
  174. end do
  175. ! unregister matrices
  176. do b_row=1,NB
  177. call fstarpu_data_unregister(dh_A(b_row))
  178. end do
  179. do b_col=1,NB
  180. call fstarpu_data_unregister(dh_B(b_col))
  181. end do
  182. do b_col=1,NB
  183. do b_row=1,NB
  184. call fstarpu_data_unregister(dh_C(b_row,b_col))
  185. end do
  186. end do
  187. ! check result
  188. if (comm_rank == 0) then
  189. if (verbose) then
  190. print *,"final C"
  191. call mat_disp(C)
  192. end if
  193. do col=1,N
  194. do row=1,N
  195. if (abs(C(row,col) - 2*(row*N+col)) > 1.0) then
  196. print *, "check failed"
  197. stop 1
  198. end if
  199. end do
  200. end do
  201. end if
  202. ! free handles
  203. deallocate(dh_A)
  204. deallocate(dh_B)
  205. deallocate(dh_C)
  206. ! free matrices
  207. if (comm_rank == 0) then
  208. deallocate(A)
  209. deallocate(B)
  210. deallocate(C)
  211. end if
  212. call fstarpu_codelet_free(cl_mm)
  213. call fstarpu_shutdown()
  214. ret = fstarpu_mpi_shutdown()
  215. print *,"fstarpu_mpi_shutdown status:", ret
  216. if (ret /= 0) then
  217. stop 1
  218. end if
  219. end program nf_mm