nf_mm_task_build.f90 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. ! StarPU --- Runtime system for heterogeneous multicore architectures.
  2. !
  3. ! Copyright (C) 2017, 2019 CNRS
  4. ! Copyright (C) 2016 Inria
  5. ! Copyright (C) 2016 Université de Bordeaux
  6. !
  7. ! StarPU is free software; you can redistribute it and/or modify
  8. ! it under the terms of the GNU Lesser General Public License as published by
  9. ! the Free Software Foundation; either version 2.1 of the License, or (at
  10. ! your option) any later version.
  11. !
  12. ! StarPU is distributed in the hope that it will be useful, but
  13. ! WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. !
  16. ! See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. !
  18. program nf_mm
  19. use iso_c_binding ! C interfacing module
  20. use fstarpu_mod ! StarPU interfacing module
  21. use fstarpu_mpi_mod ! StarPU-MPI interfacing module
  22. use nf_mm_cl
  23. implicit none
  24. logical, parameter :: verbose = .false.
  25. integer(c_int) :: comm_size, comm_rank
  26. integer(c_int), target :: comm_world
  27. integer(c_int) :: N = 16, BS = 4, NB
  28. real(kind=c_double),allocatable,target :: A(:,:), B(:,:), C(:,:)
  29. type(c_ptr),allocatable :: dh_A(:), dh_B(:), dh_C(:,:)
  30. type(c_ptr) :: cl_mm
  31. type(c_ptr) :: task
  32. integer(c_int) :: ncpu
  33. integer(c_int) :: ret
  34. integer(c_int) :: row, col
  35. integer(c_int) :: b_row, b_col
  36. integer(c_int) :: mr, tag, rank
  37. ret = fstarpu_init(C_NULL_PTR)
  38. if (ret == -19) then
  39. stop 77
  40. else if (ret /= 0) then
  41. stop 1
  42. end if
  43. ret = fstarpu_mpi_init(1)
  44. print *,"fstarpu_mpi_init status:", ret
  45. if (ret /= 0) then
  46. stop 1
  47. end if
  48. ! stop there if no CPU worker available
  49. ncpu = fstarpu_cpu_worker_get_count()
  50. if (ncpu == 0) then
  51. call fstarpu_shutdown()
  52. stop 77
  53. end if
  54. comm_world = fstarpu_mpi_world_comm()
  55. comm_size = fstarpu_mpi_world_size()
  56. comm_rank = fstarpu_mpi_world_rank()
  57. if (comm_size < 2) then
  58. call fstarpu_shutdown()
  59. ret = fstarpu_mpi_shutdown()
  60. stop 77
  61. end if
  62. ! TODO: process app's argc/argv
  63. NB = N/BS
  64. ! allocate and initialize codelet
  65. cl_mm = fstarpu_codelet_allocate()
  66. call fstarpu_codelet_set_name(cl_mm, c_char_"nf_mm_cl"//c_null_char)
  67. call fstarpu_codelet_add_cpu_func(cl_mm, C_FUNLOC(cl_cpu_mult))
  68. call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
  69. call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
  70. call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_RW)
  71. ! allocate matrices
  72. if (comm_rank == 0) then
  73. allocate(A(N,N))
  74. allocate(B(N,N))
  75. allocate(C(N,N))
  76. end if
  77. ! init matrices
  78. if (comm_rank == 0) then
  79. do col=1,N
  80. do row=1,N
  81. if (row == col) then
  82. A(row,col) = 2
  83. else
  84. A(row,col) = 0
  85. end if
  86. B(row,col) = row*N+col
  87. C(row,col) = 0
  88. end do
  89. end do
  90. if (verbose) then
  91. print *,"A"
  92. call mat_disp(A)
  93. print *,"B"
  94. call mat_disp(B)
  95. print *,"C"
  96. call mat_disp(C)
  97. end if
  98. end if
  99. ! allocate data handles
  100. allocate(dh_A(NB))
  101. allocate(dh_B(NB))
  102. allocate(dh_C(NB,NB))
  103. ! register matrices
  104. if (comm_rank == 0) then
  105. mr = 0 ! TODO: use STARPU_MAIN_RAM constant
  106. else
  107. mr = -1
  108. end if
  109. tag = 0
  110. do b_row=1,NB
  111. if (comm_rank == 0) then
  112. call fstarpu_matrix_data_register(dh_A(b_row), mr, &
  113. c_loc( A(1+(b_row-1)*BS,1) ), N, BS, N, c_sizeof(A(1,1)))
  114. else
  115. call fstarpu_matrix_data_register(dh_A(b_row), mr, &
  116. c_null_ptr, N, BS, N, c_sizeof(A(1,1)))
  117. end if
  118. call fstarpu_mpi_data_register(dh_A(b_row), tag, 0)
  119. tag = tag+1
  120. end do
  121. do b_col=1,NB
  122. if (comm_rank == 0) then
  123. call fstarpu_matrix_data_register(dh_B(b_col), mr, &
  124. c_loc( B(1,1+(b_col-1)*BS) ), N, N, BS, c_sizeof(B(1,1)))
  125. else
  126. call fstarpu_matrix_data_register(dh_B(b_col), mr, &
  127. c_null_ptr, N, N, BS, c_sizeof(B(1,1)))
  128. end if
  129. call fstarpu_mpi_data_register(dh_B(b_col), tag, 0)
  130. tag = tag+1
  131. end do
  132. do b_col=1,NB
  133. do b_row=1,NB
  134. if (comm_rank == 0) then
  135. call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
  136. c_loc( C(1+(b_row-1)*BS,1+(b_col-1)*BS) ), N, BS, BS, c_sizeof(C(1,1)))
  137. else
  138. call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
  139. c_null_ptr, N, BS, BS, c_sizeof(C(1,1)))
  140. end if
  141. call fstarpu_mpi_data_register(dh_C(b_row,b_col), tag, 0)
  142. tag = tag+1
  143. end do
  144. end do
  145. ! distribute matrix C
  146. do b_col=1,NB
  147. do b_row=1,NB
  148. rank = modulo(b_row+b_col, comm_size)
  149. call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), rank)
  150. end do
  151. end do
  152. do b_col=1,NB
  153. do b_row=1,NB
  154. task = fstarpu_mpi_task_build((/ c_loc(comm_world), cl_mm, &
  155. FSTARPU_R, dh_A(b_row), &
  156. FSTARPU_R, dh_B(b_col), &
  157. FSTARPU_RW, dh_C(b_row,b_col), &
  158. C_NULL_PTR /))
  159. if (c_associated(task)) then
  160. ret = fstarpu_task_submit(task)
  161. endif
  162. call fstarpu_mpi_task_post_build((/ c_loc(comm_world), cl_mm, &
  163. FSTARPU_R, dh_A(b_row), &
  164. FSTARPU_R, dh_B(b_col), &
  165. FSTARPU_RW, dh_C(b_row,b_col), &
  166. C_NULL_PTR /))
  167. end do
  168. end do
  169. call fstarpu_task_wait_for_all()
  170. ! undistribute matrix C
  171. do b_col=1,NB
  172. do b_row=1,NB
  173. call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), 0)
  174. end do
  175. end do
  176. ! unregister matrices
  177. do b_row=1,NB
  178. call fstarpu_data_unregister(dh_A(b_row))
  179. end do
  180. do b_col=1,NB
  181. call fstarpu_data_unregister(dh_B(b_col))
  182. end do
  183. do b_col=1,NB
  184. do b_row=1,NB
  185. call fstarpu_data_unregister(dh_C(b_row,b_col))
  186. end do
  187. end do
  188. ! check result
  189. if (comm_rank == 0) then
  190. if (verbose) then
  191. print *,"final C"
  192. call mat_disp(C)
  193. end if
  194. do col=1,N
  195. do row=1,N
  196. if (abs(C(row,col) - 2*(row*N+col)) > 1.0) then
  197. print *, "check failed"
  198. stop 1
  199. end if
  200. end do
  201. end do
  202. end if
  203. ! free handles
  204. deallocate(dh_A)
  205. deallocate(dh_B)
  206. deallocate(dh_C)
  207. ! free matrices
  208. if (comm_rank == 0) then
  209. deallocate(A)
  210. deallocate(B)
  211. deallocate(C)
  212. end if
  213. call fstarpu_codelet_free(cl_mm)
  214. call fstarpu_shutdown()
  215. ret = fstarpu_mpi_shutdown()
  216. print *,"fstarpu_mpi_shutdown status:", ret
  217. if (ret /= 0) then
  218. stop 1
  219. end if
  220. end program nf_mm