nf_mm.f90 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. ! StarPU --- Runtime system for heterogeneous multicore architectures.
  2. !
  3. ! Copyright (C) 2017, 2019 CNRS
  4. ! Copyright (C) 2016 Inria
  5. ! Copyright (C) 2016 Université de Bordeaux
  6. !
  7. ! StarPU is free software; you can redistribute it and/or modify
  8. ! it under the terms of the GNU Lesser General Public License as published by
  9. ! the Free Software Foundation; either version 2.1 of the License, or (at
  10. ! your option) any later version.
  11. !
  12. ! StarPU is distributed in the hope that it will be useful, but
  13. ! WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. !
  16. ! See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. !
  18. program nf_mm
  19. use iso_c_binding ! C interfacing module
  20. use fstarpu_mod ! StarPU interfacing module
  21. use fstarpu_mpi_mod ! StarPU-MPI interfacing module
  22. use nf_mm_cl
  23. implicit none
  24. logical, parameter :: verbose = .false.
  25. integer(c_int) :: comm_size, comm_rank
  26. integer(c_int), target :: comm_world
  27. integer(c_int) :: N = 16, BS = 4, NB
  28. real(kind=c_double),allocatable,target :: A(:,:), B(:,:), C(:,:)
  29. type(c_ptr),allocatable :: dh_A(:), dh_B(:), dh_C(:,:)
  30. type(c_ptr) :: cl_mm
  31. integer(c_int) :: ncpu
  32. integer(c_int) :: ret
  33. integer(c_int) :: row, col
  34. integer(c_int) :: b_row, b_col
  35. integer(c_int) :: mr, tag, rank
  36. ret = fstarpu_init(C_NULL_PTR)
  37. if (ret == -19) then
  38. stop 77
  39. else if (ret /= 0) then
  40. stop 1
  41. end if
  42. ret = fstarpu_mpi_init(1)
  43. print *,"fstarpu_mpi_init status:", ret
  44. if (ret /= 0) then
  45. stop 1
  46. end if
  47. ! stop there if no CPU worker available
  48. ncpu = fstarpu_cpu_worker_get_count()
  49. if (ncpu == 0) then
  50. call fstarpu_shutdown()
  51. stop 77
  52. end if
  53. comm_world = fstarpu_mpi_world_comm()
  54. comm_size = fstarpu_mpi_world_size()
  55. comm_rank = fstarpu_mpi_world_rank()
  56. if (comm_size < 2) then
  57. call fstarpu_shutdown()
  58. ret = fstarpu_mpi_shutdown()
  59. stop 77
  60. end if
  61. ! TODO: process app's argc/argv
  62. NB = N/BS
  63. ! allocate and initialize codelet
  64. cl_mm = fstarpu_codelet_allocate()
  65. call fstarpu_codelet_set_name(cl_mm, c_char_"nf_mm_cl"//c_null_char)
  66. call fstarpu_codelet_add_cpu_func(cl_mm, C_FUNLOC(cl_cpu_mult))
  67. call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
  68. call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
  69. call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_RW)
  70. ! allocate matrices
  71. if (comm_rank == 0) then
  72. allocate(A(N,N))
  73. allocate(B(N,N))
  74. allocate(C(N,N))
  75. end if
  76. ! init matrices
  77. if (comm_rank == 0) then
  78. do col=1,N
  79. do row=1,N
  80. if (row == col) then
  81. A(row,col) = 2
  82. else
  83. A(row,col) = 0
  84. end if
  85. B(row,col) = row*N+col
  86. C(row,col) = 0
  87. end do
  88. end do
  89. if (verbose) then
  90. print *,"A"
  91. call mat_disp(A)
  92. print *,"B"
  93. call mat_disp(B)
  94. print *,"C"
  95. call mat_disp(C)
  96. end if
  97. end if
  98. ! allocate data handles
  99. allocate(dh_A(NB))
  100. allocate(dh_B(NB))
  101. allocate(dh_C(NB,NB))
  102. ! register matrices
  103. if (comm_rank == 0) then
  104. mr = 0 ! TODO: use STARPU_MAIN_RAM constant
  105. else
  106. mr = -1
  107. end if
  108. tag = 0
  109. do b_row=1,NB
  110. if (comm_rank == 0) then
  111. call fstarpu_matrix_data_register(dh_A(b_row), mr, &
  112. c_loc( A(1+(b_row-1)*BS,1) ), N, BS, N, c_sizeof(A(1,1)))
  113. else
  114. call fstarpu_matrix_data_register(dh_A(b_row), mr, &
  115. c_null_ptr, N, BS, N, c_sizeof(A(1,1)))
  116. end if
  117. call fstarpu_mpi_data_register(dh_A(b_row), tag, 0)
  118. tag = tag+1
  119. end do
  120. do b_col=1,NB
  121. if (comm_rank == 0) then
  122. call fstarpu_matrix_data_register(dh_B(b_col), mr, &
  123. c_loc( B(1,1+(b_col-1)*BS) ), N, N, BS, c_sizeof(B(1,1)))
  124. else
  125. call fstarpu_matrix_data_register(dh_B(b_col), mr, &
  126. c_null_ptr, N, N, BS, c_sizeof(B(1,1)))
  127. end if
  128. call fstarpu_mpi_data_register(dh_B(b_col), tag, 0)
  129. tag = tag+1
  130. end do
  131. do b_col=1,NB
  132. do b_row=1,NB
  133. if (comm_rank == 0) then
  134. call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
  135. c_loc( C(1+(b_row-1)*BS,1+(b_col-1)*BS) ), N, BS, BS, c_sizeof(C(1,1)))
  136. else
  137. call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
  138. c_null_ptr, N, BS, BS, c_sizeof(C(1,1)))
  139. end if
  140. call fstarpu_mpi_data_register(dh_C(b_row,b_col), tag, 0)
  141. tag = tag+1
  142. end do
  143. end do
  144. ! distribute matrix C
  145. do b_col=1,NB
  146. do b_row=1,NB
  147. rank = modulo(b_row+b_col, comm_size)
  148. call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), rank)
  149. end do
  150. end do
  151. do b_col=1,NB
  152. do b_row=1,NB
  153. call fstarpu_mpi_task_insert((/ c_loc(comm_world), cl_mm, &
  154. FSTARPU_R, dh_A(b_row), &
  155. FSTARPU_R, dh_B(b_col), &
  156. FSTARPU_RW, dh_C(b_row,b_col), &
  157. C_NULL_PTR /))
  158. end do
  159. end do
  160. call fstarpu_task_wait_for_all()
  161. ! undistribute matrix C
  162. do b_col=1,NB
  163. do b_row=1,NB
  164. call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), 0)
  165. end do
  166. end do
  167. ! unregister matrices
  168. do b_row=1,NB
  169. call fstarpu_data_unregister(dh_A(b_row))
  170. end do
  171. do b_col=1,NB
  172. call fstarpu_data_unregister(dh_B(b_col))
  173. end do
  174. do b_col=1,NB
  175. do b_row=1,NB
  176. call fstarpu_data_unregister(dh_C(b_row,b_col))
  177. end do
  178. end do
  179. ! check result
  180. if (comm_rank == 0) then
  181. if (verbose) then
  182. print *,"final C"
  183. call mat_disp(C)
  184. end if
  185. do col=1,N
  186. do row=1,N
  187. if (abs(C(row,col) - 2*(row*N+col)) > 1.0) then
  188. print *, "check failed"
  189. stop 1
  190. end if
  191. end do
  192. end do
  193. end if
  194. ! free handles
  195. deallocate(dh_A)
  196. deallocate(dh_B)
  197. deallocate(dh_C)
  198. ! free matrices
  199. if (comm_rank == 0) then
  200. deallocate(A)
  201. deallocate(B)
  202. deallocate(C)
  203. end if
  204. call fstarpu_codelet_free(cl_mm)
  205. call fstarpu_shutdown()
  206. ret = fstarpu_mpi_shutdown()
  207. print *,"fstarpu_mpi_shutdown status:", ret
  208. if (ret /= 0) then
  209. stop 1
  210. end if
  211. end program nf_mm