nf_mm.f90 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  1. ! StarPU --- Runtime system for heterogeneous multicore architectures.
  2. !
  3. ! Copyright (C) 2016 Inria
  4. !
  5. ! StarPU is free software; you can redistribute it and/or modify
  6. ! it under the terms of the GNU Lesser General Public License as published by
  7. ! the Free Software Foundation; either version 2.1 of the License, or (at
  8. ! your option) any later version.
  9. !
  10. ! StarPU is distributed in the hope that it will be useful, but
  11. ! WITHOUT ANY WARRANTY; without even the implied warranty of
  12. ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. !
  14. ! See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. program nf_mm
  16. use iso_c_binding ! C interfacing module
  17. use fstarpu_mod ! StarPU interfacing module
  18. use fstarpu_mpi_mod ! StarPU-MPI interfacing module
  19. use nf_mm_cl
  20. implicit none
  21. logical, parameter :: verbose = .false.
  22. integer(c_int) :: comm_rank, comm_size, comm_world
  23. integer(c_int) :: N = 16, BS = 4, NB
  24. real(kind=c_double),allocatable,target :: A(:,:), B(:,:), C(:,:)
  25. type(c_ptr),allocatable :: dh_A(:), dh_B(:), dh_C(:,:)
  26. type(c_ptr) :: cl_mm
  27. integer(c_int) :: ncpu
  28. integer(c_int) :: ret
  29. integer(c_int) :: row, col
  30. integer(c_int) :: b_row, b_col
  31. integer(c_int) :: mr, tag, rank
  32. ret = fstarpu_mpi_init(1)
  33. print *,"fstarpu_mpi_init status:", ret
  34. if (ret /= 0) then
  35. stop 1
  36. end if
  37. ret = fstarpu_init(C_NULL_PTR)
  38. if (ret == -19) then
  39. stop 77
  40. else if (ret /= 0) then
  41. stop 1
  42. end if
  43. ! stop there if no CPU worker available
  44. ncpu = fstarpu_cpu_worker_get_count()
  45. if (ncpu == 0) then
  46. call fstarpu_shutdown()
  47. stop 77
  48. end if
  49. comm_world = fstarpu_mpi_world_comm()
  50. comm_size = fstarpu_mpi_world_size()
  51. comm_rank = fstarpu_mpi_world_rank()
  52. if (comm_size < 2) then
  53. call fstarpu_shutdown()
  54. ret = fstarpu_mpi_shutdown()
  55. stop 77
  56. end if
  57. ! TODO: process app's argc/argv
  58. NB = N/BS
  59. ! allocate and initialize codelet
  60. cl_mm = fstarpu_codelet_allocate()
  61. call fstarpu_codelet_set_name(cl_mm, c_char_"nf_mm_cl"//c_null_char)
  62. call fstarpu_codelet_add_cpu_func(cl_mm, C_FUNLOC(cl_cpu_mult))
  63. call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
  64. call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
  65. call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_RW)
  66. ! allocate matrices
  67. if (comm_rank == 0) then
  68. allocate(A(N,N))
  69. allocate(B(N,N))
  70. allocate(C(N,N))
  71. end if
  72. ! init matrices
  73. if (comm_rank == 0) then
  74. do col=1,N
  75. do row=1,N
  76. if (row == col) then
  77. A(row,col) = 2
  78. else
  79. A(row,col) = 0
  80. end if
  81. B(row,col) = row*N+col
  82. C(row,col) = 0
  83. end do
  84. end do
  85. if (verbose) then
  86. print *,"A"
  87. call mat_disp(A)
  88. print *,"B"
  89. call mat_disp(B)
  90. print *,"C"
  91. call mat_disp(C)
  92. end if
  93. end if
  94. ! allocate data handles
  95. allocate(dh_A(NB))
  96. allocate(dh_B(NB))
  97. allocate(dh_C(NB,NB))
  98. ! register matrices
  99. if (comm_rank == 0) then
  100. mr = 0 ! TODO: use STARPU_MAIN_RAM constant
  101. else
  102. mr = -1
  103. end if
  104. tag = 0
  105. do b_row=1,NB
  106. if (comm_rank == 0) then
  107. call fstarpu_matrix_data_register(dh_A(b_row), mr, &
  108. c_loc( A(1+(b_row-1)*BS,1) ), N, BS, N, c_sizeof(A(1,1)))
  109. else
  110. call fstarpu_matrix_data_register(dh_A(b_row), mr, &
  111. c_null_ptr, N, BS, N, c_sizeof(A(1,1)))
  112. end if
  113. call fstarpu_mpi_data_register(dh_A(b_row), tag, 0)
  114. tag = tag+1
  115. end do
  116. do b_col=1,NB
  117. if (comm_rank == 0) then
  118. call fstarpu_matrix_data_register(dh_B(b_col), mr, &
  119. c_loc( B(1,1+(b_col-1)*BS) ), N, N, BS, c_sizeof(B(1,1)))
  120. else
  121. call fstarpu_matrix_data_register(dh_B(b_col), mr, &
  122. c_null_ptr, N, N, BS, c_sizeof(B(1,1)))
  123. end if
  124. call fstarpu_mpi_data_register(dh_B(b_col), tag, 0)
  125. tag = tag+1
  126. end do
  127. do b_col=1,NB
  128. do b_row=1,NB
  129. if (comm_rank == 0) then
  130. call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
  131. c_loc( C(1+(b_row-1)*BS,1+(b_col-1)*BS) ), N, BS, BS, c_sizeof(C(1,1)))
  132. else
  133. call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
  134. c_null_ptr, N, BS, BS, c_sizeof(C(1,1)))
  135. end if
  136. call fstarpu_mpi_data_register(dh_C(b_row,b_col), tag, 0)
  137. tag = tag+1
  138. end do
  139. end do
  140. ! distribute matrix C
  141. do b_col=1,NB
  142. do b_row=1,NB
  143. rank = modulo(b_row+b_col, comm_size)
  144. call fstarpu_mpi_get_data_on_node(comm_world, dh_c(b_row,b_col), rank)
  145. call fstarpu_mpi_data_set_rank(dh_c(b_row,b_col), rank)
  146. end do
  147. end do
  148. do b_col=1,NB
  149. do b_row=1,NB
  150. ret = fstarpu_mpi_task_insert(comm_world, (/ cl_mm, &
  151. FSTARPU_R, dh_A(b_row), &
  152. FSTARPU_R, dh_B(b_col), &
  153. FSTARPU_RW, dh_C(b_row,b_col), &
  154. C_NULL_PTR /))
  155. end do
  156. end do
  157. call fstarpu_task_wait_for_all()
  158. ! undistribute matrix C
  159. do b_col=1,NB
  160. do b_row=1,NB
  161. call fstarpu_mpi_get_data_on_node(comm_world, dh_c(b_row,b_col), 0)
  162. call fstarpu_mpi_data_set_rank(dh_c(b_row,b_col), 0)
  163. end do
  164. end do
  165. ! unregister matrices
  166. do b_row=1,NB
  167. call fstarpu_data_unregister(dh_A(b_row))
  168. end do
  169. do b_col=1,NB
  170. call fstarpu_data_unregister(dh_B(b_col))
  171. end do
  172. do b_col=1,NB
  173. do b_row=1,NB
  174. call fstarpu_data_unregister(dh_C(b_row,b_col))
  175. end do
  176. end do
  177. ! check result
  178. if (comm_rank == 0) then
  179. if (verbose) then
  180. print *,"final C"
  181. call mat_disp(C)
  182. end if
  183. do col=1,N
  184. do row=1,N
  185. if (abs(C(row,col) - 2*(row*N+col)) > 1.0) then
  186. print *, "check failed"
  187. stop 1
  188. end if
  189. end do
  190. end do
  191. end if
  192. ! free handles
  193. deallocate(dh_A)
  194. deallocate(dh_B)
  195. deallocate(dh_C)
  196. ! free matrices
  197. if (comm_rank == 0) then
  198. deallocate(A)
  199. deallocate(B)
  200. deallocate(C)
  201. end if
  202. call fstarpu_codelet_free(cl_mm)
  203. call fstarpu_shutdown()
  204. ret = fstarpu_mpi_shutdown()
  205. print *,"fstarpu_mpi_shutdown status:", ret
  206. if (ret /= 0) then
  207. stop 1
  208. end if
  209. end program nf_mm