123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237 |
- ! StarPU --- Runtime system for heterogeneous multicore architectures.
- !
- ! Copyright (C) 2016 Inria
- !
- ! StarPU is free software; you can redistribute it and/or modify
- ! it under the terms of the GNU Lesser General Public License as published by
- ! the Free Software Foundation; either version 2.1 of the License, or (at
- ! your option) any later version.
- !
- ! StarPU is distributed in the hope that it will be useful, but
- ! WITHOUT ANY WARRANTY; without even the implied warranty of
- ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- !
- ! See the GNU Lesser General Public License in COPYING.LGPL for more details.
- program nf_mm
- use iso_c_binding ! C interfacing module
- use fstarpu_mod ! StarPU interfacing module
- use fstarpu_mpi_mod ! StarPU-MPI interfacing module
- use nf_mm_cl
- implicit none
- logical, parameter :: verbose = .false.
- integer(c_int) :: comm_rank, comm_size, comm_world
- integer(c_int) :: N = 16, BS = 4, NB
- real(kind=c_double),allocatable,target :: A(:,:), B(:,:), C(:,:)
- type(c_ptr),allocatable :: dh_A(:), dh_B(:), dh_C(:,:)
- type(c_ptr) :: cl_mm
- integer(c_int) :: ncpu
- integer(c_int) :: ret
- integer(c_int) :: row, col
- integer(c_int) :: b_row, b_col
- integer(c_int) :: mr, tag, rank
- ret = fstarpu_init(C_NULL_PTR)
- if (ret == -19) then
- stop 77
- else if (ret /= 0) then
- stop 1
- end if
- ret = fstarpu_mpi_init(1)
- print *,"fstarpu_mpi_init status:", ret
- if (ret /= 0) then
- stop 1
- end if
- ! stop there if no CPU worker available
- ncpu = fstarpu_cpu_worker_get_count()
- if (ncpu == 0) then
- call fstarpu_shutdown()
- stop 77
- end if
- comm_world = fstarpu_mpi_world_comm()
- comm_size = fstarpu_mpi_world_size()
- comm_rank = fstarpu_mpi_world_rank()
- if (comm_size < 2) then
- call fstarpu_shutdown()
- ret = fstarpu_mpi_shutdown()
- stop 77
- end if
- ! TODO: process app's argc/argv
- NB = N/BS
- ! allocate and initialize codelet
- cl_mm = fstarpu_codelet_allocate()
- call fstarpu_codelet_set_name(cl_mm, c_char_"nf_mm_cl"//c_null_char)
- call fstarpu_codelet_add_cpu_func(cl_mm, C_FUNLOC(cl_cpu_mult))
- call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
- call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
- call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_RW)
- ! allocate matrices
- if (comm_rank == 0) then
- allocate(A(N,N))
- allocate(B(N,N))
- allocate(C(N,N))
- end if
- ! init matrices
- if (comm_rank == 0) then
- do col=1,N
- do row=1,N
- if (row == col) then
- A(row,col) = 2
- else
- A(row,col) = 0
- end if
- B(row,col) = row*N+col
- C(row,col) = 0
- end do
- end do
- if (verbose) then
- print *,"A"
- call mat_disp(A)
- print *,"B"
- call mat_disp(B)
- print *,"C"
- call mat_disp(C)
- end if
- end if
- ! allocate data handles
- allocate(dh_A(NB))
- allocate(dh_B(NB))
- allocate(dh_C(NB,NB))
- ! register matrices
- if (comm_rank == 0) then
- mr = 0 ! TODO: use STARPU_MAIN_RAM constant
- else
- mr = -1
- end if
- tag = 0
- do b_row=1,NB
- if (comm_rank == 0) then
- call fstarpu_matrix_data_register(dh_A(b_row), mr, &
- c_loc( A(1+(b_row-1)*BS,1) ), N, BS, N, c_sizeof(A(1,1)))
- else
- call fstarpu_matrix_data_register(dh_A(b_row), mr, &
- c_null_ptr, N, BS, N, c_sizeof(A(1,1)))
- end if
- call fstarpu_mpi_data_register(dh_A(b_row), tag, 0)
- tag = tag+1
- end do
- do b_col=1,NB
- if (comm_rank == 0) then
- call fstarpu_matrix_data_register(dh_B(b_col), mr, &
- c_loc( B(1,1+(b_col-1)*BS) ), N, N, BS, c_sizeof(B(1,1)))
- else
- call fstarpu_matrix_data_register(dh_B(b_col), mr, &
- c_null_ptr, N, N, BS, c_sizeof(B(1,1)))
- end if
- call fstarpu_mpi_data_register(dh_B(b_col), tag, 0)
- tag = tag+1
- end do
- do b_col=1,NB
- do b_row=1,NB
- if (comm_rank == 0) then
- call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
- c_loc( C(1+(b_row-1)*BS,1+(b_col-1)*BS) ), N, BS, BS, c_sizeof(C(1,1)))
- else
- call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
- c_null_ptr, N, BS, BS, c_sizeof(C(1,1)))
- end if
- call fstarpu_mpi_data_register(dh_C(b_row,b_col), tag, 0)
- tag = tag+1
- end do
- end do
- ! distribute matrix C
- do b_col=1,NB
- do b_row=1,NB
- rank = modulo(b_row+b_col, comm_size)
- call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), rank)
- end do
- end do
- do b_col=1,NB
- do b_row=1,NB
- ret = fstarpu_mpi_task_insert(comm_world, (/ cl_mm, &
- FSTARPU_R, dh_A(b_row), &
- FSTARPU_R, dh_B(b_col), &
- FSTARPU_RW, dh_C(b_row,b_col), &
- C_NULL_PTR /))
- end do
- end do
- call fstarpu_task_wait_for_all()
- ! undistribute matrix C
- do b_col=1,NB
- do b_row=1,NB
- call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), 0)
- end do
- end do
- ! unregister matrices
- do b_row=1,NB
- call fstarpu_data_unregister(dh_A(b_row))
- end do
- do b_col=1,NB
- call fstarpu_data_unregister(dh_B(b_col))
- end do
- do b_col=1,NB
- do b_row=1,NB
- call fstarpu_data_unregister(dh_C(b_row,b_col))
- end do
- end do
- ! check result
- if (comm_rank == 0) then
- if (verbose) then
- print *,"final C"
- call mat_disp(C)
- end if
- do col=1,N
- do row=1,N
- if (abs(C(row,col) - 2*(row*N+col)) > 1.0) then
- print *, "check failed"
- stop 1
- end if
- end do
- end do
- end if
- ! free handles
- deallocate(dh_A)
- deallocate(dh_B)
- deallocate(dh_C)
- ! free matrices
- if (comm_rank == 0) then
- deallocate(A)
- deallocate(B)
- deallocate(C)
- end if
- call fstarpu_codelet_free(cl_mm)
- call fstarpu_shutdown()
- ret = fstarpu_mpi_shutdown()
- print *,"fstarpu_mpi_shutdown status:", ret
- if (ret /= 0) then
- stop 1
- end if
- end program nf_mm
|