Browse Source

- complete native Fortran version of distributed matrix multiply example

Olivier Aumage 9 years ago
parent
commit
cce3d970fc
2 changed files with 250 additions and 0 deletions
  1. 185 0
      mpi/examples/native_fortran/nf_mm.f90
  2. 65 0
      mpi/examples/native_fortran/nf_mm_cl.f90

+ 185 - 0
mpi/examples/native_fortran/nf_mm.f90

@@ -20,8 +20,17 @@ program nf_mm
         use nf_mm_cl
         implicit none
 
+        logical, parameter :: verbose = .false.
+        integer(c_int) :: comm_rank, comm_size, comm_world
+        integer(c_int) :: N = 16, BS = 4, NB
+        real(kind=c_double),allocatable,target :: A(:,:), B(:,:), C(:,:)
+        type(c_ptr),allocatable :: dh_A(:), dh_B(:), dh_C(:,:)
+        type(c_ptr) :: cl_mm
         integer(c_int) :: ncpu
         integer(c_int) :: ret
+        integer(c_int) :: row, col
+        integer(c_int) :: b_row, b_col
+        integer(c_int) :: mr, tag, rank
 
         ret = fstarpu_mpi_init(1)
         print *,"fstarpu_mpi_init status:", ret
@@ -43,6 +52,182 @@ program nf_mm
                 stop 77
         end if
 
+        comm_world = fstarpu_mpi_world_comm()
+        comm_size = fstarpu_mpi_world_size()
+        comm_rank = fstarpu_mpi_world_rank()
+
+        if (comm_size < 2) then
+                call fstarpu_shutdown()
+                ret = fstarpu_mpi_shutdown()
+                stop 77
+        end if
+
+        ! TODO: process app's argc/argv
+        NB = N/BS
+
+        ! allocate and initialize codelet
+        cl_mm = fstarpu_codelet_allocate()
+        call fstarpu_codelet_set_name(cl_mm, c_char_"nf_mm_cl"//c_null_char)
+        call fstarpu_codelet_add_cpu_func(cl_mm, C_FUNLOC(cl_cpu_mult))
+        call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
+        call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
+        call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_RW)
+
+        ! allocate matrices
+        if (comm_rank == 0) then
+                allocate(A(N,N))
+                allocate(B(N,N))
+                allocate(C(N,N))
+        end if
+
+        ! init matrices
+        if (comm_rank == 0) then
+                do col=1,N
+                do row=1,N
+                if (row == col) then
+                        A(row,col) = 2
+                else
+                        A(row,col) = 0
+                end if
+                B(row,col) = row*N+col
+                C(row,col) = 0
+                end do
+                end do
+
+                if (verbose) then
+                        print *,"A"
+                        call mat_disp(A)
+                        print *,"B"
+                        call mat_disp(B)
+                        print *,"C"
+                        call mat_disp(C)
+                end if
+        end if
+
+        ! allocate data handles
+        allocate(dh_A(NB))
+        allocate(dh_B(NB))
+        allocate(dh_C(NB,NB))
+
+        ! register matrices
+        if (comm_rank == 0) then
+                mr = 0 ! TODO: use STARPU_MAIN_RAM constant
+        else
+                mr = -1
+        end if
+        tag = 0
+
+        do b_row=1,NB
+                if (comm_rank == 0) then
+                        call fstarpu_matrix_data_register(dh_A(b_row), mr, &
+                                c_loc( A(1+(b_row-1)*BS,1) ), N, BS, N, c_sizeof(A(1,1)))
+                else
+                        call fstarpu_matrix_data_register(dh_A(b_row), mr, &
+                                c_null_ptr, N, BS, N, c_sizeof(A(1,1)))
+                end if
+                call fstarpu_mpi_data_register(dh_A(b_row), tag, 0)
+                tag = tag+1
+        end do
+
+        do b_col=1,NB
+                if (comm_rank == 0) then
+                        call fstarpu_matrix_data_register(dh_B(b_col), mr, &
+                                c_loc( B(1,1+(b_col-1)*BS) ), N, N, BS, c_sizeof(B(1,1)))
+                else
+                        call fstarpu_matrix_data_register(dh_B(b_col), mr, &
+                                c_null_ptr, N, N, BS, c_sizeof(B(1,1)))
+                end if
+                call fstarpu_mpi_data_register(dh_B(b_col), tag, 0)
+                tag = tag+1
+        end do
+
+        do b_col=1,NB
+        do b_row=1,NB
+                if (comm_rank == 0) then
+                        call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
+                                c_loc( C(1+(b_row-1)*BS,1+(b_col-1)*BS) ), N, BS, BS, c_sizeof(C(1,1)))
+                else
+                        call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
+                                c_null_ptr, N, BS, BS, c_sizeof(C(1,1)))
+                end if
+                call fstarpu_mpi_data_register(dh_C(b_row,b_col), tag, 0)
+                tag = tag+1
+        end do
+        end do
+
+        ! distribute matrix C
+        do b_col=1,NB
+        do b_row=1,NB
+        rank = modulo(b_row+b_col, comm_size)
+        call fstarpu_mpi_get_data_on_node(comm_world, dh_c(b_row,b_col), rank)
+        call fstarpu_mpi_data_set_rank(dh_c(b_row,b_col), rank)
+        end do
+        end do
+
+        do b_col=1,NB
+        do b_row=1,NB
+                ret = fstarpu_mpi_task_insert(comm_world, (/ cl_mm, &
+                        FSTARPU_R,  dh_A(b_row), &
+                        FSTARPU_R,  dh_B(b_col), &
+                        FSTARPU_RW, dh_C(b_row,b_col), &
+                        C_NULL_PTR /))
+        end do
+        end do
+
+        call fstarpu_task_wait_for_all()
+
+        ! undistribute matrix C
+        do b_col=1,NB
+        do b_row=1,NB
+        call fstarpu_mpi_get_data_on_node(comm_world, dh_c(b_row,b_col), 0)
+        call fstarpu_mpi_data_set_rank(dh_c(b_row,b_col), 0)
+        end do
+        end do
+
+        ! unregister matrices
+        do b_row=1,NB
+                call fstarpu_data_unregister(dh_A(b_row))
+        end do
+
+        do b_col=1,NB
+                call fstarpu_data_unregister(dh_B(b_col))
+        end do
+
+        do b_col=1,NB
+        do b_row=1,NB
+                call fstarpu_data_unregister(dh_C(b_row,b_col))
+        end do
+        end do
+
+        ! check result
+        if (comm_rank == 0) then
+                if (verbose) then
+                        print *,"final C"
+                        call mat_disp(C)
+                end if
+
+                do col=1,N
+                do row=1,N
+                if (abs(C(row,col) - 2*(row*N+col)) > 1.0) then
+                        print *, "check failed"
+                        stop 1
+                end if
+                end do
+                end do
+        end if
+
+        ! free handles
+        deallocate(dh_A)
+        deallocate(dh_B)
+        deallocate(dh_C)
+
+        ! free matrices
+        if (comm_rank == 0) then
+                deallocate(A)
+                deallocate(B)
+                deallocate(C)
+        end if
+        call fstarpu_codelet_free(cl_mm)
         call fstarpu_shutdown()
 
         ret = fstarpu_mpi_shutdown()

+ 65 - 0
mpi/examples/native_fortran/nf_mm_cl.f90

@@ -15,11 +15,76 @@
 
 module nf_mm_cl
 contains
+subroutine mat_disp (m)
+        ! declared here so it can be used both for the
+        ! program and for debugging codelet routines
+
+        use iso_c_binding       ! C interfacing module
+        implicit none
+        real(kind=c_double) :: m(:,:)
+        integer i,j
+
+        do i=lbound(m,1),ubound(m,1)
+                write(*, fmt="(A2) ",advance="no") "| "
+        do j=lbound(m,2),ubound(m,2)
+                write(*, fmt="(F6.1,A1) ", advance="no") m(i,j)," "
+        end do
+                write(*,*) "|"
+        end do
+        write(*,*)
+
+end subroutine
+
 recursive subroutine cl_cpu_mult (buffers, cl_args) bind(C)
         use iso_c_binding       ! C interfacing module
         use fstarpu_mod         ! StarPU interfacing module
         implicit none
 
         type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
+        real(kind=c_double),pointer :: A(:,:), B(:,:), C(:,:)
+        integer :: ld_A,nx_A,ny_A
+        integer :: ld_B,nx_B,ny_B
+        integer :: ld_C,nx_C,ny_C
+        integer :: i,j,k
+
+        ld_A = fstarpu_matrix_get_ld(buffers, 0)
+        ld_B = fstarpu_matrix_get_ld(buffers, 1)
+        ld_C = fstarpu_matrix_get_ld(buffers, 2)
+
+        nx_A = fstarpu_matrix_get_nx(buffers, 0)
+        nx_B = fstarpu_matrix_get_nx(buffers, 1)
+        nx_C = fstarpu_matrix_get_nx(buffers, 2)
+
+        ny_A = fstarpu_matrix_get_ny(buffers, 0)
+        ny_B = fstarpu_matrix_get_ny(buffers, 1)
+        ny_C = fstarpu_matrix_get_ny(buffers, 2)
+
+        if (ny_C /= ny_B) then
+                write(*,*) "C -- B column mismatch"
+                stop 1
+        end if
+
+        if (nx_C /= nx_A) then
+                write(*,*) "C -- A row mismatch"
+                stop 1
+        end if
+
+        if (ny_A /= nx_B) then
+                write(*,*) "A -- B col/row mismatch"
+                stop 1
+        end if
+
+        call c_f_pointer(fstarpu_matrix_get_ptr(buffers, 0), A, shape=[ld_A,ny_A])
+        call c_f_pointer(fstarpu_matrix_get_ptr(buffers, 1), B, shape=[ld_B,ny_B])
+        call c_f_pointer(fstarpu_matrix_get_ptr(buffers, 2), C, shape=[ld_C,ny_C])
+
+        do k = 1, ny_C
+        do j = 1, nx_C
+        do i = 1, nx_B
+                C(j,k) = C(j,k) + A(j,i) * B(i,k)
+        end do
+        end do
+        end do
+
 end subroutine cl_cpu_mult
 end module nf_mm_cl