| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240 | ! StarPU --- Runtime system for heterogeneous multicore architectures.!! Copyright (C) 2017, 2019                               CNRS! Copyright (C) 2016                                     Inria! Copyright (C) 2016                                     Université de Bordeaux!! StarPU is free software; you can redistribute it and/or modify! it under the terms of the GNU Lesser General Public License as published by! the Free Software Foundation; either version 2.1 of the License, or (at! your option) any later version.!! StarPU is distributed in the hope that it will be useful, but! WITHOUT ANY WARRANTY; without even the implied warranty of! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.!! See the GNU Lesser General Public License in COPYING.LGPL for more details.!program nf_mm        use iso_c_binding       ! C interfacing module        use fstarpu_mod         ! StarPU interfacing module        use fstarpu_mpi_mod     ! StarPU-MPI interfacing module        use nf_mm_cl        implicit none        logical, parameter :: verbose = .false.        integer(c_int) :: comm_size, comm_rank        integer(c_int), target :: comm_world        integer(c_int) :: N = 16, BS = 4, NB        real(kind=c_double),allocatable,target :: A(:,:), B(:,:), C(:,:)        type(c_ptr),allocatable :: dh_A(:), dh_B(:), dh_C(:,:)        type(c_ptr) :: cl_mm        integer(c_int) :: ncpu        integer(c_int) :: ret        integer(c_int) :: row, col        integer(c_int) :: b_row, b_col        integer(c_int) :: mr, tag, rank        ret = fstarpu_init(C_NULL_PTR)        if (ret == -19) then                stop 77        else if (ret /= 0) then                stop 1        end if        ret = fstarpu_mpi_init(1)        print *,"fstarpu_mpi_init status:", ret        if (ret /= 0) then                stop 1        end if        ! stop there if no CPU worker available        ncpu = fstarpu_cpu_worker_get_count()        if (ncpu == 0) then                call fstarpu_shutdown()                stop 77        end if        comm_world = fstarpu_mpi_world_comm()        comm_size = fstarpu_mpi_world_size()        comm_rank = fstarpu_mpi_world_rank()        if (comm_size < 2) then                call fstarpu_shutdown()                ret = fstarpu_mpi_shutdown()                stop 77        end if        ! TODO: process app's argc/argv        NB = N/BS        ! allocate and initialize codelet        cl_mm = fstarpu_codelet_allocate()        call fstarpu_codelet_set_name(cl_mm, c_char_"nf_mm_cl"//c_null_char)        call fstarpu_codelet_add_cpu_func(cl_mm, C_FUNLOC(cl_cpu_mult))        call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)        call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)        call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_RW)        ! allocate matrices        if (comm_rank == 0) then                allocate(A(N,N))                allocate(B(N,N))                allocate(C(N,N))        end if        ! init matrices        if (comm_rank == 0) then                do col=1,N                do row=1,N                if (row == col) then                        A(row,col) = 2                else                        A(row,col) = 0                end if                B(row,col) = row*N+col                C(row,col) = 0                end do                end do                if (verbose) then                        print *,"A"                        call mat_disp(A)                        print *,"B"                        call mat_disp(B)                        print *,"C"                        call mat_disp(C)                end if        end if        ! allocate data handles        allocate(dh_A(NB))        allocate(dh_B(NB))        allocate(dh_C(NB,NB))        ! register matrices        if (comm_rank == 0) then                mr = 0 ! TODO: use STARPU_MAIN_RAM constant        else                mr = -1        end if        tag = 0        do b_row=1,NB                if (comm_rank == 0) then                        call fstarpu_matrix_data_register(dh_A(b_row), mr, &                                c_loc( A(1+(b_row-1)*BS,1) ), N, BS, N, c_sizeof(A(1,1)))                else                        call fstarpu_matrix_data_register(dh_A(b_row), mr, &                                c_null_ptr, N, BS, N, c_sizeof(A(1,1)))                end if                call fstarpu_mpi_data_register(dh_A(b_row), tag, 0)                tag = tag+1        end do        do b_col=1,NB                if (comm_rank == 0) then                        call fstarpu_matrix_data_register(dh_B(b_col), mr, &                                c_loc( B(1,1+(b_col-1)*BS) ), N, N, BS, c_sizeof(B(1,1)))                else                        call fstarpu_matrix_data_register(dh_B(b_col), mr, &                                c_null_ptr, N, N, BS, c_sizeof(B(1,1)))                end if                call fstarpu_mpi_data_register(dh_B(b_col), tag, 0)                tag = tag+1        end do        do b_col=1,NB        do b_row=1,NB                if (comm_rank == 0) then                        call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &                                c_loc( C(1+(b_row-1)*BS,1+(b_col-1)*BS) ), N, BS, BS, c_sizeof(C(1,1)))                else                        call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &                                c_null_ptr, N, BS, BS, c_sizeof(C(1,1)))                end if                call fstarpu_mpi_data_register(dh_C(b_row,b_col), tag, 0)                tag = tag+1        end do        end do        ! distribute matrix C        do b_col=1,NB        do b_row=1,NB        rank = modulo(b_row+b_col, comm_size)        call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), rank)        end do        end do        do b_col=1,NB           do b_row=1,NB              call fstarpu_mpi_task_insert((/ c_loc(comm_world), cl_mm, &                   FSTARPU_R,  dh_A(b_row), &                   FSTARPU_R,  dh_B(b_col), &                   FSTARPU_RW, dh_C(b_row,b_col), &                   C_NULL_PTR /))           end do        end do        call fstarpu_task_wait_for_all()        ! undistribute matrix C        do b_col=1,NB        do b_row=1,NB        call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), 0)        end do        end do        ! unregister matrices        do b_row=1,NB                call fstarpu_data_unregister(dh_A(b_row))        end do        do b_col=1,NB                call fstarpu_data_unregister(dh_B(b_col))        end do        do b_col=1,NB        do b_row=1,NB                call fstarpu_data_unregister(dh_C(b_row,b_col))        end do        end do        ! check result        if (comm_rank == 0) then                if (verbose) then                        print *,"final C"                        call mat_disp(C)                end if                do col=1,N                do row=1,N                if (abs(C(row,col) - 2*(row*N+col)) > 1.0) then                        print *, "check failed"                        stop 1                end if                end do                end do        end if        ! free handles        deallocate(dh_A)        deallocate(dh_B)        deallocate(dh_C)        ! free matrices        if (comm_rank == 0) then                deallocate(A)                deallocate(B)                deallocate(C)        end if        call fstarpu_codelet_free(cl_mm)        call fstarpu_shutdown()        ret = fstarpu_mpi_shutdown()        print *,"fstarpu_mpi_shutdown status:", ret        if (ret /= 0) then                stop 1        end ifend program nf_mm
 |