! StarPU --- Runtime system for heterogeneous multicore architectures.
!
! Copyright (C) 2015  ONERA
! Copyright (C) 2015-2016  Inria
!
! StarPU is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at
! your option) any later version.
!
! StarPU is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
!
! See the GNU Lesser General Public License in COPYING.LGPL for more details.

! This is an example of Fortran90 program making use of StarPU.
! It registers a few matrices for each element of a domain, performs
! update computations on them, and checks the result.

PROGRAM f90_example

  USE nf_types
  USE fstarpu_mod
  USE nf_compute
  USE iso_c_binding

  IMPLICIT NONE

  TYPE(type_mesh)                :: mesh
  TYPE(type_numpar),TARGET       :: numpar
  TYPE(type_mesh_elt),POINTER    :: elt   => NULL()
  INTEGER(KIND=C_INT)            :: i,Nelt,res,cpus
  INTEGER(KIND=C_INT)            :: starpu_maj,starpu_min,starpu_rev
  INTEGER(KIND=C_INT)            :: neq,ng,nb,it,it_tot
  REAL(KIND=C_DOUBLE)            :: r, coeff2

  TYPE(C_PTR) :: cl_loop_element = C_NULL_PTR ! loop codelet
  TYPE(C_PTR) :: cl_copy_element = C_NULL_PTR ! copy codelet

  !Initialization with arbitrary data
  Nelt           = 2
  it_tot = 2
  numpar%Neq_max = 5
  numpar%coeff   = 1.0
  ALLOCATE(mesh%elt(Nelt))
  DO i = 1,Nelt
     elt => mesh%elt(i)
     elt%Ng  = 4
     elt%Np  = 2
     ALLOCATE(elt%ro(numpar%Neq_max,elt%Np))
     ALLOCATE(elt%dro(numpar%Neq_max,elt%Np))
     ALLOCATE(elt%basis(elt%Np,elt%Ng))
     CALL init_element(elt%ro,elt%dro,elt%basis,numpar%Neq_max,elt%Np,elt%Ng,i)
  ENDDO

  !Initialization of StarPU
  res = fstarpu_init(C_NULL_PTR)
  IF (res == -19) THEN
     STOP 77
  END IF
  CALL fstarpu_get_version(starpu_maj,starpu_min,starpu_rev)
  WRITE(6,'(a,i4,a,i4,a,i4)')      "StarPU version: ", starpu_maj , "." , starpu_min , "." , starpu_rev
  cpus = fstarpu_cpu_worker_get_count()
  IF (cpus == 0) THEN
     CALL fstarpu_shutdown()
     STOP 77
  END IF

  cl_loop_element = fstarpu_codelet_allocate()
  CALL fstarpu_codelet_add_cpu_func(cl_loop_element, C_FUNLOC(loop_element_cpu_fortran))
  CALL fstarpu_codelet_add_buffer(cl_loop_element, FSTARPU_R)
  CALL fstarpu_codelet_add_buffer(cl_loop_element, FSTARPU_RW)
  CALL fstarpu_codelet_add_buffer(cl_loop_element, FSTARPU_R)
  CALL fstarpu_codelet_set_name(cl_loop_element, C_CHAR_"LOOP_ELEMENT"//C_NULL_CHAR)

  cl_copy_element = fstarpu_codelet_allocate()
  CALL fstarpu_codelet_add_cpu_func(cl_copy_element, C_FUNLOC(copy_element_cpu_fortran))
  CALL fstarpu_codelet_add_buffer(cl_copy_element, FSTARPU_RW)
  CALL fstarpu_codelet_add_buffer(cl_copy_element, FSTARPU_R)
  CALL fstarpu_codelet_set_name(cl_copy_element, C_CHAR_"COPY_ELEMENT"//C_NULL_CHAR)

  !Registration of elements
  DO i = 1,Nelt
     elt => mesh%elt(i)
     call fstarpu_matrix_data_register(elt%ro_h, 0, c_loc(elt%ro), numpar%Neq_max, numpar%Neq_max, elt%Np, c_sizeof(elt%ro(1,1)))
     call fstarpu_matrix_data_register(elt%dro_h, 0, c_loc(elt%dro), numpar%Neq_max, numpar%Neq_max, elt%Np, c_sizeof(elt%dro(1,1)))
     call fstarpu_matrix_data_register(elt%basis_h, 0, c_loc(elt%basis), elt%Np, elt%Np, elt%Ng, c_sizeof(elt%basis(1,1)))
  ENDDO
  !Compute
  DO it = 1,it_tot

     ! compute new dro for each element
     DO i = 1,Nelt
        elt => mesh%elt(i)
        CALL fstarpu_insert_task((/ cl_loop_element,    &
                FSTARPU_VALUE, c_loc(numpar%coeff), FSTARPU_SZ_C_DOUBLE, &
                FSTARPU_R, elt%ro_h,                 &
                FSTARPU_RW, elt%dro_h,                &
                FSTARPU_R, elt%basis_h,              &
                C_NULL_PTR /))
     ENDDO
     ! sync (if needed by the algorithm)
     CALL fstarpu_task_wait_for_all()

     ! - - - - -

     ! copy dro to ro for each element
     DO i = 1,Nelt
        elt => mesh%elt(i)
        CALL fstarpu_insert_task((/ cl_copy_element,    &
                FSTARPU_RW, elt%ro_h,                 &
                FSTARPU_R, elt%dro_h,                &
                C_NULL_PTR /))
     ENDDO
     ! sync (if needed by the algorithm)
     CALL fstarpu_task_wait_for_all()

  ENDDO
  !Unregistration of elements
  DO i = 1,Nelt
     elt => mesh%elt(i)
     CALL fstarpu_data_unregister(elt%ro_h)
     CALL fstarpu_data_unregister(elt%dro_h)
     CALL fstarpu_data_unregister(elt%basis_h)
  ENDDO

  !Terminate StarPU, no task can be submitted after
  CALL fstarpu_shutdown()

  !Check data with StarPU
  WRITE(6,'(a)') " "
  WRITE(6,'(a)') " %%%% RESULTS STARPU %%%% "
  WRITE(6,'(a)') " "
  DO i = 1,Nelt
     WRITE(6,'(a,i4,a)')      " elt ", i , "  ;  elt%ro = "
     WRITE(6,'(10(1x,F11.2))') mesh%elt(i)%ro
     WRITE(6,'(a)')           " ------------------------ "
  ENDDO

  !Same compute without StarPU
  DO i = 1,Nelt
     elt => mesh%elt(i)
     CALL init_element(elt%ro,elt%dro,elt%basis,numpar%Neq_max,elt%Np,elt%Ng,i)
  ENDDO

  DO it = 1, it_tot
     DO i = 1,Nelt
        elt => mesh%elt(i)
        CALL loop_element_cpu(elt%ro,elt%dro,elt%basis,numpar%coeff,numpar%Neq_max,elt%Ng,elt%Np)
        elt%ro = elt%ro + elt%dro
     ENDDO
  ENDDO

  WRITE(6,'(a)') " "
  WRITE(6,'(a)') " %%%% RESULTS VERIFICATION %%%% "
  WRITE(6,'(a)') " "

  DO i = 1,Nelt
     WRITE(6,'(a,i4,a)')      " elt ", i , "  ;  elt%ro = "
     WRITE(6,'(10(1x,F11.2))') mesh%elt(i)%ro
     WRITE(6,'(a)')           " ------------------------ "
  ENDDO

  WRITE(6,'(a)') " "

  !Deallocation
  CALL fstarpu_codelet_free(cl_loop_element)
  CALL fstarpu_codelet_free(cl_copy_element)
  DO i = 1,Nelt
     elt => mesh%elt(i)
     DEALLOCATE(elt%ro)
     DEALLOCATE(elt%dro)
     DEALLOCATE(elt%basis)
  ENDDO
  DEALLOCATE(mesh%elt)

END PROGRAM f90_example