# StarPU --- Runtime system for heterogeneous multicore architectures.
#
# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
#
# StarPU is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or (at
# your option) any later version.
#
# StarPU is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the GNU Lesser General Public License in COPYING.LGPL for more details.
#
using StarPU
using LinearAlgebra.BLAS

# Standard kernels for the Cholesky factorization
# U22 is the gemm update
# U21 is the trsm update
# U11 is the cholesky factorization

@target STARPU_CPU+STARPU_CUDA
@codelet function u11(sub11 :: Matrix{Float32}) :: Nothing
    nx :: Int32 = width(sub11)
    ld :: Int32 = ld(sub11)

    for z in 0:nx-1
        lambda11 :: Float32 = sqrt(sub11[z+1,z+1])
        sub11[z+1,z+1] = lambda11

        alpha ::Float32 = 1.0f0 / lambda11
        X :: Vector{Float32} = view(sub11, z+2:z+2+(nx-z-2), z+1)
        STARPU_SSCAL(nx-z-1, alpha, X, 1)

        alpha = -1.0f0
        A :: Matrix{Float32} = view(sub11, z+2:z+2+(nx-z-2), z+2:z+2+(nx-z-2))
	STARPU_SSYR("L", nx-z-1, alpha, X, 1, A, ld)
    end
    return
end

@target STARPU_CPU+STARPU_CUDA
@codelet function u21(sub11 :: Matrix{Float32},
                      sub21 :: Matrix{Float32}) :: Nothing
    ld11 :: Int32 = ld(sub11)
    ld21 :: Int32 = ld(sub21)
    nx21 :: Int32 = width(sub21)
    ny21 :: Int32 = height(sub21)
    alpha :: Float32 = 1.0f0
    STARPU_STRSM("R", "L", "T", "N", nx21, ny21, alpha, sub11, ld11, sub21, ld21)
    return
end

@target STARPU_CPU+STARPU_CUDA
@codelet function u22(left   :: Matrix{Float32},
                      right  :: Matrix{Float32},
                      center :: Matrix{Float32}) :: Nothing
    dx :: Int32 = width(center)
    dy :: Int32 = height(center)
    dz :: Int32 = width(left)
    ld21 :: Int32 = ld(left)
    ld12 :: Int32 = ld(center)
    ld22 :: Int32 = ld(right)
    alpha :: Float32 = -1.0f0
    beta :: Float32 = 1.0f0
    STARPU_SGEMM("N", "T", dy, dx, dz, alpha, left, ld21, right, ld12, beta, center, ld22)
    return
end

function cholesky(mat :: Matrix{Float32}, size, nblocks)
    perfmodel = starpu_perfmodel(
        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
        symbol = "history_perf"
    )
    cl_11 = starpu_codelet(
        cpu_func = CPU_CODELETS["u11"],
        # This kernel cannot be translated to CUDA yet.
        # cuda_func = CUDA_CODELETS["u11"],
        modes = [STARPU_RW],
        color = 0xffff00,
        perfmodel = perfmodel
    )
    cl_21 = starpu_codelet(
        cpu_func = CPU_CODELETS["u21"],
        # cuda_func = CUDA_CODELETS["u21"],
        modes = [STARPU_R, STARPU_RW],
        color = 0x8080ff,
        perfmodel = perfmodel
    )
    cl_22 = starpu_codelet(
        cpu_func = CPU_CODELETS["u22"],
        # cuda_func = CUDA_CODELETS["u22"],
        modes = [STARPU_R, STARPU_R, STARPU_RW],
        color = 0x00ff00,
        perfmodel = perfmodel
    )

    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nblocks)
    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nblocks)

    @starpu_block let
        h_mat = starpu_data_register(mat)
        starpu_data_map_filters(h_mat, horiz, vert)

        for k in 1:nblocks

            starpu_iteration_push(k)

            task = starpu_task(cl = cl_11, handles = [h_mat[k, k]])
            starpu_task_submit(task)

            for m in k+1:nblocks
                task = starpu_task(cl = cl_21, handles = [h_mat[k, k], h_mat[m, k]])
                starpu_task_submit(task)
            end

            for m in k+1:nblocks
                for n in k+1:nblocks
                    if n <= m
                        task = starpu_task(cl = cl_22, handles = [h_mat[m, k], h_mat[n, k], h_mat[m, n]])
                        starpu_task_submit(task)
                    end
                end
            end

            starpu_iteration_pop()
        end

        starpu_task_wait_for_all()
    end
end

function check(mat::Matrix{Float32})
    size_p = size(mat, 1)

    for i in 1:size_p
        for j in 1:size_p
            if j > i
                mat[i, j] = 0.0f0
            end
        end
    end

    test_mat ::Matrix{Float32} = zeros(Float32, size_p, size_p)

    syrk!('L', 'N', 1.0f0, mat, 0.0f0, test_mat)

    for i in 1:size_p
        for j in 1:size_p
            if j <= i
                orig = (1.0f0/(1.0f0+(i-1)+(j-1))) + ((i == j) ? 1.0f0*size_p : 0.0f0)
                err = abs(test_mat[i,j] - orig) / orig
                if err > 0.0001
                    got = test_mat[i,j]
                    expected = orig
                    error("[$i, $j] -> $got != $expected (err $err)")
                end
            end
        end
    end

    println("Verification successful !")
end

function main(size_p :: Int, nblocks :: Int, verbose = false)
    starpu_init()

    mat :: Matrix{Float32} = zeros(Float32, size_p, size_p)

    # create a simple definite positive symetric matrix
    # Hilbert matrix h(i,j) = 1/(i+j+1)

    for i in 1:size_p
        for j in 1:size_p
            mat[i, j] = 1.0f0 / (1.0f0+(i-1)+(j-1)) + ((i == j) ? 1.0f0*size_p : 0.0f0)
        end
    end

    if verbose
        display(mat)
    end

    starpu_memory_pin(mat)

    t_start = time_ns()

    cholesky(mat, size_p, nblocks)

    t_end = time_ns()

    starpu_memory_unpin(mat)

    flop = (1.0*size_p*size_p*size_p)/3.0
    println("# size\tms\tGFlops")
    time_ms = (t_end-t_start) / 1e6
    gflops = flop/(time_ms*1000)/1000
    println("# $size_p\t$time_ms\t$gflops")

    if verbose
        display(mat)
    end

    check(mat)

    starpu_shutdown()
end

main(1024, 8)