@@ -0,0 +1,210 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+using LinearAlgebra.BLAS
+
+# Standard kernels for the blocked Cholesky factorization:
+# U22 is the GEMM update of the trailing submatrix
+# U21 is the TRSM update of the tiles below the diagonal
+# U11 is the Cholesky factorization of a diagonal tile
+
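+# At step k of the blocked algorithm, U11 factorizes tile (k, k), U21 solves the
+# tiles below it against that factor, and U22 updates the trailing submatrix.
+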
+@target STARPU_CPU+STARPU_CUDA
+@codelet function u11(sub11 :: Matrix{Float32}) :: Nothing
+    nx :: Int32 = width(sub11)
+    ld :: Int32 = ld(sub11)
+
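+    # Unblocked Cholesky of the diagonal tile: for each column, take the square
+    # root of the pivot, scale the entries below it (SSCAL), then apply a
+    # symmetric rank-1 update (SSYR) to the trailing part of the tile.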
+    for z in 0:nx-1
+        lambda11 :: Float32 = sqrt(sub11[z+1,z+1])
+        sub11[z+1,z+1] = lambda11
+
+        alpha :: Float32 = 1.0f0 / lambda11
+        X :: Vector{Float32} = view(sub11, z+2:z+2+(nx-z-2), z+1)
+        STARPU_SSCAL(nx-z-1, alpha, X, 1)
+
+        alpha = -1.0f0
+        A :: Matrix{Float32} = view(sub11, z+2:z+2+(nx-z-2), z+2:z+2+(nx-z-2))
+        STARPU_SSYR("L", nx-z-1, alpha, X, 1, A, ld)
+    end
+    return
+end
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function u21(sub11 :: Matrix{Float32},
+                      sub21 :: Matrix{Float32}) :: Nothing
+    ld11 :: Int32 = ld(sub11)
+    ld21 :: Int32 = ld(sub21)
+    nx21 :: Int32 = width(sub21)
+    ny21 :: Int32 = height(sub21)
+    alpha :: Float32 = 1.0f0
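+    # Triangular solve: sub21 <- sub21 * inv(L11)^T, where L11 is the factor
+    # previously computed in the corresponding diagonal tile.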
+    STARPU_STRSM("R", "L", "T", "N", nx21, ny21, alpha, sub11, ld11, sub21, ld21)
+    return
+end
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function u22(left :: Matrix{Float32},
+                      right :: Matrix{Float32},
+                      center :: Matrix{Float32}) :: Nothing
+    dx :: Int32 = width(center)
+    dy :: Int32 = height(center)
+    dz :: Int32 = width(left)
+    ld21 :: Int32 = ld(left)
+    ld12 :: Int32 = ld(right)
+    ld22 :: Int32 = ld(center)
+    alpha :: Float32 = -1.0f0
+    beta :: Float32 = 1.0f0
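+    # GEMM update of a trailing tile: center <- center - left * right^T.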
+    STARPU_SGEMM("N", "T", dy, dx, dz, alpha, left, ld21, right, ld12, beta, center, ld22)
+    return
+end
+
+function cholesky(mat :: Matrix{Float32}, size, nblocks)
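+    # History-based performance model shared by the three codelets below.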
+    perfmodel = starpu_perfmodel(
+        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+        symbol = "history_perf"
+    )
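+    # Each codelet binds the CPU kernel generated from the corresponding @codelet
+    # function and declares the access mode of each of its data handles.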
+    cl_11 = starpu_codelet(
+        cpu_func = CPU_CODELETS["u11"],
+        # This kernel cannot be translated to CUDA yet.
+        # cuda_func = CUDA_CODELETS["u11"],
+        modes = [STARPU_RW],
+        color = 0xffff00,
+        perfmodel = perfmodel
+    )
+    cl_21 = starpu_codelet(
+        cpu_func = CPU_CODELETS["u21"],
+        # cuda_func = CUDA_CODELETS["u21"],
+        modes = [STARPU_R, STARPU_RW],
+        color = 0x8080ff,
+        perfmodel = perfmodel
+    )
+    cl_22 = starpu_codelet(
+        cpu_func = CPU_CODELETS["u22"],
+        # cuda_func = CUDA_CODELETS["u22"],
+        modes = [STARPU_R, STARPU_R, STARPU_RW],
+        color = 0x00ff00,
+        perfmodel = perfmodel
+    )
+
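+    # The two filters partition the registered matrix into an nblocks x nblocks
+    # grid of tiles; h_mat[m, n] below refers to the handle of tile (m, n).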
+    horiz = starpu_data_filter(STARPU_MATRIX_FILTER_BLOCK, nblocks)
+    vert = starpu_data_filter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nblocks)
+
+    @starpu_block let
+        h_mat = starpu_data_register(mat)
+        starpu_data_map_filters(h_mat, horiz, vert)
+
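+        # Right-looking blocked Cholesky: tasks are submitted asynchronously and
+        # StarPU infers their dependencies from the declared access modes.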
+        for k in 1:nblocks
+
+            starpu_iteration_push(k)
+
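+            # Factorize the diagonal tile (k, k) in place.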
+            task = starpu_task(cl = cl_11, handles = [h_mat[k, k]])
+            starpu_task_submit(task)
+
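+            # Solve the tiles of column k below the diagonal against the new factor.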
+            for m in k+1:nblocks
+                task = starpu_task(cl = cl_21, handles = [h_mat[k, k], h_mat[m, k]])
+                starpu_task_submit(task)
+            end
+
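+            # Update the trailing submatrix with the tiles just computed in column k.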
+            for m in k+1:nblocks
+                for n in k+1:nblocks
+                    if n <= m
+                        task = starpu_task(cl = cl_22, handles = [h_mat[m, k], h_mat[n, k], h_mat[m, n]])
+                        starpu_task_submit(task)
+                    end
+                end
+            end
+
+            starpu_iteration_pop()
+        end
+
+        starpu_task_wait_for_all()
+    end
+end
+
+function check(mat::Matrix{Float32})
+    size_p = size(mat, 1)
+
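+    # Zero the strict upper triangle so that mat holds only the computed factor L.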
+    for i in 1:size_p
+        for j in 1:size_p
+            if j > i
+                mat[i, j] = 0.0f0
+            end
+        end
+    end
+
+    test_mat :: Matrix{Float32} = zeros(Float32, size_p, size_p)
+
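+    # Recompute L * L' with a rank-k update and compare it, entry by entry, with
+    # the original matrix built in main().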
+    syrk!('L', 'N', 1.0f0, mat, 0.0f0, test_mat)
+
+    for i in 1:size_p
+        for j in 1:size_p
+            if j <= i
+                orig = (1.0f0/(1.0f0+(i-1)+(j-1))) + ((i == j) ? 1.0f0*size_p : 0.0f0)
+                err = abs(test_mat[i,j] - orig) / orig
+                if err > 0.0001
+                    got = test_mat[i,j]
+                    expected = orig
+                    error("[$i, $j] -> $got != $expected (err $err)")
+                end
+            end
+        end
+    end
+
+    println("Verification successful!")
+end
+
+function main(size_p :: Int, nblocks :: Int, verbose = false)
+    starpu_init()
+
+    mat :: Matrix{Float32} = zeros(Float32, size_p, size_p)
+
+    # Create a simple symmetric positive definite matrix:
+    # a Hilbert matrix h(i,j) = 1/(i+j-1) (1-based indices) with size_p added
+    # to the diagonal.
+
+    for i in 1:size_p
+        for j in 1:size_p
+            mat[i, j] = 1.0f0 / (1.0f0+(i-1)+(j-1)) + ((i == j) ? 1.0f0*size_p : 0.0f0)
+        end
+    end
+
+    if verbose
+        display(mat)
+    end
+
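+    # Pin the host buffer so StarPU can perform asynchronous CPU<->GPU transfers on it.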
+    starpu_memory_pin(mat)
+
+    t_start = time_ns()
+
+    cholesky(mat, size_p, nblocks)
+
+    t_end = time_ns()
+
+    starpu_memory_unpin(mat)
+
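+    # A Cholesky factorization performs roughly size_p^3 / 3 floating point operations.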
+    flop = (1.0*size_p*size_p*size_p)/3.0
+    println("# size\tms\tGFlops")
+    time_ms = (t_end-t_start) / 1e6
+    gflops = flop/(time_ms*1000)/1000
+    println("# $size_p\t$time_ms\t$gflops")
+
+    if verbose
+        display(mat)
+    end
+
+    check(mat)
+
+    starpu_shutdown()
+end
+
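+# Factorize a 1024 x 1024 matrix partitioned into an 8 x 8 grid of tiles.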
+main(1024, 8)