/* StarPU --- Runtime system for heterogeneous multicore architectures. * * Copyright (C) 2009-2020 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria * Copyright (C) 2010 Mehdi Juhoor * Copyright (C) 2013 Thibaut Lambert * * StarPU is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or (at * your option) any later version. * * StarPU is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License in COPYING.LGPL for more details. */ /* * This version of the Cholesky factorization uses implicit dependency computation. * The whole algorithm thus appears clearly in the task submission loop in _cholesky(). */ /* Note: this is using fortran ordering, i.e. column-major ordering, i.e. * elements with consecutive row number are consecutive in memory */ #include "cholesky.h" #include "../sched_ctx_utils/sched_ctx_utils.h" #if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA) #include "magma.h" #endif /* * code to bootstrap the factorization * and construct the DAG */ static void callback_turn_spmd_on(void *arg) { (void)arg; cl22.type = STARPU_SPMD; } static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks) { double start; double end; unsigned k,m,n; unsigned long nx = starpu_matrix_get_nx(dataA); unsigned long nn = nx/nblocks; unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN; if (bound_p || bound_lp_p || bound_mps_p) starpu_bound_start(bound_deps_p, 0); starpu_fxt_start_profiling(); start = starpu_timing_now(); /* create all the DAG nodes */ for (k = 0; k < nblocks; k++) { int ret; starpu_iteration_push(k); starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k); ret = starpu_task_insert(&cl11, STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO, STARPU_RW, sdatakk, STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL, STARPU_FLOPS, (double) FLOPS_SPOTRF(nn), STARPU_TAG_ONLY, TAG11(k), 0); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert"); for (m = k+1; m m) { mat[m+n*size] = 0.0f; /* debug */ } } } float *test_mat = malloc(size*size*sizeof(float)); STARPU_ASSERT(test_mat); STARPU_SSYRK("L", "N", size, size, 1.0f, mat, size, 0.0f, test_mat, size); FPRINTF(stderr, "comparing results ...\n"); #ifdef PRINT_OUTPUT for (m = 0; m < size; m++) { for (n = 0; n < size; n++) { if (n <= m) { FPRINTF(stdout, "%2.2f\t", test_mat[m +n*size]); } else { FPRINTF(stdout, ".\t"); } } FPRINTF(stdout, "\n"); } #endif for (m = 0; m < size; m++) { for (n = 0; n < size; n++) { if (n <= m) { float orig = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size:0.0f); float err = fabsf(test_mat[m +n*size] - orig) / orig; if (err > 0.0001) { FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", m, n, test_mat[m +n*size], orig, err); assert(0); } } } } free(test_mat); } starpu_free_flags(mat, (size_t)size*size*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED); #endif } int main(int argc, char **argv) { /* create a simple definite positive symetric matrix example * * Hilbert matrix : h(i,j) = 1/(i+j+1) * */ #ifdef STARPU_HAVE_MAGMA magma_init(); #endif int ret; ret = starpu_init(NULL); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); //starpu_fxt_stop_profiling(); init_sizes(); parse_args(argc, argv); if(with_ctxs_p || with_noctxs_p || chole1_p || chole2_p) parse_args_ctx(argc, argv); #ifdef STARPU_USE_CUDA initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost); initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost); initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost); #else initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL); initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL); initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL); #endif starpu_cublas_init(); if(with_ctxs_p) { construct_contexts(); start_2benchs(execute_cholesky); } else if(with_noctxs_p) start_2benchs(execute_cholesky); else if(chole1_p) start_1stbench(execute_cholesky); else if(chole2_p) start_2ndbench(execute_cholesky); else execute_cholesky(size_p, nblocks_p); starpu_cublas_shutdown(); starpu_shutdown(); return 0; }