/* * StarPU * Copyright (C) INRIA 2008-2009 (see AUTHORS file) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License in COPYING.LGPL for more details. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef USE_CUDA #include #endif #define BLOCK 75 #include "starpu-blas-wrapper.h" extern struct data_interface_ops_t interface_blas_ops; static int core_sgemm = 0; static int cublas_sgemm = 0; static int core_strsm = 0; static int cublas_strsm = 0; static int inited = 0; void STARPU_INIT(void) { if (!inited) { inited = 1; starpu_init(NULL); } } void STARPU_TERMINATE(void) { starpu_shutdown(); fprintf(stderr, "sgemm : core %d cublas %d\n", core_sgemm, cublas_sgemm); fprintf(stderr, "strsm : core %d cublas %d\n", core_strsm, cublas_strsm); } /* * * Specific to PaStiX ! * */ /* * * We "need" some custom filters * * VECTOR * (n) * / | \ * VECTOR BLAS VECTOR * (n1) (n2) * * if n1 = 0 : * VECTOR * / \ * BLAS VECTOR */ struct divide_vector_in_blas_filter_args { uint32_t n1, n2; /* (total size of the first portion (vector length) n < root's n ! */ uint32_t stride; /* stride of the first portion (need to be a multiple of n */ }; unsigned divide_vector_in_blas_filter(starpu_filter *f, starpu_data_handle root_data) { starpu_vector_interface_t *vector_root = &root_data->interface[0].vector; uint32_t nx = vector_root->nx; size_t elemsize = vector_root->elemsize; struct divide_vector_in_blas_filter_args *args = f->filter_arg_ptr; unsigned n1 = args->n1; unsigned n2 = args->n2; unsigned stride = args->stride; STARPU_ASSERT(n1 + n2 < nx); unsigned n3 = nx - n1 - n2; /* first allocate the children starpu_data_handle */ root_data->children = calloc((n1==0)?2:3, sizeof(starpu_data_handle)); STARPU_ASSERT(root_data->children); STARPU_ASSERT((n2 % args->stride) == 0); unsigned child = 0; unsigned node; if (n1 > 0) { for (node = 0; node < MAXNODES; node++) { starpu_vector_interface_t *local = &root_data->children[child].interface[node].vector; local->nx = n1; local->elemsize = elemsize; if (root_data->per_node[node].allocated) { local->ptr = root_data->interface[node].vector.ptr; } } child++; } for (node = 0; node < MAXNODES; node++) { starpu_blas_interface_t *local = &root_data->children[child].interface[node].blas; local->nx = stride; local->ny = n2/stride; local->ld = stride; local->elemsize = elemsize; if (root_data->per_node[node].allocated) { local->ptr = root_data->interface[node].vector.ptr + n1*elemsize; } struct starpu_data_state_t *state = &root_data->children[child]; state->ops = &interface_blas_ops; } child++; for (node = 0; node < MAXNODES; node++) { starpu_vector_interface_t *local = &root_data->children[child].interface[node].vector; local->nx = n3; local->elemsize = elemsize; if (root_data->per_node[node].allocated) { local->ptr = root_data->interface[node].vector.ptr + (n1+n2)*elemsize; } } return (n1==0)?2:3; } static data_state *cblktab; static void _cublas_cblk_strsm_callback(void *sem) { sem_t *semptr = sem; sem_post(semptr); } void STARPU_MONITOR_DATA(unsigned ncols) { cblktab = calloc(ncols, sizeof(data_state)); } void STARPU_MONITOR_CBLK(unsigned col, float *data, unsigned stride, unsigned width) { //void starpu_monitor_blas_data(struct starpu_data_state_t *state, uint32_t home_node, // uintptr_t ptr, uint32_t ld, uint32_t nx, // uint32_t ny, size_t elemsize); //fprintf(stderr, "col %d data %p stride %d width %d\n", col, data, stride, width); starpu_monitor_blas_data(&cblktab[col], 0 /* home */, (uintptr_t) data, stride, stride, width, sizeof(float)); } static data_state work_block_1; static data_state work_block_2; void allocate_maxbloktab_on_cublas(starpu_data_interface_t *descr __attribute__((unused)), void *arg __attribute__((unused))) { request_data_allocation(&work_block_1, 1); request_data_allocation(&work_block_2, 1); starpu_filter f1, f2; struct divide_vector_in_blas_filter_args args1, args2; f1.filter_func = divide_vector_in_blas_filter; args1.n1 = 1; /* XXX random ... */ args1.n2 = 2; args1.stride = 1; f1.filter_arg_ptr = &args1; starpu_partition_data(&work_block_1, &f1); f2.filter_func = divide_vector_in_blas_filter; args2.n1 = 0; args2.n2 = 2; args2.stride = 1; f2.filter_arg_ptr = &args2; starpu_partition_data(&work_block_2, &f2); } void STARPU_DECLARE_WORK_BLOCKS(float *maxbloktab1, float *maxbloktab2, unsigned solv_coefmax) { starpu_monitor_vector_data(&work_block_1, 0 /* home */, (uintptr_t)maxbloktab1, solv_coefmax, sizeof(float)); starpu_monitor_vector_data(&work_block_2, 0 /* home */, (uintptr_t)maxbloktab2, solv_coefmax, sizeof(float)); starpu_codelet cl; job_t j; sem_t sem; /* initialize codelet */ cl.where = CUBLAS; cl.cublas_func = allocate_maxbloktab_on_cublas; j = job_create(); j->cb = _cublas_cblk_strsm_callback; j->argcb = &sem; j->cl = &cl; j->cl_arg = NULL; j->nbuffers = 0; j->cl->model = NULL; sem_init(&sem, 0, 0U); /* submit the codelet */ submit_job(j); /* wait for its completion */ sem_wait(&sem); sem_destroy(&sem); } void _core_cblk_strsm(starpu_data_interface_t *descr, void *arg __attribute__((unused))) { uint32_t nx, ny, ld; nx = descr[0].blas.nx; ny = descr[0].blas.ny; ld = descr[0].blas.ld; float *diag_cblkdata, *extra_cblkdata; diag_cblkdata = (float *)descr[0].blas.ptr; extra_cblkdata = diag_cblkdata + ny; unsigned m = nx - ny; unsigned n = ny; // SOPALIN_TRSM("R","L","T","U",dimb,dima,fun,ga,stride,gb,stride); core_strsm++; cblas_strsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasUnit, m, n, 1.0f, diag_cblkdata, ld, extra_cblkdata, ld); } void _cublas_cblk_strsm(starpu_data_interface_t *descr, void *arg __attribute__((unused))) { uint32_t nx, ny, ld; nx = descr[0].blas.nx; ny = descr[0].blas.ny; ld = descr[0].blas.ld; float *diag_cblkdata, *extra_cblkdata; diag_cblkdata = (float *)descr[0].blas.ptr; extra_cblkdata = diag_cblkdata + ny; unsigned m = nx - ny; unsigned n = ny; cublas_strsm++; cublasStrsm ('R', 'L', 'T', 'U', m, n, 1.0, diag_cblkdata, ld, extra_cblkdata, ld); cublasStatus st = cublasGetError(); if (st) fprintf(stderr, "ERROR %d\n", st); STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS); } static struct starpu_perfmodel_t starpu_cblk_strsm = { .per_arch = { [STARPU_CORE_DEFAULT] = { .cost_model = starpu_cblk_strsm_core_cost }, [STARPU_CUDA_DEFAULT] = { .cost_model = starpu_cblk_strsm_cuda_cost } }, // .type = REGRESSION_BASED, .type = PER_ARCH, .symbol = "starpu_cblk_strsm" }; void STARPU_CBLK_STRSM(unsigned col) { /* perform a strsm on the block column */ starpu_codelet cl; job_t j; sem_t sem; /* initialize codelet */ cl.where = CORE|CUBLAS; cl.core_func = _core_cblk_strsm; cl.cublas_func = _cublas_cblk_strsm; j = job_create(); // j->where = (starpu_get_blas_nx(&cblktab[col]) > BLOCK && starpu_get_blas_ny(&cblktab[col]) > BLOCK)? CUBLAS:CORE; j->cb = _cublas_cblk_strsm_callback; j->argcb = &sem; j->cl = &cl; j->cl_arg = NULL; j->nbuffers = 1; /* we could be a little more precise actually */ j->buffers[0].state = &cblktab[col]; j->buffers[0].mode = RW; j->cl->model = &starpu_cblk_strsm; sem_init(&sem, 0, 0U); /* submit the codelet */ submit_job(j); /* wait for its completion */ sem_wait(&sem); sem_destroy(&sem); } struct starpu_compute_contrib_compact_args { unsigned stride; int dimi; int dimj; int dima; }; void _core_compute_contrib_compact(starpu_data_interface_t *descr, void *arg) { struct starpu_compute_contrib_compact_args *args = arg; float *gaik = (float *)descr[0].blas.ptr + args->dima; float *gb = (float *)descr[1].blas.ptr; unsigned strideb = (unsigned)descr[1].blas.ld; float *gc = (float *)descr[2].blas.ptr; unsigned stridec = (unsigned)descr[2].blas.ld; core_sgemm++; cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, args->dimi, args->dimj, args->dima, 1.0f, gaik, args->stride, gb, strideb, 0.0 , gc, stridec); } void _cublas_compute_contrib_compact(starpu_data_interface_t *descr, void *arg) { struct starpu_compute_contrib_compact_args *args = arg; float *gaik = (float *)descr[0].blas.ptr + args->dima; float *gb = (float *)descr[1].blas.ptr; unsigned strideb = (unsigned)descr[1].blas.ld; float *gc = (float *)descr[2].blas.ptr; unsigned stridec = (unsigned)descr[2].blas.ld; cublas_sgemm++; cublasSgemm('N','T', args->dimi, args->dimj, args->dima, 1.0, gaik, args->stride, gb, strideb, 0.0, gc, stridec); cublasStatus st = cublasGetError(); if (st) fprintf(stderr, "ERROR %d\n", st); STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS); } static struct starpu_perfmodel_t starpu_compute_contrib_compact = { .per_arch = { [STARPU_CORE_DEFAULT] = { .cost_model = starpu_compute_contrib_compact_core_cost }, [STARPU_CUDA_DEFAULT] = { .cost_model = starpu_compute_contrib_compact_cuda_cost } }, // .type = REGRESSION_BASED, .type = PER_ARCH, .symbol = "starpu_compute_contrib_compact" }; int update_work_blocks(unsigned col, int dimi, int dimj, int dima, int stride) { /* be paranoid XXX */ notify_data_modification(get_sub_data(&work_block_1, 1, 0), 0); notify_data_modification(get_sub_data(&work_block_1, 1, 1), 0); //notify_data_modification(get_sub_data(&work_block_1, 1, 2), 0); notify_data_modification(get_sub_data(&work_block_2, 1, 0), 0); notify_data_modification(get_sub_data(&work_block_2, 1, 1), 0); notify_data_modification(&cblktab[col], 0); starpu_unpartition_data(&work_block_1, 0); starpu_unpartition_data(&work_block_2, 0); starpu_filter f1, f2; struct divide_vector_in_blas_filter_args args1, args2; f1.filter_func = divide_vector_in_blas_filter; args1.n1 = stride - dima - dimi; //STARPU_ASSERT(args1.n1 != 0); args1.n2 = (stride - dima)*dima; args1.stride = (stride - dima); f1.filter_arg_ptr = &args1; starpu_partition_data(&work_block_1, &f1); f2.filter_func = divide_vector_in_blas_filter; args2.n1 = 0; args2.n2 = dimi*dimj; args2.stride = dimi; f2.filter_arg_ptr = &args2; starpu_partition_data(&work_block_2, &f2); return (args1.n1!=0)?3:2; } void STARPU_COMPUTE_CONTRIB_COMPACT(unsigned col, int dimi, int dimj, int dima, int stride) { // CUBLAS_SGEMM("N","T",dimi,dimj,dima, 1.0,gaik,stride,gb,stride-dima, // 0.0 ,gc,dimi); struct starpu_compute_contrib_compact_args args; args.stride = stride; args.dimi = dimi; args.dimj = dimj; args.dima = dima; starpu_codelet cl; job_t j; sem_t sem; /* initialize codelet */ cl.where = CUBLAS|CORE; cl.core_func = _core_compute_contrib_compact; cl.cublas_func = _cublas_compute_contrib_compact; j = job_create(); j->cb = _cublas_cblk_strsm_callback; j->argcb = &sem; j->cl = &cl; j->cl_arg = &args; j->cl->model = &starpu_compute_contrib_compact; int ret; ret = update_work_blocks(col, dimi, dimj, dima, stride); j->nbuffers = 3; /* we could be a little more precise actually */ j->buffers[0].state = &cblktab[col]; // gaik j->buffers[0].mode = R; j->buffers[1].state = get_sub_data(&work_block_1, 1, (ret==2)?0:1); j->buffers[1].mode = R; j->buffers[2].state = get_sub_data(&work_block_2, 1, 0);; j->buffers[2].mode = RW; // XXX W sem_init(&sem, 0, 0U); /* submit the codelet */ submit_job(j); /* wait for its completion */ sem_wait(&sem); sem_destroy(&sem); } /* * * SGEMM * */ struct sgemm_args { char transa; char transb; int m, n, k; float alpha; float beta; }; void _cublas_sgemm(starpu_data_interface_t *descr, void *arg) { float *A, *B, *C; uint32_t nxA, nyA, ldA; uint32_t nxB, nyB, ldB; uint32_t nxC, nyC, ldC; A = (float *)descr[0].blas.ptr; nxA = descr[0].blas.nx; nyA = descr[0].blas.ny; ldA = descr[0].blas.ld; B = (float *)descr[1].blas.ptr; nxB = descr[1].blas.nx; nyB = descr[1].blas.ny; ldB = descr[1].blas.ld; C = (float *)descr[2].blas.ptr; nxC = descr[2].blas.nx; nyC = descr[2].blas.ny; ldC = descr[2].blas.ld; struct sgemm_args *args = arg; // fprintf(stderr, "CUBLAS SGEMM nxA %d nyA %d nxB %d nyB %d nxC %d nyC %d lda %d ldb %d ldc %d\n", nxA, nyA, nxB, nyB, nxC, nyC, ldA, ldB, ldC); // STARPU_ASSERT(nxA == nxC); // STARPU_ASSERT(nyA == nxB); // STARPU_ASSERT(nyB == nyC); // // STARPU_ASSERT(nxA <= ldA); // STARPU_ASSERT(nxB <= ldB); // STARPU_ASSERT(nxC <= ldC); cublasSgemm (args->transa, args->transb, args->m, args->n, args->k, args->alpha, A, (int)ldA, B, (int)ldB, args->beta, C, (int)ldC); cublasStatus st = cublasGetError(); if (st) fprintf(stderr, "ERROR %d\n", st); STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS); } static void _cublas_sgemm_callback(void *sem) { sem_t *semptr = sem; sem_post(semptr); } void STARPU_SGEMM (const char *transa, const char *transb, const int m, const int n, const int k, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc) { struct sgemm_args args; args.transa = *transa; args.transb = *transb; args.alpha = alpha; args.beta = beta; args.m = m; args.n = n; args.k = k; data_state A_state; data_state B_state; data_state C_state; starpu_codelet cl; job_t j; sem_t sem; // fprintf(stderr, "STARPU - SGEMM - TRANSA %c TRANSB %c m %d n %d k %d lda %d ldb %d ldc %d \n", *transa, *transb, m, n, k, lda, ldb, ldc); if (toupper(*transa) == 'N') { starpu_monitor_blas_data(&A_state, 0, (uintptr_t)A, lda, m, k, sizeof(float)); } else { starpu_monitor_blas_data(&A_state, 0, (uintptr_t)A, lda, k, m, sizeof(float)); } if (toupper(*transb) == 'N') { starpu_monitor_blas_data(&B_state, 0, (uintptr_t)B, ldb, k, n, sizeof(float)); } else { starpu_monitor_blas_data(&B_state, 0, (uintptr_t)B, ldb, n, k, sizeof(float)); } starpu_monitor_blas_data(&C_state, 0, (uintptr_t)C, ldc, m, n, sizeof(float)); /* initialize codelet */ cl.where = CUBLAS; //cl.core_func = _core_strsm; cl.cublas_func = _cublas_sgemm; j = job_create(); j->cb = _cublas_sgemm_callback; j->argcb = &sem; j->cl = &cl; j->cl_arg = &args; j->nbuffers = 3; j->buffers[0].state = &A_state; j->buffers[0].mode = R; j->buffers[1].state = &B_state; j->buffers[1].mode = R; j->buffers[2].state = &C_state; j->buffers[2].mode = RW; j->cl->model = NULL; sem_init(&sem, 0, 0U); /* submit the codelet */ submit_job(j); /* wait for its completion */ sem_wait(&sem); sem_destroy(&sem); /* make sure data are in memory again */ starpu_unpartition_data(&A_state, 0); starpu_unpartition_data(&B_state, 0); starpu_unpartition_data(&C_state, 0); //starpu_delete_data(&A_state); //starpu_delete_data(&B_state); //starpu_delete_data(&C_state); // fprintf(stderr, "SGEMM done\n"); } /* * * STRSM * */ struct strsm_args { char side; char uplo; char transa; char diag; float alpha; int m,n; }; // //void _core_strsm(starpu_data_interface_t *descr, void *arg) //{ // float *A, *B; // uint32_t nxA, nyA, ldA; // uint32_t nxB, nyB, ldB; // // A = (float *)descr[0].blas.ptr; // nxA = descr[0].blas.nx; // nyA = descr[0].blas.ny; // ldA = descr[0].blas.ld; // // B = (float *)descr[1].blas.ptr; // nxB = descr[1].blas.nx; // nyB = descr[1].blas.ny; // ldB = descr[1].blas.ld; // // struct strsm_args *args = arg; // // fprintf(stderr, "CORE STRSM nxA %d nyA %d nxB %d nyB %d lda %d ldb %d\n", nxA, nyA, nxB, nyB, ldA, ldB); // // SOPALIN_TRSM("R","L","T","U",dimb,dima,fun,ga,stride,gb,stride); // //} /* * * * */ void CUBLAS_SGEMM (const char *transa, const char *transb, const int m, const int n, const int k, const float alpha, const float *A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc) { int ka, kb; float *devPtrA, *devPtrB, *devPtrC; // printf("CUBLAS SGEMM : m %d n %d k %d lda %d ldb %d ldc %d\n", m, n, k, lda, ldb, ldc); /* A - REAL array of DIMENSION ( LDA, ka ), where ka is * k when TRANSA = 'N' or 'n', and is m otherwise. * Before entry with TRANSA = 'N' or 'n', the leading m by k * part of the array A must contain the matrix A, otherwise * the leading k by m part of the array A must contain the * matrix A. */ ka = (toupper(transa[0]) == 'N') ? k : m; cublasAlloc (lda * ka, sizeof(devPtrA[0]), (void**)&devPtrA); if (toupper(transa[0]) == 'N') { cublasSetMatrix (STARPU_MIN(m,lda), k, sizeof(A[0]), A, lda, devPtrA, lda); } else { cublasSetMatrix (STARPU_MIN(k,lda), m, sizeof(A[0]), A, lda, devPtrA, lda); } /* B - REAL array of DIMENSION ( LDB, kb ), where kb is * n when TRANSB = 'N' or 'n', and is k otherwise. * Before entry with TRANSB = 'N' or 'n', the leading k by n * part of the array B must contain the matrix B, otherwise * the leading n by k part of the array B must contain the * matrix B. */ kb = (toupper(transb[0]) == 'N') ? n : k; cublasAlloc (ldb * kb, sizeof(devPtrB[0]), (void**)&devPtrB); if (toupper(transb[0]) == 'N') { cublasSetMatrix (STARPU_MIN(k,ldb), n, sizeof(B[0]), B, ldb, devPtrB, ldb); } else { cublasSetMatrix (STARPU_MIN(n,ldb), k, sizeof(B[0]), B, ldb, devPtrB, ldb); } /* C - REAL array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n matrix */ cublasAlloc ((ldc) * (n), sizeof(devPtrC[0]), (void**)&devPtrC); cublasSetMatrix (STARPU_MIN(m,ldc), n, sizeof(C[0]), C, ldc, devPtrC, ldc); cublasSgemm (transa[0], transb[0], m, n, k, alpha, devPtrA, lda, devPtrB, ldb, beta, devPtrC, ldc); cublasGetMatrix (STARPU_MIN(m,ldc), n, sizeof(C[0]), devPtrC, ldc, C, ldc); cublasFree (devPtrA); cublasFree (devPtrB); cublasFree (devPtrC); }