123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753 |
- /*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
- #include <semaphore.h>
- #include <core/jobs.h>
- #include <core/workers.h>
- #include <core/dependencies/tags.h>
- #include <string.h>
- #include <math.h>
- #include <sys/types.h>
- #include <ctype.h>
- #include <pthread.h>
- #include <signal.h>
- #include <cblas.h>
- #include <datawizard/datawizard.h>
- #include <task-models/blas_model.h>
- #include <common/fxt.h>
- #include <starpu.h>
- #ifdef USE_CUDA
- #include <cuda.h>
- #endif
- #define BLOCK 75
- #include "starpu-blas-wrapper.h"
- extern struct data_interface_ops_t interface_blas_ops;
- static int core_sgemm = 0;
- static int cublas_sgemm = 0;
- static int core_strsm = 0;
- static int cublas_strsm = 0;
- static int inited = 0;
- void STARPU_INIT(void)
- {
- if (!inited) {
- inited = 1;
- starpu_init(NULL);
- }
- }
- void STARPU_TERMINATE(void)
- {
- starpu_shutdown();
- fprintf(stderr, "sgemm : core %d cublas %d\n", core_sgemm, cublas_sgemm);
- fprintf(stderr, "strsm : core %d cublas %d\n", core_strsm, cublas_strsm);
- }
- /*
- *
- * Specific to PaStiX !
- *
- */
- /*
- *
- * We "need" some custom filters
- *
- * VECTOR
- * (n)
- * / | \
- * VECTOR BLAS VECTOR
- * (n1) (n2)
- *
- * if n1 = 0 :
- * VECTOR
- * / \
- * BLAS VECTOR
- */
- struct divide_vector_in_blas_filter_args {
- uint32_t n1, n2; /* (total size of the first portion (vector length) n < root's n ! */
- uint32_t stride; /* stride of the first portion (need to be a multiple of n */
- };
- void divide_vector_in_blas_filter(starpu_filter *f, starpu_data_handle root_data)
- {
- starpu_vector_interface_t *vector_root = &root_data->interface[0].vector;
- uint32_t nx = vector_root->nx;
- size_t elemsize = vector_root->elemsize;
- struct divide_vector_in_blas_filter_args *args = f->filter_arg_ptr;
- unsigned n1 = args->n1;
- unsigned n2 = args->n2;
- unsigned stride = args->stride;
- STARPU_ASSERT(n1 + n2 < nx);
- unsigned n3 = nx - n1 - n2;
-
- /* first allocate the children starpu_data_handle */
- starpu_data_create_children(root_data, (n1==0)?2:3, root_data->ops);
- STARPU_ASSERT((n2 % args->stride) == 0);
- unsigned child = 0;
- unsigned node;
-
- if (n1 > 0)
- {
- for (node = 0; node < STARPU_MAXNODES; node++)
- {
- starpu_vector_interface_t *local = &root_data->children[child].interface[node].vector;
-
- local->nx = n1;
- local->elemsize = elemsize;
-
- if (root_data->per_node[node].allocated) {
- local->ptr = root_data->interface[node].vector.ptr;
- }
-
- }
- child++;
- }
-
- for (node = 0; node < STARPU_MAXNODES; node++)
- {
- starpu_blas_interface_t *local = &root_data->children[child].interface[node].blas;
- local->nx = stride;
- local->ny = n2/stride;
- local->ld = stride;
- local->elemsize = elemsize;
- if (root_data->per_node[node].allocated) {
- local->ptr = root_data->interface[node].vector.ptr + n1*elemsize;
- }
- struct starpu_data_state_t *state = &root_data->children[child];
- state->ops = &interface_blas_ops;
- }
- child++;
- for (node = 0; node < STARPU_MAXNODES; node++)
- {
- starpu_vector_interface_t *local = &root_data->children[child].interface[node].vector;
- local->nx = n3;
- local->elemsize = elemsize;
- if (root_data->per_node[node].allocated) {
- local->ptr = root_data->interface[node].vector.ptr + (n1+n2)*elemsize;
- }
- }
- }
- static data_state *cblktab;
- static void _cublas_cblk_strsm_callback(void *sem)
- {
- sem_t *semptr = sem;
- sem_post(semptr);
- }
- void STARPU_MONITOR_DATA(unsigned ncols)
- {
- cblktab = calloc(ncols, sizeof(data_state));
- }
- void STARPU_MONITOR_CBLK(unsigned col, float *data, unsigned stride, unsigned width)
- {
- //void starpu_register_blas_data(struct starpu_data_state_t *state, uint32_t home_node,
- // uintptr_t ptr, uint32_t ld, uint32_t nx,
- // uint32_t ny, size_t elemsize);
- //fprintf(stderr, "col %d data %p stride %d width %d\n", col, data, stride, width);
- starpu_register_blas_data(&cblktab[col], 0 /* home */,
- (uintptr_t) data, stride, stride, width, sizeof(float));
-
- }
- static data_state work_block_1;
- static data_state work_block_2;
- void allocate_maxbloktab_on_cublas(void *descr[] __attribute__((unused)), void *arg __attribute__((unused)))
- {
- starpu_request_data_allocation(&work_block_1, 1);
- starpu_request_data_allocation(&work_block_2, 1);
- starpu_filter f1, f2;
- struct divide_vector_in_blas_filter_args args1, args2;
- f1.filter_func = divide_vector_in_blas_filter;
- args1.n1 = 1; /* XXX random ... */
- args1.n2 = 2;
- args1.stride = 1;
- f1.filter_arg_ptr = &args1;
- starpu_partition_data(&work_block_1, &f1);
- f2.filter_func = divide_vector_in_blas_filter;
- args2.n1 = 0;
- args2.n2 = 2;
- args2.stride = 1;
- f2.filter_arg_ptr = &args2;
- starpu_partition_data(&work_block_2, &f2);
- }
- void STARPU_DECLARE_WORK_BLOCKS(float *maxbloktab1, float *maxbloktab2, unsigned solv_coefmax)
- {
- starpu_register_vector_data(&work_block_1, 0 /* home */, (uintptr_t)maxbloktab1, solv_coefmax, sizeof(float));
- starpu_register_vector_data(&work_block_2, 0 /* home */, (uintptr_t)maxbloktab2, solv_coefmax, sizeof(float));
- starpu_codelet cl;
- job_t j;
- sem_t sem;
- /* initialize codelet */
- cl.where = CUDA;
- cl.cuda_func = allocate_maxbloktab_on_cublas;
-
- j = _starpu_job_create();
- j->cb = _cublas_cblk_strsm_callback;
- j->argcb = &sem;
- j->cl = &cl;
- j->cl_arg = NULL;
- j->nbuffers = 0;
- j->cl->model = NULL;
- sem_init(&sem, 0, 0U);
-
- /* submit the codelet */
- submit_job(j);
- /* wait for its completion */
- sem_wait(&sem);
- sem_destroy(&sem);
- }
- void _core_cblk_strsm(void *descr[], void *arg __attribute__((unused)))
- {
- uint32_t nx, ny, ld;
- nx = GET_BLAS_NX(descr[0]);
- ny = GET_BLAS_NY(descr[0]);
- ld = GET_BLAS_LD(descr[0]);
- float *diag_cblkdata, *extra_cblkdata;
- diag_cblkdata = (float *)GET_BLAS_PTR(descr[0]);
- extra_cblkdata = diag_cblkdata + ny;
- unsigned m = nx - ny;
- unsigned n = ny;
- // SOPALIN_TRSM("R","L","T","U",dimb,dima,fun,ga,stride,gb,stride);
- core_strsm++;
- cblas_strsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasUnit, m, n, 1.0f,
- diag_cblkdata, ld, extra_cblkdata, ld);
- }
- void _cublas_cblk_strsm(void *descr[], void *arg __attribute__((unused)))
- {
- uint32_t nx, ny, ld;
- nx = GET_BLAS_NX(descr[0]);
- ny = GET_BLAS_NY(descr[0]);
- ld = GET_BLAS_LD(descr[0]);
- float *diag_cblkdata, *extra_cblkdata;
- diag_cblkdata = (float *)GET_BLAS_PTR(descr[0]);
- extra_cblkdata = diag_cblkdata + ny;
- unsigned m = nx - ny;
- unsigned n = ny;
-
- cublas_strsm++;
- cublasStrsm ('R', 'L', 'T', 'U', m, n, 1.0,
- diag_cblkdata, ld,
- extra_cblkdata, ld);
- cublasStatus st = cublasGetError();
- if (st) fprintf(stderr, "ERROR %d\n", st);
- STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS);
- }
- static struct starpu_perfmodel_t starpu_cblk_strsm = {
- .per_arch = {
- [STARPU_CORE_DEFAULT] = { .cost_model = starpu_cblk_strsm_core_cost },
- [STARPU_CUDA_DEFAULT] = { .cost_model = starpu_cblk_strsm_cuda_cost }
- },
- // .type = REGRESSION_BASED,
- .type = PER_ARCH,
- .symbol = "starpu_cblk_strsm"
- };
- void STARPU_CBLK_STRSM(unsigned col)
- {
- /* perform a strsm on the block column */
- starpu_codelet cl;
- job_t j;
- sem_t sem;
- /* initialize codelet */
- cl.where = CORE|CUDA;
- cl.core_func = _core_cblk_strsm;
- cl.cuda_func = _cublas_cblk_strsm;
-
- j = _starpu_job_create();
- // j->where = (starpu_get_blas_nx(&cblktab[col]) > BLOCK && starpu_get_blas_ny(&cblktab[col]) > BLOCK)? CUBLAS:CORE;
- j->cb = _cublas_cblk_strsm_callback;
- j->argcb = &sem;
- j->cl = &cl;
- j->cl_arg = NULL;
- j->nbuffers = 1;
- /* we could be a little more precise actually */
- j->buffers[0].handle = &cblktab[col];
- j->buffers[0].mode = STARPU_RW;
-
- j->cl->model = &starpu_cblk_strsm;
- sem_init(&sem, 0, 0U);
-
- /* submit the codelet */
- submit_job(j);
- /* wait for its completion */
- sem_wait(&sem);
- sem_destroy(&sem);
- }
- struct starpu_compute_contrib_compact_args {
- unsigned stride;
- int dimi;
- int dimj;
- int dima;
- };
- void _core_compute_contrib_compact(void *descr[], void *arg)
- {
- struct starpu_compute_contrib_compact_args *args = arg;
- float *gaik = (float *)GET_BLAS_PTR(descr[0]) + args->dima;
- float *gb = (float *)GET_BLAS_PTR(descr[1]);
- unsigned strideb = (unsigned)GET_BLAS_LD(descr[1]);
- float *gc = (float *)GET_BLAS_PTR(descr[2]);
- unsigned stridec = (unsigned)GET_BLAS_LD(descr[2]);
- core_sgemm++;
- cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans,
- args->dimi, args->dimj, args->dima,
- 1.0f, gaik, args->stride,
- gb, strideb,
- 0.0 , gc, stridec);
- }
- void _cublas_compute_contrib_compact(void *descr[], void *arg)
- {
- struct starpu_compute_contrib_compact_args *args = arg;
- float *gaik = (float *)GET_BLAS_PTR(descr[0]) + args->dima;
- float *gb = (float *)GET_BLAS_PTR(descr[1]);
- unsigned strideb = (unsigned)GET_BLAS_LD(descr[1]);
- float *gc = (float *)GET_BLAS_PTR(descr[2]);
- unsigned stridec = (unsigned)GET_BLAS_LD(descr[2]);
-
- cublas_sgemm++;
- cublasSgemm('N','T', args->dimi, args->dimj, args->dima,
- 1.0, gaik, args->stride,
- gb, strideb,
- 0.0, gc, stridec);
- cublasStatus st = cublasGetError();
- if (st) fprintf(stderr, "ERROR %d\n", st);
- STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS);
- }
- static struct starpu_perfmodel_t starpu_compute_contrib_compact = {
- .per_arch = {
- [STARPU_CORE_DEFAULT] = { .cost_model = starpu_compute_contrib_compact_core_cost },
- [STARPU_CUDA_DEFAULT] = { .cost_model = starpu_compute_contrib_compact_cuda_cost }
- },
- // .type = REGRESSION_BASED,
- .type = PER_ARCH,
- .symbol = "starpu_compute_contrib_compact"
- };
- int update_work_blocks(unsigned col, int dimi, int dimj, int dima, int stride)
- {
- /* be paranoid XXX */
- notify_data_modification(get_sub_data(&work_block_1, 1, 0), 0);
- notify_data_modification(get_sub_data(&work_block_1, 1, 1), 0);
- //notify_data_modification(get_sub_data(&work_block_1, 1, 2), 0);
- notify_data_modification(get_sub_data(&work_block_2, 1, 0), 0);
- notify_data_modification(get_sub_data(&work_block_2, 1, 1), 0);
- notify_data_modification(&cblktab[col], 0);
- starpu_unpartition_data(&work_block_1, 0);
- starpu_unpartition_data(&work_block_2, 0);
- starpu_filter f1, f2;
- struct divide_vector_in_blas_filter_args args1, args2;
- f1.filter_func = divide_vector_in_blas_filter;
- args1.n1 = stride - dima - dimi; //STARPU_ASSERT(args1.n1 != 0);
- args1.n2 = (stride - dima)*dima;
- args1.stride = (stride - dima);
- f1.filter_arg_ptr = &args1;
- starpu_partition_data(&work_block_1, &f1);
- f2.filter_func = divide_vector_in_blas_filter;
- args2.n1 = 0;
- args2.n2 = dimi*dimj;
- args2.stride = dimi;
- f2.filter_arg_ptr = &args2;
- starpu_partition_data(&work_block_2, &f2);
- return (args1.n1!=0)?3:2;
- }
- void STARPU_COMPUTE_CONTRIB_COMPACT(unsigned col, int dimi, int dimj, int dima, int stride)
- {
- // CUBLAS_SGEMM("N","T",dimi,dimj,dima, 1.0,gaik,stride,gb,stride-dima,
- // 0.0 ,gc,dimi);
-
- struct starpu_compute_contrib_compact_args args;
- args.stride = stride;
- args.dimi = dimi;
- args.dimj = dimj;
- args.dima = dima;
- starpu_codelet cl;
- job_t j;
- sem_t sem;
- /* initialize codelet */
- cl.where = CUDA|CORE;
- cl.core_func = _core_compute_contrib_compact;
- cl.cuda_func = _cublas_compute_contrib_compact;
-
- j = _starpu_job_create();
- j->cb = _cublas_cblk_strsm_callback;
- j->argcb = &sem;
- j->cl = &cl;
- j->cl_arg = &args;
- j->cl->model = &starpu_compute_contrib_compact;
- int ret;
- ret = update_work_blocks(col, dimi, dimj, dima, stride);
- j->nbuffers = 3;
- /* we could be a little more precise actually */
- j->buffers[0].handle = &cblktab[col]; // gaik
- j->buffers[0].mode = STARPU_R;
- j->buffers[1].handle = get_sub_data(&work_block_1, 1, (ret==2)?0:1);
- j->buffers[1].mode = STARPU_R;
- j->buffers[2].handle = get_sub_data(&work_block_2, 1, 0);;
- j->buffers[2].mode = STARPU_RW; // XXX STARPU_W
-
- sem_init(&sem, 0, 0U);
-
- /* submit the codelet */
- submit_job(j);
- /* wait for its completion */
- sem_wait(&sem);
- sem_destroy(&sem);
- }
- /*
- *
- * SGEMM
- *
- */
- struct sgemm_args {
- char transa;
- char transb;
- int m, n, k;
- float alpha;
- float beta;
- };
- void _cublas_sgemm(void *descr[], void *arg)
- {
- float *A, *B, *C;
- uint32_t nxA, nyA, ldA;
- uint32_t nxB, nyB, ldB;
- uint32_t nxC, nyC, ldC;
- A = (float *)GET_BLAS_PTR(descr[0]);
- nxA = GET_BLAS_NX(descr[0]);
- nyA = GET_BLAS_NY(descr[0]);
- ldA = GET_BLAS_LD(descr[0]);
- B = (float *)GET_BLAS_PTR(descr[1]);
- nxB = GET_BLAS_NX(descr[1]);
- nyB = GET_BLAS_NY(descr[1]);
- ldB = GET_BLAS_LD(descr[1]);
- C = (float *)GET_BLAS_PTR(descr[2]);
- nxC = GET_BLAS_NX(descr[2]);
- nyC = GET_BLAS_NY(descr[2]);
- ldC = GET_BLAS_LD(descr[2]);
- struct sgemm_args *args = arg;
- // fprintf(stderr, "CUBLAS SGEMM nxA %d nyA %d nxB %d nyB %d nxC %d nyC %d lda %d ldb %d ldc %d\n", nxA, nyA, nxB, nyB, nxC, nyC, ldA, ldB, ldC);
- // STARPU_ASSERT(nxA == nxC);
- // STARPU_ASSERT(nyA == nxB);
- // STARPU_ASSERT(nyB == nyC);
- //
- // STARPU_ASSERT(nxA <= ldA);
- // STARPU_ASSERT(nxB <= ldB);
- // STARPU_ASSERT(nxC <= ldC);
- cublasSgemm (args->transa, args->transb, args->m, args->n, args->k, args->alpha, A, (int)ldA,
- B, (int)ldB, args->beta, C, (int)ldC);
- cublasStatus st = cublasGetError();
- if (st) fprintf(stderr, "ERROR %d\n", st);
- STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS);
- }
- static void _cublas_sgemm_callback(void *sem)
- {
- sem_t *semptr = sem;
- sem_post(semptr);
- }
- void STARPU_SGEMM (const char *transa, const char *transb, const int m,
- const int n, const int k, const float alpha,
- const float *A, const int lda, const float *B,
- const int ldb, const float beta, float *C, const int ldc)
- {
- struct sgemm_args args;
- args.transa = *transa;
- args.transb = *transb;
- args.alpha = alpha;
- args.beta = beta;
- args.m = m;
- args.n = n;
- args.k = k;
- data_state A_state;
- data_state B_state;
- data_state C_state;
- starpu_codelet cl;
- job_t j;
- sem_t sem;
- // fprintf(stderr, "STARPU - SGEMM - TRANSA %c TRANSB %c m %d n %d k %d lda %d ldb %d ldc %d \n", *transa, *transb, m, n, k, lda, ldb, ldc);
- if (toupper(*transa) == 'N')
- {
- starpu_register_blas_data(&A_state, 0, (uintptr_t)A, lda, m, k, sizeof(float));
- }
- else
- {
- starpu_register_blas_data(&A_state, 0, (uintptr_t)A, lda, k, m, sizeof(float));
- }
- if (toupper(*transb) == 'N')
- {
- starpu_register_blas_data(&B_state, 0, (uintptr_t)B, ldb, k, n, sizeof(float));
- }
- else
- {
- starpu_register_blas_data(&B_state, 0, (uintptr_t)B, ldb, n, k, sizeof(float));
- }
- starpu_register_blas_data(&C_state, 0, (uintptr_t)C, ldc, m, n, sizeof(float));
- /* initialize codelet */
- cl.where = CUDA;
- //cl.core_func = _core_strsm;
- cl.cuda_func = _cublas_sgemm;
-
- j = _starpu_job_create();
- j->cb = _cublas_sgemm_callback;
- j->argcb = &sem;
- j->cl = &cl;
- j->cl_arg = &args;
- j->nbuffers = 3;
- j->buffers[0].handle = &A_state;
- j->buffers[0].mode = STARPU_R;
- j->buffers[1].handle = &B_state;
- j->buffers[1].mode = STARPU_R;
- j->buffers[2].handle = &C_state;
- j->buffers[2].mode = STARPU_RW;
-
- j->cl->model = NULL;
- sem_init(&sem, 0, 0U);
-
- /* submit the codelet */
- submit_job(j);
- /* wait for its completion */
- sem_wait(&sem);
- sem_destroy(&sem);
- /* make sure data are in memory again */
- starpu_unpartition_data(&A_state, 0);
- starpu_unpartition_data(&B_state, 0);
- starpu_unpartition_data(&C_state, 0);
- //starpu_delete_data(&A_state);
- //starpu_delete_data(&B_state);
- //starpu_delete_data(&C_state);
-
- // fprintf(stderr, "SGEMM done\n");
- }
- /*
- *
- * STRSM
- *
- */
- struct strsm_args {
- char side;
- char uplo;
- char transa;
- char diag;
- float alpha;
- int m,n;
- };
- //
- //void _core_strsm(void *descr[], void *arg)
- //{
- // float *A, *B;
- // uint32_t nxA, nyA, ldA;
- // uint32_t nxB, nyB, ldB;
- //
- // A = (float *)GET_BLAS_PTR(descr[0]);
- // nxA = GET_BLAS_NX(descr[0]);
- // nyA = GET_BLAS_NY(descr[0]);
- // ldA = GET_BLAS_LD(descr[0]);
- //
- // B = (float *)GET_BLAS_PTR(descr[1]);
- // nxB = GET_BLAS_NX(descr[1]);
- // nyB = GET_BLAS_NY(descr[1]);
- // ldB = GET_BLAS_LD(descr[1]);
- //
- // struct strsm_args *args = arg;
- //
- // fprintf(stderr, "CORE STRSM nxA %d nyA %d nxB %d nyB %d lda %d ldb %d\n", nxA, nyA, nxB, nyB, ldA, ldB);
- //
- // SOPALIN_TRSM("R","L","T","U",dimb,dima,fun,ga,stride,gb,stride);
- //
- //}
- /*
- *
- *
- *
- */
- void CUBLAS_SGEMM (const char *transa, const char *transb, const int m,
- const int n, const int k, const float alpha,
- const float *A, const int lda, const float *B,
- const int ldb, const float beta, float *C, const int ldc)
- {
- int ka, kb;
- float *devPtrA, *devPtrB, *devPtrC;
- // printf("CUBLAS SGEMM : m %d n %d k %d lda %d ldb %d ldc %d\n", m, n, k, lda, ldb, ldc);
- /* A - REAL array of DIMENSION ( LDA, ka ), where ka is
- * k when TRANSA = 'N' or 'n', and is m otherwise.
- * Before entry with TRANSA = 'N' or 'n', the leading m by k
- * part of the array A must contain the matrix A, otherwise
- * the leading k by m part of the array A must contain the
- * matrix A.
- */
- ka = (toupper(transa[0]) == 'N') ? k : m;
- cublasAlloc (lda * ka, sizeof(devPtrA[0]), (void**)&devPtrA);
- if (toupper(transa[0]) == 'N') {
- cublasSetMatrix (STARPU_MIN(m,lda), k, sizeof(A[0]), A, lda, devPtrA,
- lda);
- } else {
- cublasSetMatrix (STARPU_MIN(k,lda), m, sizeof(A[0]), A, lda, devPtrA,
- lda);
- }
- /* B - REAL array of DIMENSION ( LDB, kb ), where kb is
- * n when TRANSB = 'N' or 'n', and is k otherwise.
- * Before entry with TRANSB = 'N' or 'n', the leading k by n
- * part of the array B must contain the matrix B, otherwise
- * the leading n by k part of the array B must contain the
- * matrix B.
- */
- kb = (toupper(transb[0]) == 'N') ? n : k;
- cublasAlloc (ldb * kb, sizeof(devPtrB[0]), (void**)&devPtrB);
- if (toupper(transb[0]) == 'N') {
- cublasSetMatrix (STARPU_MIN(k,ldb), n, sizeof(B[0]), B, ldb, devPtrB,
- ldb);
- } else {
- cublasSetMatrix (STARPU_MIN(n,ldb), k, sizeof(B[0]), B, ldb, devPtrB,
- ldb);
- }
-
- /* C - REAL array of DIMENSION ( LDC, n ).
- * Before entry, the leading m by n part of the array C must
- * contain the matrix C, except when beta is zero, in which
- * case C need not be set on entry.
- * On exit, the array C is overwritten by the m by n matrix
- */
- cublasAlloc ((ldc) * (n), sizeof(devPtrC[0]), (void**)&devPtrC);
- cublasSetMatrix (STARPU_MIN(m,ldc), n, sizeof(C[0]), C, ldc, devPtrC, ldc);
- cublasSgemm (transa[0], transb[0], m, n, k, alpha, devPtrA, lda,
- devPtrB, ldb, beta, devPtrC, ldc);
- cublasGetMatrix (STARPU_MIN(m,ldc), n, sizeof(C[0]), devPtrC, ldc, C, ldc);
- cublasFree (devPtrA);
- cublasFree (devPtrB);
- cublasFree (devPtrC);
- }
|