|
@@ -0,0 +1,266 @@
|
|
|
+/*
|
|
|
+ * StarPU
|
|
|
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
|
|
|
+ *
|
|
|
+ * This program is free software; you can redistribute it and/or modify
|
|
|
+ * it under the terms of the GNU Lesser General Public License as published by
|
|
|
+ * the Free Software Foundation; either version 2.1 of the License, or (at
|
|
|
+ * your option) any later version.
|
|
|
+ *
|
|
|
+ * This program is distributed in the hope that it will be useful, but
|
|
|
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
+ *
|
|
|
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
|
|
|
+ */
|
|
|
+
|
|
|
+#include "dw_mult.h"
|
|
|
+
|
|
|
+
|
|
|
+float *A, *B, *C;
|
|
|
+starpu_data_handle A_handle, B_handle, C_handle;
|
|
|
+
|
|
|
+pthread_mutex_t mutex;
|
|
|
+pthread_cond_t cond;
|
|
|
+
|
|
|
+/*
|
|
|
+ * That program should compute C = A * B
|
|
|
+ *
|
|
|
+ * A of size (z,y)
|
|
|
+ * B of size (x,z)
|
|
|
+ * C of size (x,y)
|
|
|
+
|
|
|
+ |---------------|
|
|
|
+ z | B |
|
|
|
+ |---------------|
|
|
|
+ z x
|
|
|
+ |----| |---------------|
|
|
|
+ | | | |
|
|
|
+ | | | |
|
|
|
+ | A | y | C |
|
|
|
+ | | | |
|
|
|
+ | | | |
|
|
|
+ |----| |---------------|
|
|
|
+
|
|
|
+ */
|
|
|
+
|
|
|
+static void terminate(void)
|
|
|
+{
|
|
|
+ starpu_delete_data(C_handle);
|
|
|
+
|
|
|
+ gettimeofday(&end, NULL);
|
|
|
+
|
|
|
+ double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
+
|
|
|
+ uint64_t total_flop = niter*BLAS3_FLOP(ydim, xdim, zdim);
|
|
|
+ uint64_t total_ls = niter*(ls_cublas + ls_atlas);
|
|
|
+
|
|
|
+ fprintf(stderr, "Computation took (ms):\n");
|
|
|
+ printf("%2.2f\n", timing/1000);
|
|
|
+ fprintf(stderr, " GFlop : total (%2.2f) cublas (%2.2f) atlas (%2.2f)\n", (double)total_flop/1000000000.0f, (double)flop_cublas/1000000000.0f, (double)flop_atlas/1000000000.0f);
|
|
|
+ fprintf(stderr, " GFlop/s : %2.2f\n", (double)total_flop / (double)timing/1000);
|
|
|
+ fprintf(stderr, " GB : total (%2.2f) cublas (%2.2f) atlas (%2.2f)\n", (double)total_ls/1000000000.0f, (double)ls_cublas/1000000000.0f, (double)ls_atlas/1000000000.0f);
|
|
|
+ fprintf(stderr, " GB/s : %2.2f\n", (double)total_ls / (double)timing/1000);
|
|
|
+
|
|
|
+#ifdef CHECK_OUTPUT
|
|
|
+ /* check results */
|
|
|
+ /* compute C = C - niter * AB */
|
|
|
+
|
|
|
+ SGEMM("N", "N", ydim, xdim, zdim, -1.0f*niter, A, ydim, B, zdim, 1.0f, C, ydim);
|
|
|
+
|
|
|
+ /* make sure C = 0 */
|
|
|
+ float err;
|
|
|
+ err = SASUM(xdim*ydim, C, 1);
|
|
|
+
|
|
|
+ if (err < xdim*ydim*0.001) {
|
|
|
+ fprintf(stderr, "Results are OK\n");
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ fprintf(stderr, "There were errors ... err = %f\n", err);
|
|
|
+ }
|
|
|
+#endif // CHECK_OUTPUT
|
|
|
+
|
|
|
+ pthread_mutex_lock(&mutex);
|
|
|
+ pthread_cond_signal(&cond);
|
|
|
+ pthread_mutex_unlock(&mutex);
|
|
|
+}
|
|
|
+
|
|
|
+#define COMMON_CODE \
|
|
|
+ uint32_t nxC, nyC, nyA; \
|
|
|
+ uint32_t ldA, ldB, ldC; \
|
|
|
+ \
|
|
|
+ float *subA; \
|
|
|
+ float *subB; \
|
|
|
+ float *subC; \
|
|
|
+ \
|
|
|
+ subA = (float *)descr[0].blas.ptr; \
|
|
|
+ subB = (float *)descr[1].blas.ptr; \
|
|
|
+ subC = (float *)descr[2].blas.ptr; \
|
|
|
+ \
|
|
|
+ nxC = descr[2].blas.nx; \
|
|
|
+ nyC = descr[2].blas.ny; \
|
|
|
+ nyA = descr[0].blas.ny; \
|
|
|
+ \
|
|
|
+ ldA = descr[0].blas.ld; \
|
|
|
+ ldB = descr[1].blas.ld; \
|
|
|
+ ldC = descr[2].blas.ld;
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+#ifdef USE_CUDA
|
|
|
+void cublas_mult(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
|
|
|
+{
|
|
|
+ COMMON_CODE
|
|
|
+
|
|
|
+ cublasSgemm('n', 'n', nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB,
|
|
|
+ 0.0f, subC, ldC);
|
|
|
+ cublasStatus st;
|
|
|
+ st = cublasGetError();
|
|
|
+ if (st != CUBLAS_STATUS_SUCCESS)
|
|
|
+ STARPU_ASSERT(0);
|
|
|
+
|
|
|
+ uint64_t flopcnt = BLAS3_FLOP(nyC, nxC, nyA);
|
|
|
+
|
|
|
+ flop_cublas += flopcnt;
|
|
|
+ ls_cublas += BLAS3_LS(nyC, nxC, nyA);
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
+void core_mult(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
|
|
|
+{
|
|
|
+ COMMON_CODE
|
|
|
+
|
|
|
+ SGEMM("N", "N", nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 0.0f, subC, ldC);
|
|
|
+
|
|
|
+ flop_atlas += BLAS3_FLOP(nxC, nyC, nyA);
|
|
|
+ ls_atlas += BLAS3_LS(nxC, nyC, nyA);
|
|
|
+}
|
|
|
+
|
|
|
+static void init_problem_data(void)
|
|
|
+{
|
|
|
+ unsigned i,j;
|
|
|
+
|
|
|
+#ifdef USE_CUDA
|
|
|
+ if (pin) {
|
|
|
+ starpu_malloc_pinned_if_possible(&A, zdim*ydim*sizeof(float));
|
|
|
+ starpu_malloc_pinned_if_possible(&B, xdim*zdim*sizeof(float));
|
|
|
+ starpu_malloc_pinned_if_possible(&C, xdim*ydim*sizeof(float));
|
|
|
+ } else
|
|
|
+#endif
|
|
|
+ {
|
|
|
+#ifdef HAVE_POSIX_MEMALIGN
|
|
|
+ posix_memalign((void **)&A, 4096, zdim*ydim*sizeof(float));
|
|
|
+ posix_memalign((void **)&B, 4096, xdim*zdim*sizeof(float));
|
|
|
+ posix_memalign((void **)&C, 4096, xdim*ydim*sizeof(float));
|
|
|
+#else
|
|
|
+ A = malloc(zdim*ydim*sizeof(float));
|
|
|
+ B = malloc(xdim*zdim*sizeof(float));
|
|
|
+ C = malloc(xdim*ydim*sizeof(float));
|
|
|
+#endif
|
|
|
+ }
|
|
|
+
|
|
|
+ /* fill the A and B matrices */
|
|
|
+ if (norandom) {
|
|
|
+ for (j=0; j < ydim; j++) {
|
|
|
+ for (i=0; i < zdim; i++) {
|
|
|
+ A[j+i*ydim] = (float)(i);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ for (j=0; j < zdim; j++) {
|
|
|
+ for (i=0; i < xdim; i++) {
|
|
|
+ B[j+i*zdim] = (float)(j);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ else {
|
|
|
+#ifdef NORANDOM
|
|
|
+ srand(2009);
|
|
|
+ STARPU_ASSERT(0);
|
|
|
+#endif
|
|
|
+ for (j=0; j < ydim; j++) {
|
|
|
+ for (i=0; i < zdim; i++) {
|
|
|
+ A[j+i*ydim] = (float)(drand48());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ for (j=0; j < zdim; j++) {
|
|
|
+ for (i=0; i < xdim; i++) {
|
|
|
+ B[j+i*zdim] = (float)(drand48());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ for (j=0; j < ydim; j++) {
|
|
|
+ for (i=0; i < xdim; i++) {
|
|
|
+ C[j+i*ydim] = (float)(0);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ display_memory_consumption();
|
|
|
+
|
|
|
+ starpu_register_blas_data(&A_handle, 0, (uintptr_t)A,
|
|
|
+ ydim, ydim, zdim, sizeof(float));
|
|
|
+ starpu_register_blas_data(&B_handle, 0, (uintptr_t)B,
|
|
|
+ zdim, zdim, xdim, sizeof(float));
|
|
|
+ starpu_register_blas_data(&C_handle, 0, (uintptr_t)C,
|
|
|
+ ydim, ydim, xdim, sizeof(float));
|
|
|
+
|
|
|
+ gettimeofday(&start, NULL);
|
|
|
+}
|
|
|
+
|
|
|
+static void launch_codelets(void)
|
|
|
+{
|
|
|
+ srand(time(NULL));
|
|
|
+
|
|
|
+ starpu_codelet cl = {
|
|
|
+ .where = CORE|CUBLAS,
|
|
|
+ .core_func = core_mult,
|
|
|
+#ifdef USE_CUDA
|
|
|
+ .cublas_func = cublas_mult,
|
|
|
+#endif
|
|
|
+ .model = &sgemm_model_common,
|
|
|
+ .nbuffers = 3
|
|
|
+ };
|
|
|
+
|
|
|
+ unsigned iter;
|
|
|
+ for (iter = 0; iter < niter; iter++)
|
|
|
+ {
|
|
|
+ struct starpu_task *task = starpu_task_create();
|
|
|
+
|
|
|
+ task->cl = &cl;
|
|
|
+
|
|
|
+ task->buffers[0].handle = A_handle;
|
|
|
+ task->buffers[0].mode = STARPU_R;
|
|
|
+ task->buffers[1].handle = B_handle;
|
|
|
+ task->buffers[1].mode = STARPU_R;
|
|
|
+ task->buffers[2].handle = C_handle;
|
|
|
+ task->buffers[2].mode = STARPU_RW;
|
|
|
+
|
|
|
+ task->synchronous = 1;
|
|
|
+ starpu_submit_task(task);
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+int main(__attribute__ ((unused)) int argc,
|
|
|
+ __attribute__ ((unused)) char **argv)
|
|
|
+{
|
|
|
+
|
|
|
+ parse_args(argc, argv);
|
|
|
+
|
|
|
+ /* start the runtime */
|
|
|
+ starpu_init(NULL);
|
|
|
+
|
|
|
+ pthread_mutex_init(&mutex, NULL);
|
|
|
+ pthread_cond_init(&cond, NULL);
|
|
|
+
|
|
|
+ init_problem_data();
|
|
|
+
|
|
|
+ launch_codelets();
|
|
|
+
|
|
|
+ terminate();
|
|
|
+
|
|
|
+ starpu_shutdown();
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|