|
@@ -1,6 +1,6 @@
|
|
|
/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
|
*
|
|
|
- * Copyright (C) 2009, 2010 Université de Bordeaux 1
|
|
|
+ * Copyright (C) 2009, 2010, 2011 Université de Bordeaux 1
|
|
|
* Copyright (C) 2010 Mehdi Juhoor <mjuhoor@gmail.com>
|
|
|
* Copyright (C) 2010 Centre National de la Recherche Scientifique
|
|
|
*
|
|
@@ -16,37 +16,79 @@
|
|
|
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
|
|
|
*/
|
|
|
|
|
|
-#include "dw_mult.h"
|
|
|
-
|
|
|
-TYPE *A, *B, *C;
|
|
|
-starpu_data_handle A_handle, B_handle, C_handle;
|
|
|
-
|
|
|
-/*
|
|
|
- * This program computes C = A * B
|
|
|
- *
|
|
|
- * A of size (z,y)
|
|
|
- * B of size (x,z)
|
|
|
- * C of size (x,y)
|
|
|
-
|
|
|
- |---------------|
|
|
|
- z | B |
|
|
|
- |---------------|
|
|
|
- z x
|
|
|
- |----| |---------------|
|
|
|
- | | | |
|
|
|
- | | | |
|
|
|
- | A | y | C |
|
|
|
- | | | |
|
|
|
- | | | |
|
|
|
- |----| |---------------|
|
|
|
+#include <string.h>
|
|
|
+#include <math.h>
|
|
|
+#include <sys/types.h>
|
|
|
+#include <sys/time.h>
|
|
|
+#include <starpu.h>
|
|
|
|
|
|
- */
|
|
|
+#include <common/blas.h>
|
|
|
+
|
|
|
+#ifdef STARPU_USE_CUDA
|
|
|
+#include <cuda.h>
|
|
|
+#include <cublas.h>
|
|
|
+#endif
|
|
|
+
|
|
|
+static unsigned niter = 100;
|
|
|
+static unsigned nslicesx = 4;
|
|
|
+static unsigned nslicesy = 4;
|
|
|
+static unsigned xdim = 256;
|
|
|
+static unsigned ydim = 256;
|
|
|
+static unsigned zdim = 64;
|
|
|
+static unsigned check = 0;
|
|
|
+
|
|
|
+static TYPE *A, *B, *C;
|
|
|
+static starpu_data_handle A_handle, B_handle, C_handle;
|
|
|
+
|
|
|
+static void parse_args(int argc, char **argv)
|
|
|
+{
|
|
|
+ int i;
|
|
|
+ for (i = 1; i < argc; i++) {
|
|
|
+ if (strcmp(argv[i], "-nblocks") == 0) {
|
|
|
+ char *argptr;
|
|
|
+ nslicesx = strtol(argv[++i], &argptr, 10);
|
|
|
+ nslicesy = nslicesx;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (strcmp(argv[i], "-nblocksx") == 0) {
|
|
|
+ char *argptr;
|
|
|
+ nslicesx = strtol(argv[++i], &argptr, 10);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (strcmp(argv[i], "-nblocksy") == 0) {
|
|
|
+ char *argptr;
|
|
|
+ nslicesy = strtol(argv[++i], &argptr, 10);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (strcmp(argv[i], "-x") == 0) {
|
|
|
+ char *argptr;
|
|
|
+ xdim = strtol(argv[++i], &argptr, 10);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (strcmp(argv[i], "-y") == 0) {
|
|
|
+ char *argptr;
|
|
|
+ ydim = strtol(argv[++i], &argptr, 10);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (strcmp(argv[i], "-z") == 0) {
|
|
|
+ char *argptr;
|
|
|
+ zdim = strtol(argv[++i], &argptr, 10);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (strcmp(argv[i], "-iter") == 0) {
|
|
|
+ char *argptr;
|
|
|
+ niter = strtol(argv[++i], &argptr, 10);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (strcmp(argv[i], "-check") == 0) {
|
|
|
+ check = 1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
|
|
|
static void check_output(void)
|
|
|
{
|
|
|
- /* check results */
|
|
|
/* compute C = C - AB */
|
|
|
-
|
|
|
CPU_GEMM("N", "N", ydim, xdim, zdim, (TYPE)-1.0f, A, ydim, B, zdim, (TYPE)1.0f, C, ydim);
|
|
|
|
|
|
/* make sure C = 0 */
|
|
@@ -65,66 +107,24 @@ static void check_output(void)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-void callback_func(void *arg)
|
|
|
-{
|
|
|
- /* do some accounting */
|
|
|
- int id = starpu_worker_get_id();
|
|
|
- flop_per_worker[id] += BLAS3_FLOP(conf.m, conf.n, conf.k);
|
|
|
- ls_per_worker[id] += BLAS3_LS(conf.m, conf.n, conf.k);
|
|
|
-}
|
|
|
-
|
|
|
static void init_problem_data(void)
|
|
|
{
|
|
|
unsigned i,j;
|
|
|
|
|
|
-#ifdef STARPU_USE_CUDA
|
|
|
- if (pin) {
|
|
|
- starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
|
|
|
- starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
|
|
|
- starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(TYPE));
|
|
|
- } else
|
|
|
-#endif
|
|
|
- {
|
|
|
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
|
|
|
- posix_memalign((void **)&A, 4096, zdim*ydim*sizeof(TYPE));
|
|
|
- posix_memalign((void **)&B, 4096, xdim*zdim*sizeof(TYPE));
|
|
|
- posix_memalign((void **)&C, 4096, xdim*ydim*sizeof(TYPE));
|
|
|
-#else
|
|
|
- A = malloc(zdim*ydim*sizeof(TYPE));
|
|
|
- B = malloc(xdim*zdim*sizeof(TYPE));
|
|
|
- C = malloc(xdim*ydim*sizeof(TYPE));
|
|
|
-#endif
|
|
|
- }
|
|
|
+ starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
|
|
|
+ starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
|
|
|
+ starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(TYPE));
|
|
|
|
|
|
/* fill the A and B matrices */
|
|
|
- if (norandom) {
|
|
|
- for (j=0; j < ydim; j++) {
|
|
|
- for (i=0; i < zdim; i++) {
|
|
|
- A[j+i*ydim] = (TYPE)(i);
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- for (j=0; j < zdim; j++) {
|
|
|
- for (i=0; i < xdim; i++) {
|
|
|
- B[j+i*zdim] = (TYPE)(j);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- else {
|
|
|
-#ifdef NORANDOM
|
|
|
- srand(2008);
|
|
|
- STARPU_ABORT();
|
|
|
-#endif
|
|
|
- for (j=0; j < ydim; j++) {
|
|
|
- for (i=0; i < zdim; i++) {
|
|
|
- A[j+i*ydim] = (TYPE)(starpu_drand48());
|
|
|
- }
|
|
|
+ for (j=0; j < ydim; j++) {
|
|
|
+ for (i=0; i < zdim; i++) {
|
|
|
+ A[j+i*ydim] = (TYPE)(starpu_drand48());
|
|
|
}
|
|
|
-
|
|
|
- for (j=0; j < zdim; j++) {
|
|
|
- for (i=0; i < xdim; i++) {
|
|
|
- B[j+i*zdim] = (TYPE)(starpu_drand48());
|
|
|
- }
|
|
|
+ }
|
|
|
+
|
|
|
+ for (j=0; j < zdim; j++) {
|
|
|
+ for (i=0; i < xdim; i++) {
|
|
|
+ B[j+i*zdim] = (TYPE)(starpu_drand48());
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -133,8 +133,6 @@ static void init_problem_data(void)
|
|
|
C[j+i*ydim] = (TYPE)(0);
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
- display_memory_consumption();
|
|
|
}
|
|
|
|
|
|
static void partition_mult_data(void)
|
|
@@ -148,21 +146,15 @@ static void partition_mult_data(void)
|
|
|
|
|
|
starpu_data_set_wt_mask(C_handle, 1<<0);
|
|
|
|
|
|
- conf.k = zdim;
|
|
|
- conf.m = ydim/nslicesy;
|
|
|
- conf.n = xdim/nslicesx;
|
|
|
-
|
|
|
struct starpu_data_filter f;
|
|
|
+ memset(&f, 0, sizeof(f));
|
|
|
f.filter_func = starpu_vertical_block_filter_func;
|
|
|
f.nchildren = nslicesx;
|
|
|
- f.get_nchildren = NULL;
|
|
|
- f.get_child_ops = NULL;
|
|
|
|
|
|
struct starpu_data_filter f2;
|
|
|
+ memset(&f2, 0, sizeof(f2));
|
|
|
f2.filter_func = starpu_block_filter_func;
|
|
|
f2.nchildren = nslicesy;
|
|
|
- f2.get_nchildren = NULL;
|
|
|
- f2.get_child_ops = NULL;
|
|
|
|
|
|
starpu_data_partition(B_handle, &f);
|
|
|
starpu_data_partition(A_handle, &f2);
|
|
@@ -170,126 +162,110 @@ static void partition_mult_data(void)
|
|
|
starpu_data_map_filters(C_handle, 2, &f, &f2);
|
|
|
}
|
|
|
|
|
|
-static void unpartition_mult_data(void)
|
|
|
+static void mult_kernel_common(void *descr[], int type)
|
|
|
{
|
|
|
- fprintf(stderr, "unpartition !!\n");
|
|
|
+ TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
|
|
|
+ TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
|
|
|
+ TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
|
|
|
|
|
|
- starpu_data_unpartition(C_handle, 0);
|
|
|
+ unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]);
|
|
|
+ unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]);
|
|
|
+ unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]);
|
|
|
|
|
|
- starpu_data_unregister(C_handle);
|
|
|
-}
|
|
|
+ unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]);
|
|
|
+ unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
|
|
|
+ unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
|
|
|
|
|
|
-static starpu_codelet cl = {
|
|
|
- .where = STARPU_CPU|STARPU_CUDA
|
|
|
-#ifdef SPU_FUNC_SGEMM
|
|
|
- |STARPU_GORDON
|
|
|
-#endif
|
|
|
- ,
|
|
|
- .cpu_func = STARPU_GEMM(cpu_mult),
|
|
|
+ if (type == STARPU_CPU) {
|
|
|
+ CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
|
|
|
+ }
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- .cuda_func = STARPU_GEMM(cublas_mult),
|
|
|
-#endif
|
|
|
-#ifdef STARPU_USE_GORDON
|
|
|
-#ifdef SPU_FUNC_SGEMM
|
|
|
- .gordon_func = SPU_FUNC_SGEMM,
|
|
|
-#else
|
|
|
-#warning SPU_FUNC_SGEMM is not available
|
|
|
-#endif
|
|
|
+ else {
|
|
|
+ CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
|
|
|
+ (TYPE)0.0, subC, ldC);
|
|
|
+ cudaThreadSynchronize();
|
|
|
+ }
|
|
|
#endif
|
|
|
- .nbuffers = 3
|
|
|
-};
|
|
|
-
|
|
|
-static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
|
|
|
-{
|
|
|
- /* A B[task] = C[task] */
|
|
|
- struct starpu_task *task = starpu_task_create();
|
|
|
-
|
|
|
- task->cl = &cl;
|
|
|
-
|
|
|
- /* we have a callback to do some accounting */
|
|
|
- task->callback_func = callback_func;
|
|
|
- task->callback_arg = NULL;
|
|
|
-
|
|
|
- task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, y);
|
|
|
- task->buffers[0].mode = STARPU_R;
|
|
|
- task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, x);
|
|
|
- task->buffers[1].mode = STARPU_R;
|
|
|
- task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, x, y);
|
|
|
- task->buffers[2].mode = STARPU_RW;
|
|
|
-
|
|
|
- task->cl_arg = &conf;
|
|
|
- task->cl_arg_size = sizeof(struct block_conf);
|
|
|
- return task;
|
|
|
}
|
|
|
|
|
|
-static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
|
|
|
+#ifdef STARPU_USE_CUDA
|
|
|
+static void cublas_mult(void *descr[], __attribute__((unused)) void *arg)
|
|
|
{
|
|
|
- unsigned z;
|
|
|
-
|
|
|
- z = 0;
|
|
|
-
|
|
|
- {
|
|
|
- struct starpu_task *task;
|
|
|
- task = construct_task(x, y, z, iter);
|
|
|
-
|
|
|
- starpu_task_submit(task);
|
|
|
- }
|
|
|
+ mult_kernel_common(descr, STARPU_CUDA);
|
|
|
}
|
|
|
-
|
|
|
-static void launch_codelets(void)
|
|
|
-{
|
|
|
-#ifdef STARPU_USE_FXT
|
|
|
- _starpu_fxt_register_thread(0);
|
|
|
#endif
|
|
|
- /* partition the work into slices */
|
|
|
- unsigned taskx, tasky;
|
|
|
|
|
|
- srand(time(NULL));
|
|
|
+static void cpu_mult(void *descr[], __attribute__((unused)) void *arg)
|
|
|
+{
|
|
|
+ mult_kernel_common(descr, STARPU_CPU);
|
|
|
+}
|
|
|
|
|
|
- /* should we use a single performance model for all archs and use an
|
|
|
- * acceleration factor ? */
|
|
|
- if (use_common_model) {
|
|
|
- cl.model = &STARPU_GEMM(model_common);
|
|
|
- }
|
|
|
- else {
|
|
|
- cl.model = &STARPU_GEMM(model);
|
|
|
- }
|
|
|
+static struct starpu_perfmodel_t starpu_gemm_model = {
|
|
|
+ .type = STARPU_HISTORY_BASED,
|
|
|
+ .symbol = STARPU_GEMM_STR(gemm)
|
|
|
+};
|
|
|
|
|
|
- for (taskx = 0; taskx < nslicesx; taskx++)
|
|
|
- {
|
|
|
- for (tasky = 0; tasky < nslicesy; tasky++)
|
|
|
- {
|
|
|
- submit_new_iter(taskx, tasky, 0);
|
|
|
- }
|
|
|
- }
|
|
|
-}
|
|
|
+static starpu_codelet cl = {
|
|
|
+ .where = STARPU_CPU|STARPU_CUDA,
|
|
|
+ .cpu_func = cpu_mult,
|
|
|
+#ifdef STARPU_USE_CUDA
|
|
|
+ .cuda_func = cublas_mult,
|
|
|
+#endif
|
|
|
+ .nbuffers = 3,
|
|
|
+ .model = &starpu_gemm_model
|
|
|
+};
|
|
|
|
|
|
-int main(__attribute__ ((unused)) int argc,
|
|
|
- __attribute__ ((unused)) char **argv)
|
|
|
+int main(int argc, char **argv)
|
|
|
{
|
|
|
+ struct timeval start;
|
|
|
+ struct timeval end;
|
|
|
|
|
|
parse_args(argc, argv);
|
|
|
|
|
|
- /* start the runtime */
|
|
|
starpu_init(NULL);
|
|
|
-
|
|
|
starpu_helper_cublas_init();
|
|
|
|
|
|
init_problem_data();
|
|
|
+ partition_mult_data();
|
|
|
|
|
|
gettimeofday(&start, NULL);
|
|
|
|
|
|
- partition_mult_data();
|
|
|
+ unsigned x, y, iter;
|
|
|
+ for (iter = 0; iter < niter; iter++)
|
|
|
+ {
|
|
|
+ for (x = 0; x < nslicesx; x++)
|
|
|
+ for (y = 0; y < nslicesy; y++)
|
|
|
+ {
|
|
|
+ struct starpu_task *task = starpu_task_create();
|
|
|
+
|
|
|
+ task->cl = &cl;
|
|
|
+
|
|
|
+ task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, y);
|
|
|
+ task->buffers[0].mode = STARPU_R;
|
|
|
+ task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, x);
|
|
|
+ task->buffers[1].mode = STARPU_R;
|
|
|
+ task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, x, y);
|
|
|
+ task->buffers[2].mode = STARPU_RW;
|
|
|
+
|
|
|
+ int ret = starpu_task_submit(task);
|
|
|
+ STARPU_ASSERT(!ret);
|
|
|
+ }
|
|
|
|
|
|
- launch_codelets();
|
|
|
+ starpu_task_wait_for_all();
|
|
|
+ }
|
|
|
|
|
|
- starpu_task_wait_for_all();
|
|
|
|
|
|
gettimeofday(&end, NULL);
|
|
|
double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
- display_stats(timing);
|
|
|
|
|
|
- unpartition_mult_data();
|
|
|
+ fprintf(stderr, "Time: %2.2f ms\n", timing/1000.0);
|
|
|
+
|
|
|
+ double flops = 2.0*((unsigned long)niter)*((unsigned long)xdim)
|
|
|
+ *((unsigned long)ydim)*((unsigned long)zdim);
|
|
|
+ fprintf(stderr, "GFlop/s: %.2f\n", flops/timing/1000.0);
|
|
|
+
|
|
|
+ starpu_data_unpartition(C_handle, 0);
|
|
|
+ starpu_data_unregister(C_handle);
|
|
|
|
|
|
if (check)
|
|
|
check_output();
|