|
@@ -47,6 +47,7 @@ static unsigned niter = 10;
|
|
|
static unsigned nsleeps = 1;
|
|
|
static unsigned nslicesx = 4;
|
|
|
static unsigned nslicesy = 4;
|
|
|
+static unsigned nslicesz = 4;
|
|
|
#if defined(STARPU_QUICK_CHECK) && !defined(STARPU_SIMGRID)
|
|
|
static unsigned xdim = 256;
|
|
|
static unsigned ydim = 256;
|
|
@@ -59,6 +60,7 @@ static unsigned zdim = 960*4;
|
|
|
static unsigned check = 0;
|
|
|
static unsigned bound = 0;
|
|
|
static unsigned print_hostname = 0;
|
|
|
+static unsigned tiled = 0;
|
|
|
|
|
|
static TYPE *A, *B, *C;
|
|
|
static starpu_data_handle_t A_handle, B_handle, C_handle;
|
|
@@ -131,6 +133,8 @@ static void init_problem_data(void)
|
|
|
|
|
|
static void partition_mult_data(void)
|
|
|
{
|
|
|
+ unsigned x, y, z;
|
|
|
+
|
|
|
starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A,
|
|
|
ydim, ydim, zdim, sizeof(TYPE));
|
|
|
starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B,
|
|
@@ -148,19 +152,45 @@ static void partition_mult_data(void)
|
|
|
horiz.filter_func = starpu_matrix_filter_block;
|
|
|
horiz.nchildren = nslicesy;
|
|
|
|
|
|
- starpu_data_partition(B_handle, &vert);
|
|
|
- starpu_data_partition(A_handle, &horiz);
|
|
|
+ if (tiled)
|
|
|
+ {
|
|
|
+ struct starpu_data_filter vertA;
|
|
|
+ memset(&vertA, 0, sizeof(vertA));
|
|
|
+ vertA.filter_func = starpu_matrix_filter_vertical_block;
|
|
|
+ vertA.nchildren = nslicesz;
|
|
|
+
|
|
|
+ struct starpu_data_filter horizB;
|
|
|
+ memset(&horizB, 0, sizeof(horizB));
|
|
|
+ horizB.filter_func = starpu_matrix_filter_block;
|
|
|
+ horizB.nchildren = nslicesz;
|
|
|
+
|
|
|
+ starpu_data_map_filters(A_handle, 2, &vertA, &horiz);
|
|
|
+ starpu_data_map_filters(B_handle, 2, &vert, &horizB);
|
|
|
+ starpu_data_map_filters(C_handle, 2, &vert, &horiz);
|
|
|
+
|
|
|
+ for (y = 0; y < nslicesy; y++)
|
|
|
+ for (z = 0; z < nslicesz; z++)
|
|
|
+ starpu_data_set_coordinates(starpu_data_get_sub_data(A_handle, 2, z, y), 2, z, y);
|
|
|
+
|
|
|
+ for (x = 0; x < nslicesx; x++)
|
|
|
+ for (z = 0; z < nslicesz; z++)
|
|
|
+ starpu_data_set_coordinates(starpu_data_get_sub_data(B_handle, 2, x, z), 2, x, z);
|
|
|
+ }
|
|
|
+ else
|
|
|
+ {
|
|
|
+ starpu_data_partition(B_handle, &vert);
|
|
|
+ starpu_data_partition(A_handle, &horiz);
|
|
|
|
|
|
- starpu_data_map_filters(C_handle, 2, &vert, &horiz);
|
|
|
+ starpu_data_map_filters(C_handle, 2, &vert, &horiz);
|
|
|
+ }
|
|
|
|
|
|
- unsigned x, y;
|
|
|
for (x = 0; x < nslicesx; x++)
|
|
|
for (y = 0; y < nslicesy; y++)
|
|
|
starpu_data_set_coordinates(starpu_data_get_sub_data(C_handle, 2, x, y), 2, x, y);
|
|
|
}
|
|
|
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
-static void cublas_mult(void *descr[], void *arg)
|
|
|
+static void cublas_mult(void *descr[], void *arg, const TYPE *beta)
|
|
|
{
|
|
|
(void)arg;
|
|
|
TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
|
|
@@ -179,13 +209,23 @@ static void cublas_mult(void *descr[], void *arg)
|
|
|
CUBLAS_OP_N, CUBLAS_OP_N,
|
|
|
nxC, nyC, nyA,
|
|
|
&p1, subA, ldA, subB, ldB,
|
|
|
- &v0, subC, ldC);
|
|
|
+ beta, subC, ldC);
|
|
|
if (status != CUBLAS_STATUS_SUCCESS)
|
|
|
STARPU_CUBLAS_REPORT_ERROR(status);
|
|
|
}
|
|
|
+
|
|
|
+static void cublas_gemm0(void *descr[], void *arg)
|
|
|
+{
|
|
|
+ cublas_mult(descr, arg, &v0);
|
|
|
+}
|
|
|
+
|
|
|
+static void cublas_gemm(void *descr[], void *arg)
|
|
|
+{
|
|
|
+ cublas_mult(descr, arg, &p1);
|
|
|
+}
|
|
|
#endif
|
|
|
|
|
|
-void cpu_mult(void *descr[], void *arg)
|
|
|
+void cpu_mult(void *descr[], void *arg, TYPE beta)
|
|
|
{
|
|
|
(void)arg;
|
|
|
TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
|
|
@@ -205,7 +245,7 @@ void cpu_mult(void *descr[], void *arg)
|
|
|
if (worker_size == 1)
|
|
|
{
|
|
|
/* Sequential CPU task */
|
|
|
- CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
|
|
|
+ CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, beta, subC, ldC);
|
|
|
}
|
|
|
else
|
|
|
{
|
|
@@ -220,24 +260,34 @@ void cpu_mult(void *descr[], void *arg)
|
|
|
TYPE *new_subB = &subB[block_size*rank];
|
|
|
TYPE *new_subC = &subC[block_size*rank];
|
|
|
|
|
|
- CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, (TYPE)0.0, new_subC, ldC);
|
|
|
+ CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, beta, new_subC, ldC);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+void cpu_gemm0(void *descr[], void *arg)
|
|
|
+{
|
|
|
+ cpu_mult(descr, arg, 0.);
|
|
|
+}
|
|
|
+
|
|
|
+void cpu_gemm(void *descr[], void *arg)
|
|
|
+{
|
|
|
+ cpu_mult(descr, arg, 1.);
|
|
|
+}
|
|
|
+
|
|
|
static struct starpu_perfmodel starpu_gemm_model =
|
|
|
{
|
|
|
.type = STARPU_HISTORY_BASED,
|
|
|
.symbol = STARPU_GEMM_STR(gemm)
|
|
|
};
|
|
|
|
|
|
-static struct starpu_codelet cl =
|
|
|
+static struct starpu_codelet cl_gemm0 =
|
|
|
{
|
|
|
.type = STARPU_SEQ, /* changed to STARPU_SPMD if -spmd is passed */
|
|
|
.max_parallelism = INT_MAX,
|
|
|
- .cpu_funcs = {cpu_mult},
|
|
|
- .cpu_funcs_name = {"cpu_mult"},
|
|
|
+ .cpu_funcs = {cpu_gemm0},
|
|
|
+ .cpu_funcs_name = {"cpu_gemm0"},
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- .cuda_funcs = {cublas_mult},
|
|
|
+ .cuda_funcs = {cublas_gemm0},
|
|
|
#elif defined(STARPU_SIMGRID)
|
|
|
.cuda_funcs = {(void*)1},
|
|
|
#endif
|
|
@@ -247,16 +297,39 @@ static struct starpu_codelet cl =
|
|
|
.model = &starpu_gemm_model
|
|
|
};
|
|
|
|
|
|
+static struct starpu_codelet cl_gemm =
|
|
|
+{
|
|
|
+ .type = STARPU_SEQ, /* changed to STARPU_SPMD if -spmd is passed */
|
|
|
+ .max_parallelism = INT_MAX,
|
|
|
+ .cpu_funcs = {cpu_gemm},
|
|
|
+ .cpu_funcs_name = {"cpu_gemm"},
|
|
|
+#ifdef STARPU_USE_CUDA
|
|
|
+ .cuda_funcs = {cublas_gemm},
|
|
|
+#elif defined(STARPU_SIMGRID)
|
|
|
+ .cuda_funcs = {(void*)1},
|
|
|
+#endif
|
|
|
+ .cuda_flags = {STARPU_CUDA_ASYNC},
|
|
|
+ .nbuffers = 3,
|
|
|
+ .modes = {STARPU_R, STARPU_R, STARPU_RW},
|
|
|
+ .model = &starpu_gemm_model
|
|
|
+};
|
|
|
+
|
|
|
static void parse_args(int argc, char **argv)
|
|
|
{
|
|
|
int i;
|
|
|
for (i = 1; i < argc; i++)
|
|
|
{
|
|
|
- if (strcmp(argv[i], "-nblocks") == 0)
|
|
|
+ if (strcmp(argv[i], "-3d") == 0)
|
|
|
+ {
|
|
|
+ tiled = 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ else if (strcmp(argv[i], "-nblocks") == 0)
|
|
|
{
|
|
|
char *argptr;
|
|
|
nslicesx = strtol(argv[++i], &argptr, 10);
|
|
|
nslicesy = nslicesx;
|
|
|
+ nslicesz = nslicesx;
|
|
|
}
|
|
|
|
|
|
else if (strcmp(argv[i], "-nblocksx") == 0)
|
|
@@ -271,6 +344,12 @@ static void parse_args(int argc, char **argv)
|
|
|
nslicesy = strtol(argv[++i], &argptr, 10);
|
|
|
}
|
|
|
|
|
|
+ else if (strcmp(argv[i], "-nblocksz") == 0)
|
|
|
+ {
|
|
|
+ char *argptr;
|
|
|
+ nslicesz = strtol(argv[++i], &argptr, 10);
|
|
|
+ }
|
|
|
+
|
|
|
else if (strcmp(argv[i], "-x") == 0)
|
|
|
{
|
|
|
char *argptr;
|
|
@@ -330,12 +409,12 @@ static void parse_args(int argc, char **argv)
|
|
|
|
|
|
else if (strcmp(argv[i], "-spmd") == 0)
|
|
|
{
|
|
|
- cl.type = STARPU_SPMD;
|
|
|
+ cl_gemm0.type = STARPU_SPMD;
|
|
|
}
|
|
|
|
|
|
else if (strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
|
|
|
{
|
|
|
- fprintf(stderr,"Usage: %s [-nblocks n] [-nblocksx x] [-nblocksy y] [-x x] [-y y] [-xy n] [-z z] [-size size] [-iter iter] [-bound] [-check] [-spmd] [-hostname] [-nsleeps nsleeps]\n", argv[0]);
|
|
|
+ fprintf(stderr,"Usage: %s [-3d] [-nblocks n] [-nblocksx x] [-nblocksy y] [-nblocksz z] [-x x] [-y y] [-xy n] [-z z] [-size size] [-iter iter] [-bound] [-check] [-spmd] [-hostname] [-nsleeps nsleeps]\n", argv[0]);
|
|
|
fprintf(stderr,"Currently selected: %ux%u * %ux%u and %ux%u blocks, %u iterations, %u sleeps\n", zdim, ydim, xdim, zdim, nslicesx, nslicesy, niter, nsleeps);
|
|
|
exit(EXIT_SUCCESS);
|
|
|
}
|
|
@@ -385,32 +464,69 @@ int main(int argc, char **argv)
|
|
|
|
|
|
starpu_fxt_start_profiling();
|
|
|
start = starpu_timing_now();
|
|
|
+ starpu_sleep(1);
|
|
|
|
|
|
- unsigned x, y, iter;
|
|
|
+ unsigned x, y, z, iter;
|
|
|
for (iter = 0; iter < niter; iter++)
|
|
|
{
|
|
|
- for (x = 0; x < nslicesx; x++)
|
|
|
- for (y = 0; y < nslicesy; y++)
|
|
|
+ if (tiled)
|
|
|
+ {
|
|
|
+ for (x = 0; x < nslicesx; x++)
|
|
|
+ for (y = 0; y < nslicesy; y++)
|
|
|
+ {
|
|
|
+ starpu_data_handle_t Ctile = starpu_data_get_sub_data(C_handle, 2, x, y);
|
|
|
+ for (z = 0; z < nslicesz; z++)
|
|
|
+ {
|
|
|
+ struct starpu_task *task = starpu_task_create();
|
|
|
+
|
|
|
+ if (z == 0)
|
|
|
+ task->cl = &cl_gemm0;
|
|
|
+ else
|
|
|
+ task->cl = &cl_gemm;
|
|
|
+
|
|
|
+ task->handles[0] = starpu_data_get_sub_data(A_handle, 2, z, y);
|
|
|
+ task->handles[1] = starpu_data_get_sub_data(B_handle, 2, x, z);
|
|
|
+ task->handles[2] = Ctile;
|
|
|
+
|
|
|
+ task->flops = 2ULL * (xdim/nslicesx) * (ydim/nslicesy) * (zdim/nslicesz);
|
|
|
+
|
|
|
+ ret = starpu_task_submit(task);
|
|
|
+ if (ret == -ENODEV)
|
|
|
+ {
|
|
|
+ check = 0;
|
|
|
+ ret = 77;
|
|
|
+ goto enodev;
|
|
|
+ }
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
|
|
|
+ }
|
|
|
+ starpu_data_wont_use(Ctile);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ else
|
|
|
{
|
|
|
- struct starpu_task *task = starpu_task_create();
|
|
|
+ for (x = 0; x < nslicesx; x++)
|
|
|
+ for (y = 0; y < nslicesy; y++)
|
|
|
+ {
|
|
|
+ struct starpu_task *task = starpu_task_create();
|
|
|
|
|
|
- task->cl = &cl;
|
|
|
+ task->cl = &cl_gemm0;
|
|
|
|
|
|
- task->handles[0] = starpu_data_get_sub_data(A_handle, 1, y);
|
|
|
- task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
|
|
|
- task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y);
|
|
|
+ task->handles[0] = starpu_data_get_sub_data(A_handle, 1, y);
|
|
|
+ task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
|
|
|
+ task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y);
|
|
|
|
|
|
- task->flops = 2ULL * (xdim/nslicesx) * (ydim/nslicesy) * zdim;
|
|
|
+ task->flops = 2ULL * (xdim/nslicesx) * (ydim/nslicesy) * zdim;
|
|
|
|
|
|
- ret = starpu_task_submit(task);
|
|
|
- if (ret == -ENODEV)
|
|
|
- {
|
|
|
- check = 0;
|
|
|
- ret = 77;
|
|
|
- goto enodev;
|
|
|
+ ret = starpu_task_submit(task);
|
|
|
+ if (ret == -ENODEV)
|
|
|
+ {
|
|
|
+ check = 0;
|
|
|
+ ret = 77;
|
|
|
+ goto enodev;
|
|
|
+ }
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
|
|
|
+ starpu_data_wont_use(starpu_data_get_sub_data(C_handle, 2, x, y));
|
|
|
}
|
|
|
- STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
|
|
|
- starpu_data_wont_use(starpu_data_get_sub_data(C_handle, 2, x, y));
|
|
|
}
|
|
|
|
|
|
starpu_task_wait_for_all();
|