|
@@ -1,6 +1,6 @@
|
|
|
/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
|
*
|
|
|
- * Copyright (C) 2010 Université de Bordeaux 1
|
|
|
+ * Copyright (C) 2010, 2012 Université de Bordeaux 1
|
|
|
*
|
|
|
* StarPU is free software; you can redistribute it and/or modify
|
|
|
* it under the terms of the GNU Lesser General Public License as published by
|
|
@@ -16,6 +16,7 @@
|
|
|
|
|
|
#include "cg.h"
|
|
|
#include <math.h>
|
|
|
+#include <limits.h>
|
|
|
|
|
|
#if 0
|
|
|
static void print_vector_from_descr(unsigned nx, TYPE *v)
|
|
@@ -43,6 +44,23 @@ static void print_matrix_from_descr(unsigned nx, unsigned ny, unsigned ld, TYPE
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
|
|
|
+{
|
|
|
+ enum starpu_archtype type = starpu_worker_get_type(workerid);
|
|
|
+ if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
|
|
|
+ return 1;
|
|
|
+
|
|
|
+#ifdef STARPU_USE_CUDA
|
|
|
+ /* Cuda device */
|
|
|
+ const struct cudaDeviceProp *props;
|
|
|
+ props = starpu_cuda_get_device_properties(workerid);
|
|
|
+ if (props->major >= 2 || props->minor >= 3)
|
|
|
+ /* At least compute capability 1.3, supports doubles */
|
|
|
+ return 1;
|
|
|
+#endif
|
|
|
+ /* Old card, does not support doubles */
|
|
|
+ return 0;
|
|
|
+}
|
|
|
|
|
|
/*
|
|
|
* Reduction accumulation methods
|
|
@@ -67,16 +85,19 @@ static void accumulate_variable_cpu(void *descr[], void *cl_arg)
|
|
|
*v_dst = *v_dst + *v_src;
|
|
|
}
|
|
|
|
|
|
-static struct starpu_perfmodel_t accumulate_variable_model = {
|
|
|
+static struct starpu_perfmodel accumulate_variable_model =
|
|
|
+{
|
|
|
.type = STARPU_HISTORY_BASED,
|
|
|
.symbol = "accumulate_variable"
|
|
|
};
|
|
|
|
|
|
-starpu_codelet accumulate_variable_cl = {
|
|
|
+struct starpu_codelet accumulate_variable_cl =
|
|
|
+{
|
|
|
+ .can_execute = can_execute,
|
|
|
.where = STARPU_CPU|STARPU_CUDA,
|
|
|
- .cpu_func = accumulate_variable_cpu,
|
|
|
+ .cpu_funcs = {accumulate_variable_cpu, NULL},
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- .cuda_func = accumulate_variable_cuda,
|
|
|
+ .cuda_funcs = {accumulate_variable_cuda, NULL},
|
|
|
#endif
|
|
|
.nbuffers = 2,
|
|
|
.model = &accumulate_variable_model
|
|
@@ -103,16 +124,19 @@ static void accumulate_vector_cpu(void *descr[], void *cl_arg)
|
|
|
AXPY(n, (TYPE)1.0, v_src, 1, v_dst, 1);
|
|
|
}
|
|
|
|
|
|
-static struct starpu_perfmodel_t accumulate_vector_model = {
|
|
|
+static struct starpu_perfmodel accumulate_vector_model =
|
|
|
+{
|
|
|
.type = STARPU_HISTORY_BASED,
|
|
|
.symbol = "accumulate_vector"
|
|
|
};
|
|
|
|
|
|
-starpu_codelet accumulate_vector_cl = {
|
|
|
+struct starpu_codelet accumulate_vector_cl =
|
|
|
+{
|
|
|
+ .can_execute = can_execute,
|
|
|
.where = STARPU_CPU|STARPU_CUDA,
|
|
|
- .cpu_func = accumulate_vector_cpu,
|
|
|
+ .cpu_funcs = {accumulate_vector_cpu, NULL},
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- .cuda_func = accumulate_vector_cuda,
|
|
|
+ .cuda_funcs = {accumulate_vector_cuda, NULL},
|
|
|
#endif
|
|
|
.nbuffers = 2,
|
|
|
.model = &accumulate_vector_model
|
|
@@ -141,16 +165,19 @@ static void bzero_variable_cpu(void *descr[], void *cl_arg)
|
|
|
*v = (TYPE)0.0;
|
|
|
}
|
|
|
|
|
|
-static struct starpu_perfmodel_t bzero_variable_model = {
|
|
|
+static struct starpu_perfmodel bzero_variable_model =
|
|
|
+{
|
|
|
.type = STARPU_HISTORY_BASED,
|
|
|
.symbol = "bzero_variable"
|
|
|
};
|
|
|
|
|
|
-starpu_codelet bzero_variable_cl = {
|
|
|
+struct starpu_codelet bzero_variable_cl =
|
|
|
+{
|
|
|
+ .can_execute = can_execute,
|
|
|
.where = STARPU_CPU|STARPU_CUDA,
|
|
|
- .cpu_func = bzero_variable_cpu,
|
|
|
+ .cpu_funcs = {bzero_variable_cpu, NULL},
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- .cuda_func = bzero_variable_cuda,
|
|
|
+ .cuda_funcs = {bzero_variable_cuda, NULL},
|
|
|
#endif
|
|
|
.nbuffers = 1,
|
|
|
.model = &bzero_variable_model
|
|
@@ -176,16 +203,19 @@ static void bzero_vector_cpu(void *descr[], void *cl_arg)
|
|
|
memset(v, 0, n*sizeof(TYPE));
|
|
|
}
|
|
|
|
|
|
-static struct starpu_perfmodel_t bzero_vector_model = {
|
|
|
+static struct starpu_perfmodel bzero_vector_model =
|
|
|
+{
|
|
|
.type = STARPU_HISTORY_BASED,
|
|
|
.symbol = "bzero_vector"
|
|
|
};
|
|
|
|
|
|
-starpu_codelet bzero_vector_cl = {
|
|
|
+struct starpu_codelet bzero_vector_cl =
|
|
|
+{
|
|
|
+ .can_execute = can_execute,
|
|
|
.where = STARPU_CPU|STARPU_CUDA,
|
|
|
- .cpu_func = bzero_vector_cpu,
|
|
|
+ .cpu_funcs = {bzero_vector_cpu, NULL},
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- .cuda_func = bzero_vector_cuda,
|
|
|
+ .cuda_funcs = {bzero_vector_cuda, NULL},
|
|
|
#endif
|
|
|
.nbuffers = 1,
|
|
|
.model = &bzero_vector_model
|
|
@@ -229,39 +259,48 @@ static void dot_kernel_cpu(void *descr[], void *cl_arg)
|
|
|
*dot = *dot + local_dot;
|
|
|
}
|
|
|
|
|
|
-static struct starpu_perfmodel_t dot_kernel_model = {
|
|
|
+static struct starpu_perfmodel dot_kernel_model =
|
|
|
+{
|
|
|
.type = STARPU_HISTORY_BASED,
|
|
|
.symbol = "dot_kernel"
|
|
|
};
|
|
|
|
|
|
-static starpu_codelet dot_kernel_cl = {
|
|
|
+static struct starpu_codelet dot_kernel_cl =
|
|
|
+{
|
|
|
+ .can_execute = can_execute,
|
|
|
.where = STARPU_CPU|STARPU_CUDA,
|
|
|
- .cpu_func = dot_kernel_cpu,
|
|
|
+ .cpu_funcs = {dot_kernel_cpu, NULL},
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- .cuda_func = dot_kernel_cuda,
|
|
|
+ .cuda_funcs = {dot_kernel_cuda, NULL},
|
|
|
#endif
|
|
|
.nbuffers = 3,
|
|
|
.model = &dot_kernel_model
|
|
|
};
|
|
|
|
|
|
-void dot_kernel(starpu_data_handle v1,
|
|
|
- starpu_data_handle v2,
|
|
|
- starpu_data_handle s,
|
|
|
- unsigned nblocks,
|
|
|
- int use_reduction)
|
|
|
+int dot_kernel(starpu_data_handle_t v1,
|
|
|
+ starpu_data_handle_t v2,
|
|
|
+ starpu_data_handle_t s,
|
|
|
+ unsigned nblocks,
|
|
|
+ int use_reduction)
|
|
|
{
|
|
|
+ int ret;
|
|
|
+
|
|
|
/* Blank the accumulation variable */
|
|
|
- starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
|
|
|
+ ret = starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
|
|
|
+ if (ret == -ENODEV) return ret;
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
|
|
|
|
|
|
unsigned b;
|
|
|
for (b = 0; b < nblocks; b++)
|
|
|
{
|
|
|
- starpu_insert_task(&dot_kernel_cl,
|
|
|
- use_reduction?STARPU_REDUX:STARPU_RW, s,
|
|
|
- STARPU_R, starpu_data_get_sub_data(v1, 1, b),
|
|
|
- STARPU_R, starpu_data_get_sub_data(v2, 1, b),
|
|
|
- 0);
|
|
|
+ ret = starpu_insert_task(&dot_kernel_cl,
|
|
|
+ use_reduction?STARPU_REDUX:STARPU_RW, s,
|
|
|
+ STARPU_R, starpu_data_get_sub_data(v1, 1, b),
|
|
|
+ STARPU_R, starpu_data_get_sub_data(v2, 1, b),
|
|
|
+ 0);
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
|
|
|
}
|
|
|
+ return 0;
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -272,7 +311,7 @@ void dot_kernel(starpu_data_handle v1,
|
|
|
static void scal_kernel_cuda(void *descr[], void *cl_arg)
|
|
|
{
|
|
|
TYPE p1;
|
|
|
- starpu_unpack_cl_args(cl_arg, &p1);
|
|
|
+ starpu_codelet_unpack_args(cl_arg, &p1);
|
|
|
|
|
|
TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
|
|
|
unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
|
|
@@ -287,7 +326,7 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
|
|
|
static void scal_kernel_cpu(void *descr[], void *cl_arg)
|
|
|
{
|
|
|
TYPE alpha;
|
|
|
- starpu_unpack_cl_args(cl_arg, &alpha);
|
|
|
+ starpu_codelet_unpack_args(cl_arg, &alpha);
|
|
|
|
|
|
TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
|
|
|
unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
|
|
@@ -296,16 +335,19 @@ static void scal_kernel_cpu(void *descr[], void *cl_arg)
|
|
|
SCAL(n, alpha, v1, 1);
|
|
|
}
|
|
|
|
|
|
-static struct starpu_perfmodel_t scal_kernel_model = {
|
|
|
+static struct starpu_perfmodel scal_kernel_model =
|
|
|
+{
|
|
|
.type = STARPU_HISTORY_BASED,
|
|
|
.symbol = "scal_kernel"
|
|
|
};
|
|
|
|
|
|
-static starpu_codelet scal_kernel_cl = {
|
|
|
+static struct starpu_codelet scal_kernel_cl =
|
|
|
+{
|
|
|
+ .can_execute = can_execute,
|
|
|
.where = STARPU_CPU|STARPU_CUDA,
|
|
|
- .cpu_func = scal_kernel_cpu,
|
|
|
+ .cpu_funcs = {scal_kernel_cpu, NULL},
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- .cuda_func = scal_kernel_cuda,
|
|
|
+ .cuda_funcs = {scal_kernel_cuda, NULL},
|
|
|
#endif
|
|
|
.nbuffers = 1,
|
|
|
.model = &scal_kernel_model
|
|
@@ -327,7 +369,7 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
|
|
|
unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
|
|
|
|
|
|
TYPE alpha, beta;
|
|
|
- starpu_unpack_cl_args(cl_arg, &beta, &alpha);
|
|
|
+ starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
|
|
|
|
|
|
/* Compute v1 = alpha M v2 + beta v1 */
|
|
|
cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
|
|
@@ -346,7 +388,7 @@ static void gemv_kernel_cpu(void *descr[], void *cl_arg)
|
|
|
unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
|
|
|
|
|
|
TYPE alpha, beta;
|
|
|
- starpu_unpack_cl_args(cl_arg, &beta, &alpha);
|
|
|
+ starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
|
|
|
|
|
|
int worker_size = starpu_combined_worker_get_size();
|
|
|
|
|
@@ -367,38 +409,44 @@ static void gemv_kernel_cpu(void *descr[], void *cl_arg)
|
|
|
GEMV("N", nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
|
|
|
}
|
|
|
|
|
|
-static struct starpu_perfmodel_t gemv_kernel_model = {
|
|
|
+static struct starpu_perfmodel gemv_kernel_model =
|
|
|
+{
|
|
|
.type = STARPU_HISTORY_BASED,
|
|
|
.symbol = "gemv_kernel"
|
|
|
};
|
|
|
|
|
|
-static starpu_codelet gemv_kernel_cl = {
|
|
|
+static struct starpu_codelet gemv_kernel_cl =
|
|
|
+{
|
|
|
+ .can_execute = can_execute,
|
|
|
.where = STARPU_CPU|STARPU_CUDA,
|
|
|
.type = STARPU_SPMD,
|
|
|
.max_parallelism = INT_MAX,
|
|
|
- .cpu_func = gemv_kernel_cpu,
|
|
|
+ .cpu_funcs = {gemv_kernel_cpu, NULL},
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- .cuda_func = gemv_kernel_cuda,
|
|
|
+ .cuda_funcs = {gemv_kernel_cuda, NULL},
|
|
|
#endif
|
|
|
.nbuffers = 3,
|
|
|
.model = &gemv_kernel_model
|
|
|
};
|
|
|
|
|
|
-void gemv_kernel(starpu_data_handle v1,
|
|
|
- starpu_data_handle matrix,
|
|
|
- starpu_data_handle v2,
|
|
|
+int gemv_kernel(starpu_data_handle_t v1,
|
|
|
+ starpu_data_handle_t matrix,
|
|
|
+ starpu_data_handle_t v2,
|
|
|
TYPE p1, TYPE p2,
|
|
|
unsigned nblocks,
|
|
|
int use_reduction)
|
|
|
{
|
|
|
unsigned b1, b2;
|
|
|
+ int ret;
|
|
|
|
|
|
for (b2 = 0; b2 < nblocks; b2++)
|
|
|
{
|
|
|
- starpu_insert_task(&scal_kernel_cl,
|
|
|
- STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
|
|
|
- STARPU_VALUE, &p1, sizeof(p1),
|
|
|
- 0);
|
|
|
+ ret = starpu_insert_task(&scal_kernel_cl,
|
|
|
+ STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
|
|
|
+ STARPU_VALUE, &p1, sizeof(p1),
|
|
|
+ 0);
|
|
|
+ if (ret == -ENODEV) return ret;
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
|
|
|
}
|
|
|
|
|
|
for (b2 = 0; b2 < nblocks; b2++)
|
|
@@ -406,15 +454,17 @@ void gemv_kernel(starpu_data_handle v1,
|
|
|
for (b1 = 0; b1 < nblocks; b1++)
|
|
|
{
|
|
|
TYPE one = 1.0;
|
|
|
- starpu_insert_task(&gemv_kernel_cl,
|
|
|
- use_reduction?STARPU_REDUX:STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
|
|
|
- STARPU_R, starpu_data_get_sub_data(matrix, 2, b2, b1),
|
|
|
- STARPU_R, starpu_data_get_sub_data(v2, 1, b1),
|
|
|
- STARPU_VALUE, &one, sizeof(one),
|
|
|
- STARPU_VALUE, &p2, sizeof(p2),
|
|
|
- 0);
|
|
|
+ ret = starpu_insert_task(&gemv_kernel_cl,
|
|
|
+ use_reduction?STARPU_REDUX:STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
|
|
|
+ STARPU_R, starpu_data_get_sub_data(matrix, 2, b2, b1),
|
|
|
+ STARPU_R, starpu_data_get_sub_data(v2, 1, b1),
|
|
|
+ STARPU_VALUE, &one, sizeof(one),
|
|
|
+ STARPU_VALUE, &p2, sizeof(p2),
|
|
|
+ 0);
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
|
|
|
}
|
|
|
}
|
|
|
+ return 0;
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -424,7 +474,7 @@ void gemv_kernel(starpu_data_handle v1,
|
|
|
static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
|
|
|
{
|
|
|
TYPE p1, p2;
|
|
|
- starpu_unpack_cl_args(cl_arg, &p1, &p2);
|
|
|
+ starpu_codelet_unpack_args(cl_arg, &p1, &p2);
|
|
|
|
|
|
TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
|
|
|
TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
|
|
@@ -444,7 +494,7 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
|
|
|
static void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
|
|
|
{
|
|
|
TYPE p1, p2;
|
|
|
- starpu_unpack_cl_args(cl_arg, &p1, &p2);
|
|
|
+ starpu_codelet_unpack_args(cl_arg, &p1, &p2);
|
|
|
|
|
|
TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
|
|
|
TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
|
|
@@ -459,35 +509,42 @@ static void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
|
|
|
AXPY(nx, p2, v2, 1, v1, 1);
|
|
|
}
|
|
|
|
|
|
-static struct starpu_perfmodel_t scal_axpy_kernel_model = {
|
|
|
+static struct starpu_perfmodel scal_axpy_kernel_model =
|
|
|
+{
|
|
|
.type = STARPU_HISTORY_BASED,
|
|
|
.symbol = "scal_axpy_kernel"
|
|
|
};
|
|
|
|
|
|
-static starpu_codelet scal_axpy_kernel_cl = {
|
|
|
+static struct starpu_codelet scal_axpy_kernel_cl =
|
|
|
+{
|
|
|
+ .can_execute = can_execute,
|
|
|
.where = STARPU_CPU|STARPU_CUDA,
|
|
|
- .cpu_func = scal_axpy_kernel_cpu,
|
|
|
+ .cpu_funcs = {scal_axpy_kernel_cpu, NULL},
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- .cuda_func = scal_axpy_kernel_cuda,
|
|
|
+ .cuda_funcs = {scal_axpy_kernel_cuda, NULL},
|
|
|
#endif
|
|
|
.nbuffers = 2,
|
|
|
.model = &scal_axpy_kernel_model
|
|
|
};
|
|
|
|
|
|
-void scal_axpy_kernel(starpu_data_handle v1, TYPE p1,
|
|
|
- starpu_data_handle v2, TYPE p2,
|
|
|
- unsigned nblocks)
|
|
|
+int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
|
|
|
+ starpu_data_handle_t v2, TYPE p2,
|
|
|
+ unsigned nblocks)
|
|
|
{
|
|
|
+ int ret;
|
|
|
unsigned b;
|
|
|
for (b = 0; b < nblocks; b++)
|
|
|
{
|
|
|
- starpu_insert_task(&scal_axpy_kernel_cl,
|
|
|
- STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
|
|
|
- STARPU_R, starpu_data_get_sub_data(v2, 1, b),
|
|
|
- STARPU_VALUE, &p1, sizeof(p1),
|
|
|
- STARPU_VALUE, &p2, sizeof(p2),
|
|
|
- 0);
|
|
|
+ ret = starpu_insert_task(&scal_axpy_kernel_cl,
|
|
|
+ STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
|
|
|
+ STARPU_R, starpu_data_get_sub_data(v2, 1, b),
|
|
|
+ STARPU_VALUE, &p1, sizeof(p1),
|
|
|
+ STARPU_VALUE, &p2, sizeof(p2),
|
|
|
+ 0);
|
|
|
+ if (ret == -ENODEV) return ret;
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
|
|
|
}
|
|
|
+ return 0;
|
|
|
}
|
|
|
|
|
|
|
|
@@ -498,7 +555,7 @@ void scal_axpy_kernel(starpu_data_handle v1, TYPE p1,
|
|
|
static void axpy_kernel_cuda(void *descr[], void *cl_arg)
|
|
|
{
|
|
|
TYPE p1;
|
|
|
- starpu_unpack_cl_args(cl_arg, &p1);
|
|
|
+ starpu_codelet_unpack_args(cl_arg, &p1);
|
|
|
|
|
|
TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
|
|
|
TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
|
|
@@ -515,7 +572,7 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
|
|
|
static void axpy_kernel_cpu(void *descr[], void *cl_arg)
|
|
|
{
|
|
|
TYPE p1;
|
|
|
- starpu_unpack_cl_args(cl_arg, &p1);
|
|
|
+ starpu_codelet_unpack_args(cl_arg, &p1);
|
|
|
|
|
|
TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
|
|
|
TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
|
|
@@ -527,89 +584,47 @@ static void axpy_kernel_cpu(void *descr[], void *cl_arg)
|
|
|
AXPY(nx, p1, v2, 1, v1, 1);
|
|
|
}
|
|
|
|
|
|
-static struct starpu_perfmodel_t axpy_kernel_model = {
|
|
|
+static struct starpu_perfmodel axpy_kernel_model =
|
|
|
+{
|
|
|
.type = STARPU_HISTORY_BASED,
|
|
|
.symbol = "axpy_kernel"
|
|
|
};
|
|
|
|
|
|
-static starpu_codelet axpy_kernel_cl = {
|
|
|
+static struct starpu_codelet axpy_kernel_cl =
|
|
|
+{
|
|
|
+ .can_execute = can_execute,
|
|
|
.where = STARPU_CPU|STARPU_CUDA,
|
|
|
- .cpu_func = axpy_kernel_cpu,
|
|
|
+ .cpu_funcs = {axpy_kernel_cpu, NULL},
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- .cuda_func = axpy_kernel_cuda,
|
|
|
+ .cuda_funcs = {axpy_kernel_cuda, NULL},
|
|
|
#endif
|
|
|
.nbuffers = 2,
|
|
|
.model = &axpy_kernel_model
|
|
|
};
|
|
|
|
|
|
-void axpy_kernel(starpu_data_handle v1,
|
|
|
- starpu_data_handle v2, TYPE p1,
|
|
|
+int axpy_kernel(starpu_data_handle_t v1,
|
|
|
+ starpu_data_handle_t v2, TYPE p1,
|
|
|
unsigned nblocks)
|
|
|
{
|
|
|
+ int ret;
|
|
|
unsigned b;
|
|
|
for (b = 0; b < nblocks; b++)
|
|
|
{
|
|
|
- starpu_insert_task(&axpy_kernel_cl,
|
|
|
- STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
|
|
|
- STARPU_R, starpu_data_get_sub_data(v2, 1, b),
|
|
|
- STARPU_VALUE, &p1, sizeof(p1),
|
|
|
- 0);
|
|
|
+ ret = starpu_insert_task(&axpy_kernel_cl,
|
|
|
+ STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
|
|
|
+ STARPU_R, starpu_data_get_sub_data(v2, 1, b),
|
|
|
+ STARPU_VALUE, &p1, sizeof(p1),
|
|
|
+ 0);
|
|
|
+ if (ret == -ENODEV) return ret;
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
|
|
|
}
|
|
|
+ return 0;
|
|
|
}
|
|
|
|
|
|
-
|
|
|
-/*
|
|
|
- * COPY kernel : vector_dst <- vector_src
|
|
|
- */
|
|
|
-
|
|
|
-static void copy_handle_cpu(void *descr[], void *cl_arg)
|
|
|
-{
|
|
|
- TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
|
|
|
- TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
|
|
|
-
|
|
|
- unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
|
|
|
- size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
|
|
|
-
|
|
|
- memcpy(dst, src, nx*elemsize);
|
|
|
-}
|
|
|
-
|
|
|
-#ifdef STARPU_USE_CUDA
|
|
|
-static void copy_handle_cuda(void *descr[], void *cl_arg)
|
|
|
-{
|
|
|
- TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
|
|
|
- TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
|
|
|
-
|
|
|
- unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
|
|
|
- size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
|
|
|
-
|
|
|
- cudaMemcpyAsync(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
|
|
|
- cudaStreamSynchronize(starpu_cuda_get_local_stream());
|
|
|
-}
|
|
|
-#endif
|
|
|
-
|
|
|
-static struct starpu_perfmodel_t copy_handle_model = {
|
|
|
- .type = STARPU_HISTORY_BASED,
|
|
|
- .symbol = "copy_handle"
|
|
|
-};
|
|
|
-
|
|
|
-static starpu_codelet copy_handle_cl = {
|
|
|
- .where = STARPU_CPU|STARPU_CUDA,
|
|
|
- .cpu_func = copy_handle_cpu,
|
|
|
-#ifdef STARPU_USE_CUDA
|
|
|
- .cuda_func = copy_handle_cuda,
|
|
|
-#endif
|
|
|
- .nbuffers = 2,
|
|
|
- .model = ©_handle_model
|
|
|
-};
|
|
|
-
|
|
|
-void copy_handle(starpu_data_handle dst, starpu_data_handle src, unsigned nblocks)
|
|
|
+int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
|
|
|
{
|
|
|
unsigned b;
|
|
|
for (b = 0; b < nblocks; b++)
|
|
|
- {
|
|
|
- starpu_insert_task(©_handle_cl,
|
|
|
- STARPU_W, starpu_data_get_sub_data(dst, 1, b),
|
|
|
- STARPU_R, starpu_data_get_sub_data(src, 1, b),
|
|
|
- 0);
|
|
|
- }
|
|
|
-}
|
|
|
+ starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
|
|
|
+ return 0;
|
|
|
+}
|