Procházet zdrojové kódy

reduction: Add missing commute flag

Reductions require the operator to commute, so reduction codelets should set
the commute flag; doing so can also make the reduction itself more efficient.

Codelets that omit the commute flag are still accepted, but a warning is emitted.
Samuel Thibault před 4 roky
rodič
revize
18f21ba151

+ 1 - 0
ChangeLog

@@ -54,6 +54,7 @@ New features:
 
 Small changes:
   * Add a synthetic energy efficiency testcase.
+  * Make reduction methods want the commute flag.
 
 StarPU 1.3.8
 ====================================================================

+ 2 - 1
doc/doxygen/chapters/310_data_management.doxy

@@ -643,7 +643,8 @@ struct starpu_codelet accumulate_variable_cl =
         .cpu_funcs = { accumulate_variable_cpu },
         .cpu_funcs_name = { "accumulate_variable_cpu" },
         .cuda_funcs = { accumulate_variable_cuda },
-        .nbuffers = 1,
+        .nbuffers = 2,
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 }
 \endcode
 

+ 2 - 2
examples/cg/cg_kernels.c

@@ -120,7 +120,7 @@ struct starpu_codelet accumulate_variable_cl =
 	.cuda_funcs = {accumulate_variable_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.model = &accumulate_variable_model
 };
@@ -164,7 +164,7 @@ struct starpu_codelet accumulate_vector_cl =
 	.cuda_funcs = {accumulate_vector_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.model = &accumulate_vector_model
 };

+ 1 - 1
examples/pi/pi_redux.c

@@ -322,7 +322,7 @@ static struct starpu_codelet redux_codelet =
 	.cuda_funcs = {redux_cuda_func},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2
 };
 

+ 1 - 1
examples/reductions/dot_product.c

@@ -211,7 +211,7 @@ static struct starpu_codelet redux_codelet =
 	.opencl_funcs = {redux_opencl_func},
 	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.name = "redux"
 };

+ 1 - 1
examples/reductions/minmax_reduction.c

@@ -95,7 +95,7 @@ static struct starpu_codelet minmax_redux_codelet =
 {
 	.cpu_funcs = {minmax_redux_cpu_func},
 	.cpu_funcs_name = {"minmax_redux_cpu_func"},
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.name = "redux"
 };

+ 4 - 2
include/starpu_data.h

@@ -560,8 +560,10 @@ struct starpu_codelet;
 /**
    Set the codelets to be used for \p handle when it is accessed in the
    mode ::STARPU_REDUX. Per-worker buffers will be initialized with
-   the codelet \p init_cl, and reduction between per-worker buffers will be
-   done with the codelet \p redux_cl.
+   the codelet \p init_cl (which has to take one handle with STARPU_W), and
+   reduction between per-worker buffers will be done with the codelet \p
+   redux_cl (which has to take a first accumulation handle with
+   STARPU_RW|STARPU_COMMUTE, and a second contribution handle with STARPU_R).
 */
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle, struct starpu_codelet *redux_cl, struct starpu_codelet *init_cl);
 

+ 1 - 1
mpi/tests/mpi_reduction.c

@@ -37,7 +37,7 @@ static struct starpu_codelet init_codelet =
 static struct starpu_codelet redux_codelet =
 {
 	.cpu_funcs = {redux_cpu_func},
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 #ifdef STARPU_SIMGRID
 	.model = &starpu_perfmodel_nop,

+ 11 - 2
src/datawizard/reduction.c

@@ -280,12 +280,21 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 					redux_task->cl = handle->redux_cl;
 					STARPU_ASSERT(redux_task->cl);
 					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 0)))
-						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_RW, 0);
+						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_RW|STARPU_COMMUTE, 0);
 					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 1)))
 						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_R, 1);
 
-					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 0) == STARPU_RW, "First parameter of reduction codelet %p has to be RW", redux_task->cl);
+					STARPU_ASSERT_MSG((STARPU_CODELET_GET_MODE(redux_task->cl, 0) & ~STARPU_COMMUTE) == STARPU_RW, "First parameter of reduction codelet %p has to be RW", redux_task->cl);
 					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 1) == STARPU_R, "Second parameter of reduction codelet %p has to be R", redux_task->cl);
+					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 0) & STARPU_COMMUTE))
+					{
+						static int warned;
+						if (!warned)
+						{
+							warned = 1;
+							_STARPU_DISP("Warning: for reductions, codelet %p should have STARPU_COMMUTE along STARPU_RW\n", redux_task->cl);
+						}
+					}
 
 					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i], 0);
 					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i+step], 1);

+ 1 - 1
tests/datawizard/increment_redux.c

@@ -137,7 +137,7 @@ static struct starpu_codelet redux_cl =
 #endif
 	.cpu_funcs = {redux_cpu_kernel},
 	.cpu_funcs_name = {"redux_cpu_kernel"},
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2
 };
 

+ 1 - 1
tests/datawizard/increment_redux_lazy.c

@@ -125,7 +125,7 @@ static struct starpu_codelet redux_cl =
 #endif
 	.cpu_funcs = {redux_cpu_kernel},
 	.cpu_funcs_name = {"redux_cpu_kernel"},
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2
 };
 

+ 1 - 1
tests/datawizard/increment_redux_v2.c

@@ -138,7 +138,7 @@ static struct starpu_codelet redux_cl =
 #endif
 	.cpu_funcs = {redux_cpu_kernel},
 	.cpu_funcs_name = {"redux_cpu_kernel"},
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2
 };
 

+ 1 - 1
tests/datawizard/redux_acquire.c

@@ -45,7 +45,7 @@ static struct starpu_codelet init_codelet =
 static struct starpu_codelet redux_codelet =
 {
 	.cpu_funcs = {redux_cpu_func},
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.name = "redux_codelet"
 };

+ 1 - 1
tests/microbenchs/parallel_redux_heterogeneous_tasks_data.c

@@ -127,7 +127,7 @@ static struct starpu_codelet cl_redux =
 	.opencl_funcs = { wait_OPENCL },
 	.cpu_funcs_name = { "wait_CPU" },
 	.nbuffers = 2,
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.flags = STARPU_CODELET_SIMGRID_EXECUTE,
 	.model = &perf_model_redux,
 	.name = "redux",

+ 1 - 1
tests/microbenchs/parallel_redux_homogeneous_tasks_data.c

@@ -96,7 +96,7 @@ static struct starpu_codelet cl_redux =
 	.opencl_funcs = { wait_homogeneous },
 	.cpu_funcs_name = { "wait_homogeneous" },
 	.nbuffers = 2,
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.flags = STARPU_CODELET_SIMGRID_EXECUTE,
 	.model = &perf_model_redux,
 	.name = "redux",