mariem makni, 5 years ago
Commit 6ab7f29c36

+ 90 - 4
doc/doxygen/chapters/310_data_management.doxy

@@ -54,6 +54,13 @@ starpu_data_handle_t vector_handle;
 starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
 \endcode
 
+Vectors can be partitioned into pieces by using
+starpu_vector_filter_block(). They can also be partitioned with some overlapping
+by using starpu_vector_filter_block_shadow(). By default StarPU
+uses the same size for all pieces. If different sizes are desired,
+starpu_vector_filter_list() or starpu_vector_filter_list_long() can be used
+instead. To simply divide the vector in two pieces, starpu_vector_filter_divide_in_2() can be used.
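+
+For instance, a minimal sketch of splitting the vector registered above into
+four equal pieces, working on them, and gathering them back:
+
+\code{.c}
+struct starpu_data_filter f =
+{
+	.filter_func = starpu_vector_filter_block,
+	.nchildren = 4
+};
+starpu_data_partition(vector_handle, &f);
+
+/* Retrieve e.g. the first piece and submit tasks on it */
+starpu_data_handle_t sub_handle = starpu_data_get_sub_data(vector_handle, 1, 0);
+
+/* ... submit tasks accessing sub_handle ... */
+
+/* Gather the pieces back into the initial handle */
+starpu_data_unpartition(vector_handle, STARPU_MAIN_RAM);
+\endcode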
+
 \subsection MatrixDataInterface Matrix Data Interface
 
 To register 2-D matrices with a potential padding, one can use the
@@ -67,9 +74,15 @@ matrix = (float*)malloc(width * height * sizeof(float));
 starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix, width, width, height, sizeof(float));
 \endcode
 
+2D matrices can be partitioned into sub-matrices along the x dimension by
+using starpu_matrix_filter_block(), and along the y dimension by using
+starpu_matrix_filter_vertical_block(). They can also be partitioned
+with some overlapping by using starpu_matrix_filter_block_shadow() and
+starpu_matrix_filter_vertical_block_shadow().
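+
+For instance, a minimal sketch of cutting the matrix registered above into a
+grid of 2x3 tiles by combining the two filters:
+
+\code{.c}
+struct starpu_data_filter f_x =
+{
+	.filter_func = starpu_matrix_filter_block,
+	.nchildren = 2
+};
+struct starpu_data_filter f_y =
+{
+	.filter_func = starpu_matrix_filter_vertical_block,
+	.nchildren = 3
+};
+starpu_data_map_filters(matrix_handle, 2, &f_x, &f_y);
+
+/* Retrieve the tile at position (0, 1) in the filter application order */
+starpu_data_handle_t tile = starpu_data_get_sub_data(matrix_handle, 2, 0, 1);
+\endcode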
+
 \subsection BlockDataInterface Block Data Interface
 
-To register 3-D blocks with potential paddings on Y and Z dimensions,
+To register 3-D matrices with potential paddings on Y and Z dimensions,
 one can use the block data interface. Here an example of how to
 register a block data to StarPU by using starpu_block_data_register().
 
@@ -80,6 +93,14 @@ block = (float*)malloc(nx*ny*nz*sizeof(float));
 starpu_block_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
 \endcode
 
+3D matrices can be partitioned along the x dimension by
+using starpu_block_filter_block(), along the y dimension
+by using starpu_block_filter_vertical_block(), or along the
+z dimension by using starpu_block_filter_depth_block(). They
+can also be partitioned with some overlapping by using
+starpu_block_filter_block_shadow(), starpu_block_filter_vertical_block_shadow(),
+or starpu_block_filter_depth_block_shadow().
+
 \subsection BCSRDataInterface BCSR Data Interface
 
 BCSR (Blocked Compressed Sparse Row Representation) sparse matrix data
@@ -147,10 +168,16 @@ starpu_bcsr_data_register(&bcsr_handle,
 StarPU provides an example on how to deal with such matrices in
 <c>examples/spmv</c>.
 
+BCSR data handles can be partitioned into their dense matrix blocks by using
+starpu_bcsr_filter_canonical_block().
+
 \subsection CSRDataInterface CSR Data Interface
 
 TODO
 
+CSR data handles can be partitioned into vertical CSR matrices by using
+starpu_csr_filter_vertical_block().
+
 \subsection VariableSizeDataInterface Data Interface with Variable Size
 
 Tasks are actually allowed to change the size of data interfaces.
@@ -763,7 +790,11 @@ A full example may be found in <c>examples/basic_examples/multiformat.c</c>.
 
 \section DefiningANewDataInterface Defining A New Data Interface
 
-Let's define a new data interface to manage complex numbers.
+This section shows an example of how to define your own data interface, for
+cases where the interfaces provided by StarPU do not fit your needs. Here we
+take the simple example of an array of complex numbers represented by two
+arrays of double values.
+
+Let's thus define a new data interface to manage arrays of complex numbers:
 
 \code{.c}
 /* interface for complex numbers */
@@ -775,6 +806,15 @@ struct starpu_complex_interface
 };
 \endcode
 
+That structure stores enough information to describe <b>one</b> buffer of such
+data. It is used for the buffer stored in the main memory, another instance
+is used for the buffer stored in a GPU, etc. A <i>data handle</i> is thus a
+collection of such structures, to keep track of each buffer on each memory node.
+
+Note: one should not keep pointers to such structures, because StarPU needs
+to be able to copy their content over to various places, for instance to
+efficiently migrate a data buffer from one data handle to another data handle.
+
 Registering such a data to StarPU is easily done using the function
 starpu_data_register(). The last
 parameter of the function, <c>interface_complex_ops</c>, will be
@@ -800,12 +840,41 @@ void starpu_complex_data_register(starpu_data_handle_t *handle,
 }
 \endcode
 
-The <c>starpu_complex_interface</c> structure is here used just to store the
+The <c>struct starpu_complex_interface complex</c> variable is here used just to store the
 parameters that the user provided to <c>starpu_complex_data_register</c>.
 starpu_data_register() will first allocate the handle, and
 then pass the <c>starpu_complex_interface</c> structure to the
 starpu_data_interface_ops::register_data_handle method, which records them
-within the data handle (it is called once per node by starpu_data_register()).
+within the data handle (it is called once per node by starpu_data_register()):
+
+\code{.c}
+static void complex_register_data_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
+{
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;
+
+	unsigned node;
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		struct starpu_complex_interface *local_interface = (struct starpu_complex_interface *)
+			starpu_data_get_interface_on_node(handle, node);
+
+		local_interface->nx = complex_interface->nx;
+		if (node == home_node)
+		{
+			local_interface->real = complex_interface->real;
+			local_interface->imaginary = complex_interface->imaginary;
+		}
+		else
+		{
+			local_interface->real = NULL;
+			local_interface->imaginary = NULL;
+		}
+	}
+}
+\endcode
+
+If the application provided a home node, the corresponding pointers are
+recorded for that node. The other nodes do not have a buffer allocated yet.
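+
+For illustration, registering such a data from the application is then done as
+for any built-in interface (a minimal sketch, assuming the remaining parameters
+of starpu_complex_data_register() are the two arrays and their number of
+elements NX):
+
+\code{.c}
+double real[NX];
+double imaginary[NX];
+starpu_data_handle_t complex_handle;
+
+starpu_complex_data_register(&complex_handle, STARPU_MAIN_RAM, real, imaginary, NX);
+\endcode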
 
 Different operations need to be defined for a data interface through
 the type starpu_data_interface_ops. We only define here the basic
@@ -932,4 +1001,21 @@ when the kernel does not make so many accesses to the second data, and thus data
 being remote e.g. over a PCI bus is not a performance problem, and avoids
 filling the fast local memory with data which does not need the performance.
 
+In cases where the kernel is fine with some data being either local or in the
+main memory, ::STARPU_SPECIFIC_NODE_LOCAL_OR_CPU can be used. StarPU will then
+be free to leave the data in the main memory and let the kernel access it from
+accelerators, or to move it to the accelerator before starting the kernel, for
+instance:
+
+\code{.c}
+struct starpu_codelet cl =
+{
+	.cuda_funcs = { kernel },
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R},
+	.specific_nodes = 1,
+	.nodes = {STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_LOCAL_OR_CPU},
+};
+\endcode
+
 */

+ 8 - 0
doc/doxygen/chapters/440_fpga_support.doxy

@@ -217,6 +217,14 @@ In the <c>main</c> function, there are four important steps:
 
 The rest of the application (data registration, task submission, etc.) is as usual with StarPU
 
+\subsection FPGADataTransfers Data Transfers in StarPU/FPGA Applications
+
+The communication between the host and the DFE is done through the <c>advanced
+dynamic interface</c>, to exchange data between the main memory and the local
+memory of the DFE.
+
+For now, we use \ref STARPU_MAIN_RAM to send data to and store data from the
+DFE's local memory. However, we aim to use a multiplexer to choose which memory
+node to read/write data from/to, so that the user can specify, for example,
+whether the computational kernel should take its data from the main memory or
+from the DFE's local memory.
+
+In StarPU applications, when \ref starpu_codelet::specific_nodes is 1, it
+specifies the memory nodes where each data should be sent for task execution.
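+
+For example, a codelet can request that its two input buffers be accessible from
+the CPU-side main memory while letting StarPU decide where to put the output
+buffer. A minimal sketch (the kernel names are placeholders), following the
+pattern of <c>tests/perfmodels/max_fpga.c</c>:
+
+\code{.c}
+static struct starpu_codelet cl =
+{
+	.cpu_funcs = {cpu_kernel},
+	.fpga_funcs = {fpga_kernel},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_W},
+	.specific_nodes = 1,
+	.nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL_OR_CPU},
+};
+\endcode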
+
 \subsection FPGAConfiguration FPGA Configuration
 
 To configure StarPU with FPGA accelerators, we can enable <c>FPGA</c> through the \c configure option <b>"--with-fpga"</b>.

+ 30 - 5
include/starpu_task.h

@@ -49,8 +49,7 @@ extern "C"
 */
 #define STARPU_NOWHERE	((1ULL)<<0)
 
-/**
-   To be used when setting the field starpu_codelet::where (or
+/**
+   To be used when setting the field starpu_codelet::where (or
    starpu_task::where) to specify the codelet (or the task) may be
    executed on a CPU processing unit.
 */
@@ -243,15 +242,41 @@ typedef starpu_mpi_ms_kernel_t (*starpu_mpi_ms_func_t)(void);
 #define STARPU_VARIABLE_NBUFFERS (-1)
 
 /**
-   Value to be set in the field starpu_codelet::nodes to request
-   StarPU to put the data in CPU-accessible memory (and let StarPU
-   choose the NUMA node).
+   Value to be set in the starpu_codelet::nodes field to request
+   StarPU to put the data in local memory of the worker running the task (this
+   is the default behavior).
 */
 #define STARPU_SPECIFIC_NODE_LOCAL (-1)
+
+/**
+   Value to be set in the starpu_codelet::nodes field to request
+   StarPU to put the data in CPU-accessible memory (and let StarPU
+   choose the NUMA node).
+*/
+#define STARPU_SPECIFIC_NODE_CPU (-2)
+
+/**
+   Value to be set in the starpu_codelet::nodes field to request
+   StarPU to put the data in some slow memory.
+*/
+#define STARPU_SPECIFIC_NODE_SLOW (-3)
+
+/**
+   Value to be set in the starpu_codelet::nodes field to request
+   StarPU to put the data in some fast memory.
+*/
+#define STARPU_SPECIFIC_NODE_FAST (-4)
 
+/**
+   Value to be set in the starpu_codelet::nodes field to let StarPU decide
+   whether to put the data in the local memory of the worker running the task,
+   or in CPU-accessible memory (and let StarPU choose the NUMA node).
+*/
+#define STARPU_SPECIFIC_NODE_LOCAL_OR_CPU (-5)
+
 struct starpu_task;
 
 /**

+ 1 - 1
src/core/task.c

@@ -795,7 +795,7 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 			/* Make sure handles are not partitioned */
 			STARPU_ASSERT_MSG(handle->nchildren == 0, "only unpartitioned data (or the pieces of a partitioned data) can be used in a task");
 			/* Make sure the specified node exists */
-			STARPU_ASSERT_MSG(node == STARPU_SPECIFIC_NODE_LOCAL || node == STARPU_SPECIFIC_NODE_CPU || node == STARPU_SPECIFIC_NODE_SLOW || (node >= 0 && node < (int) starpu_memory_nodes_get_count()), "The codelet-specified memory node does not exist");
+			STARPU_ASSERT_MSG(node == STARPU_SPECIFIC_NODE_LOCAL || node == STARPU_SPECIFIC_NODE_CPU || node == STARPU_SPECIFIC_NODE_SLOW || node == STARPU_SPECIFIC_NODE_LOCAL_OR_CPU || (node >= 0 && node < (int) starpu_memory_nodes_get_count()), "The codelet-specified memory node does not exist");
 			/* Provide the home interface for now if any,
 			 * for can_execute hooks */
 			if (handle->home_node != -1)

+ 19 - 0
src/core/topology.c

@@ -319,6 +319,16 @@ int _starpu_task_data_get_node_on_node(struct starpu_task *task, unsigned index,
 		// TODO: rather leave in DDR
 		node = local_node;
 		break;
+	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
+		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID)
+		{
+			/* It is here already, rather access it from here */
+			node = local_node;
+		}
+		else
+		{
+			/* It is not here already, do not bother moving it */
+			node = STARPU_MAIN_RAM;
+		}
+		break;
 	}
 	return node;
 }
@@ -343,6 +353,15 @@ int _starpu_task_data_get_node_on_worker(struct starpu_task *task, unsigned inde
 		// TODO: rather leave in DDR
 		node = local_node;
 		break;
+	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
+		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID)
+		{
+			/* It is here already, rather access it from here */
+			node = local_node;
+		}
+		else
+		{
+			/* It is not here already, do not bother moving it */
+			node = STARPU_MAIN_RAM;
+		}
+		break;
 	}
 	return node;
 }

+ 43 - 2
tests/datawizard/specific_node.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2015,2017                      CNRS
- * Copyright (C) 2010,2011,2013,2014,2016-2018            Université de Bordeaux
+ * Copyright (C) 2010,2011,2013,2014,2016-2019            Université de Bordeaux
  * Copyright (C) 2012,2017                                Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -32,6 +32,40 @@
 
 unsigned data, data2;
 
+void specific2_kernel(void *descr[], void *arg)
+{
+	(void)arg;
+	int node = starpu_task_get_current_data_node(0);
+	STARPU_ASSERT(node >= 0);
+	STARPU_ASSERT(starpu_node_get_kind(node) == STARPU_CPU_RAM);
+	unsigned *dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	if (node == STARPU_MAIN_RAM)
+		STARPU_ASSERT(dataptr == &data);
+
+	(*dataptr)++;
+
+	node = starpu_task_get_current_data_node(1);
+	STARPU_ASSERT(node >= 0);
+	STARPU_ASSERT(starpu_node_get_kind(node) == STARPU_CPU_RAM
+			|| (unsigned) node == starpu_worker_get_local_memory_node());
+	dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	if (node == STARPU_MAIN_RAM)
+		STARPU_ASSERT(dataptr == &data2);
+}
+
+static struct starpu_codelet specific2_cl =
+{
+	.cpu_funcs = {specific2_kernel},
+	.cuda_funcs = {specific2_kernel},
+	.opencl_funcs = {specific2_kernel},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW},
+	.specific_nodes = 1,
+	.nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL_OR_CPU},
+};
+
 void specific_kernel(void *descr[], void *arg)
 {
 	(void)arg;
@@ -42,7 +76,12 @@ void specific_kernel(void *descr[], void *arg)
 
 	if (node == STARPU_MAIN_RAM)
 		STARPU_ASSERT(dataptr == &data);
+
 	(*dataptr)++;
+
+	node = starpu_task_get_current_data_node(1);
+	STARPU_ASSERT((unsigned) node == starpu_worker_get_local_memory_node());
 }
 
 static struct starpu_codelet specific_cl =
@@ -123,8 +162,10 @@ int main(void)
 	for (i = 0; i < ntasks; i++)
 	{
 		struct starpu_task *task = starpu_task_create();
-		if (i%2)
+		if (i%3 == 0)
 			task->cl = &specific_cl;
+		else if (i%3 == 1)
+			task->cl = &specific2_cl;
 		else
 			task->cl = &cl;
 		task->handles[0] = data_handle;

+ 90 - 51
tests/perfmodels/max_fpga.c

@@ -1,53 +1,99 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011-2013,2015,2017                      CNRS
+ * Copyright (C) 2010,2011,2013,2014,2016-2018            Université de Bordeaux
+ * Copyright (C) 2012,2017                                Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Test using the specific_nodes field by forcing some data to main memory
+ * even if the task is run on the FPGA (DFE), and actually doing part of the
+ * computation from the CPU driving the DFE. It mixes such accesses and normal
+ * accesses from the DFE.
+ */
+
 #include <starpu.h>
 #include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
 #include <stdio.h>
 #include <starpu_scheduler.h>
 #include "../helper.h"
 
 #include "StreamFMA.h"
 #include "MaxSLiCInterface.h"
+
 #define SIZE 128
 
 static max_engine_t *engine ;
 static max_actions_t*act;
 static max_file_t *maxfile;
 
-void cpu_func(void *buffers[], void *cl_arg)
+void specific_kernel(void *descr[], void *arg)
 {
-    int *a = (int*) STARPU_VECTOR_GET_PTR(buffers[0]);
-    int *b = (int*) STARPU_VECTOR_GET_PTR(buffers[1]);
-    int *c = (int*) STARPU_VECTOR_GET_PTR(buffers[2]);
+    (void)arg;
+    int *a = (int*) STARPU_VECTOR_GET_PTR(descr[0]);
+    int *b = (int*) STARPU_VECTOR_GET_PTR(descr[1]);
+    int *c = (int*) STARPU_VECTOR_GET_PTR(descr[2]);
 
-    int size = STARPU_VECTOR_GET_NX(buffers[0]);
+    int node = starpu_task_get_current_data_node(0);
+    STARPU_ASSERT(node >= 0);
+    STARPU_ASSERT(starpu_node_get_kind(node) == STARPU_CPU_RAM);
 
-    (void)buffers;
-    (void)cl_arg;
+    int size = STARPU_VECTOR_GET_NX(descr[0]);
 
-    int i;
-    for (i = 0; i < size; i++)
-	c[i] = a[i] + b[i];
+    /* Compute the reference result from the CPU-side pointers */
+    int i;
+    for (i = 0; i < size; i++)
+        c[i] = a[i] + b[i];
 }
 
-void fpga_mult(void *buffers[], void *cl_arg)
-{   
-    (void)cl_arg;
-    
-    int *a = (int*) STARPU_VECTOR_GET_PTR(buffers[0]);
-    int *b = (int*) STARPU_VECTOR_GET_PTR(buffers[1]);
-    int *c = (int*) STARPU_VECTOR_GET_PTR(buffers[2]);
+void fpga_mult(void *descr[], void *arg)
+{
+    (void)arg;
+
+    int *a = (int*) STARPU_VECTOR_GET_PTR(descr[0]);
+    int *b = (int*) STARPU_VECTOR_GET_PTR(descr[1]);
+    int *c = (int*) STARPU_VECTOR_GET_PTR(descr[2]);
 
-    int size = STARPU_VECTOR_GET_NX(buffers[0]);
+    int size = STARPU_VECTOR_GET_NX(descr[0]);
+
+    /* The output buffer may be either in main memory or in the DFE local memory */
+    int node = starpu_task_get_current_data_node(2);
+
+    if (node == STARPU_MAIN_RAM)
+        printf("Output buffer is in main memory\n");
+    else
+        printf("Output buffer is in the DFE local memory\n");
 
     //Actions to run on an engine
     act = max_actions_init(maxfile, NULL);
 
     //set the number of ticks for a kernel
-    max_set_ticks  (act, "StreamFMAKernel", size);
+    max_set_ticks (act, "StreamFMAKernel", size);
+
+    max_queue_input(act, "a", a, size *sizeof(a[0]));
 
-    max_queue_input(act, "a", a, size *sizeof(a[0])); 
     max_queue_input(act, "b", b, size*sizeof(b[0]));
     max_queue_output(act,"output", c, size*sizeof(c[0]));
 
+
     //run actions on the engine
     printf("Running on DFE using dynamic interface ...\n");
 
@@ -57,20 +103,19 @@ void fpga_mult(void *buffers[], void *cl_arg)
     max_run_t *run0= max_run_nonblock(engine, act);
 
     printf("*** wait for the actions on DFE to complete *** \n");
+    //wait for the actions to complete
     max_wait(run0);
-   // max_run(engine, act);
-    
-  }
 
-static struct starpu_codelet cl =
+}
+
+static struct starpu_codelet specific_cl =
 {
-    .cpu_funcs = {cpu_func},
-    .cpu_funcs_name = {"cpu_func"},
-//#ifdef STARPU_USE_FPGA
+    .cpu_funcs = {specific_kernel},
     .fpga_funcs = {fpga_mult},
-//#endif
     .nbuffers = 3,
-    .modes = {STARPU_R, STARPU_R, STARPU_W}
+    .modes = {STARPU_R, STARPU_R, STARPU_W},
+    .specific_nodes = 1,
+    .nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL_OR_CPU},
 };
 
 
@@ -95,7 +140,7 @@ int main(int argc, char **argv)
     STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
     //Implementation of a maxfile
-     maxfile = StreamFMA_init();
+    maxfile = StreamFMA_init();
 
     //Implementation of an engine
     engine = max_load(maxfile, "*");
@@ -115,46 +160,40 @@ int main(int argc, char **argv)
     starpu_vector_data_register(&handle_b, STARPU_MAIN_RAM, (uintptr_t) &b, SIZE, sizeof(int));
     starpu_vector_data_register(&handle_c, STARPU_MAIN_RAM, (uintptr_t) &c, SIZE, sizeof(int));
 
+
     struct starpu_task *task = starpu_task_create();
-    task->cl = &cl;
+
+    task->cl = &specific_cl;
     task->handles[0] = handle_a;
     task->handles[1] = handle_b;
     task->handles[2] = handle_c;
     
     task->synchronous = 1;
     task->destroy = 0;
-    /* submit the task to StarPU */
-
-    //starpu_task_destroy(task);
     ret = starpu_task_submit(task);
-    
+
     fprintf(stderr,"task submitted %d\n", ret);
 
     starpu_data_unregister(handle_a);
     starpu_data_unregister(handle_b);
     starpu_data_unregister(handle_c);
-  
-	int mysize = SIZE;
-	if (mysize > 10)
-		mysize = 10;
-	for (i = 0; i < mysize; ++i) 
-	{
-		printf("%d == %d\n", c[i], a[i] + b[i]);
-	}
-
-#if 1
-// -> main
-    //deallocate the set of actions
+    
+
+    int mysize = SIZE;
+    if (mysize > 10)
+        mysize = 10;
+    for (i = 0; i < mysize; ++i)
+    {
+        printf("%d == %d\n", c[i], a[i] + b[i]);
+    }
+
     max_actions_free(act);
 
     //unload and deallocate an engine obtained by way of max_load
     max_unload(engine);
-#endif
 
     starpu_shutdown();
 
-    return EXIT_SUCCESS;
+   return EXIT_SUCCESS;
 
 }
-
-