mariem makni 5 years ago
parent
commit
6ab7f29c36

+ 90 - 4
doc/doxygen/chapters/310_data_management.doxy

@@ -54,6 +54,13 @@ starpu_data_handle_t vector_handle;
 starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
 \endcode
 
+Vectors can be partitioned into pieces by using
+starpu_vector_filter_block(). They can also be partitioned with some overlapping
+by using starpu_vector_filter_block_shadow(). By default StarPU
+uses the same size for each piece. If different sizes are desired,
+starpu_vector_filter_list() or starpu_vector_filter_list_long() can be used
+instead. To divide into just two pieces, starpu_vector_filter_divide_in_2() can be used.
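+
+For instance, the vector registered above can be split into <c>NPARTS</c> equal
+pieces (a minimal sketch, <c>NPARTS</c> being a placeholder for the desired
+number of pieces):
+
+\code{.c}
+struct starpu_data_filter f =
+{
+	.filter_func = starpu_vector_filter_block,
+	.nchildren = NPARTS
+};
+starpu_data_partition(vector_handle, &f);
+\endcode
+
+The i-th piece can then be retrieved with
+<c>starpu_data_get_sub_data(vector_handle, 1, i)</c>.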
+
 \subsection MatrixDataInterface Matrix Data Interface
 
 To register 2-D matrices with a potential padding, one can use the
@@ -67,9 +74,15 @@ matrix = (float*)malloc(width * height * sizeof(float));
 starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix, width, width, height, sizeof(float));
 \endcode
 
+2D matrices can be partitioned into 2D matrices along the x dimension by
+using starpu_matrix_filter_block(), and along the y dimension by using
+starpu_matrix_filter_vertical_block(). They can also be partitioned
+with some overlapping by using starpu_matrix_filter_block_shadow() and
+starpu_matrix_filter_vertical_block_shadow().
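+
+For instance, the matrix registered above can be split into <c>NPARTS</c>
+pieces along the x dimension (a minimal sketch, similar to the vector case):
+
+\code{.c}
+struct starpu_data_filter f =
+{
+	.filter_func = starpu_matrix_filter_block,
+	.nchildren = NPARTS
+};
+starpu_data_partition(matrix_handle, &f);
+\endcode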
+
 \subsection BlockDataInterface Block Data Interface
 
-To register 3-D blocks with potential paddings on Y and Z dimensions,
+To register 3-D matrices with potential paddings on Y and Z dimensions,
 one can use the block data interface. Here is an example of how to
 register a block data to StarPU by using starpu_block_data_register().
 
@@ -80,6 +93,14 @@ block = (float*)malloc(nx*ny*nz*sizeof(float));
 starpu_block_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
 \endcode
 
+3D matrices can be partitioned along the x dimension by
+using starpu_block_filter_block(), along the y dimension
+by using starpu_block_filter_vertical_block(), or along the
+z dimension by using starpu_block_filter_depth_block(). They
+can also be partitioned with some overlapping by using
+starpu_block_filter_block_shadow(), starpu_block_filter_vertical_block_shadow(),
+or starpu_block_filter_depth_block_shadow().
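+
+For instance, the block registered above can be split into <c>NPARTS</c>
+pieces along the x dimension (a minimal sketch, similar to the vector case):
+
+\code{.c}
+struct starpu_data_filter f =
+{
+	.filter_func = starpu_block_filter_block,
+	.nchildren = NPARTS
+};
+starpu_data_partition(block_handle, &f);
+\endcode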
+
 \subsection BCSRDataInterface BCSR Data Interface
 
 BCSR (Blocked Compressed Sparse Row Representation) sparse matrix data
@@ -147,10 +168,16 @@ starpu_bcsr_data_register(&bcsr_handle,
 StarPU provides an example on how to deal with such matrices in
 <c>examples/spmv</c>.
 
+BCSR data handles can be partitioned into their dense matrix blocks by using
+starpu_bcsr_filter_canonical_block().
+
 \subsection CSRDataInterface CSR Data Interface
 
 TODO
 
+CSR data handles can be partitioned into vertical CSR matrices by using
+starpu_csr_filter_vertical_block().
+
 \subsection VariableSizeDataInterface Data Interface with Variable Size
 
 Tasks are actually allowed to change the size of data interfaces.
@@ -763,7 +790,11 @@ A full example may be found in <c>examples/basic_examples/multiformat.c</c>.
 
 \section DefiningANewDataInterface Defining A New Data Interface
 
-Let's define a new data interface to manage complex numbers.
+This section proposes an example of how to define your own data interface, when
+the StarPU-provided interfaces do not fit your needs. Here we take a simple
+example of an array of complex numbers, represented by two arrays of double
+values.
+
+Let's thus define a new data interface to manage arrays of complex numbers:
 
 \code{.c}
 /* interface for complex numbers */
@@ -775,6 +806,15 @@ struct starpu_complex_interface
 };
 \endcode
 
+That structure stores enough information to describe <b>one</b> buffer of this
+kind of data. One instance is used for the buffer stored in the main memory,
+another instance is used for the buffer stored in a GPU, etc. A <i>data
+handle</i> is thus a collection of such structures, one per memory node, to
+keep track of each buffer on each memory node.
+
+Note: one should not keep pointers into such structures, because StarPU needs
+to be able to copy their content over to various places, for instance to
+efficiently migrate a data buffer from one data handle to another data handle.
+
 Registering such a data to StarPU is easily done using the function
 starpu_data_register(). The last
 parameter of the function, <c>interface_complex_ops</c>, will be
@@ -800,12 +840,41 @@ void starpu_complex_data_register(starpu_data_handle_t *handle,
 }
 \endcode
 
-The <c>starpu_complex_interface</c> structure is here used just to store the
+The <c>struct starpu_complex_interface complex</c> is here used just to store the
 parameters that the user provided to <c>starpu_complex_data_register</c>.
 starpu_data_register() will first allocate the handle, and
 then pass the <c>starpu_complex_interface</c> structure to the
 starpu_data_interface_ops::register_data_handle method, which records them
-within the data handle (it is called once per node by starpu_data_register()).
+within the data handle (it is called by starpu_data_register()):
+
+\code{.c}
+static void complex_register_data_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
+{
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;
+
+	unsigned node;
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		struct starpu_complex_interface *local_interface = (struct starpu_complex_interface *)
+			starpu_data_get_interface_on_node(handle, node);
+
+		local_interface->nx = complex_interface->nx;
+		if (node == home_node)
+		{
+			local_interface->real = complex_interface->real;
+			local_interface->imaginary = complex_interface->imaginary;
+		}
+		else
+		{
+			local_interface->real = NULL;
+			local_interface->imaginary = NULL;
+		}
+	}
+}
+\endcode
+
+If the application provided a home node, the corresponding pointers will be
+recorded for that node. Others have no buffer allocated yet.
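+
+For instance, a two-element array of complex numbers located in the main memory
+can then be registered as follows (a minimal sketch, assuming the remaining
+parameters of starpu_complex_data_register() are the home node, the two arrays,
+and the number of elements):
+
+\code{.c}
+double real[2] = {0.0, 1.0};
+double imaginary[2] = {2.0, 3.0};
+starpu_data_handle_t handle;
+
+starpu_complex_data_register(&handle, STARPU_MAIN_RAM, real, imaginary, 2);
+\endcode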
 
 Different operations need to be defined for a data interface through
 the type starpu_data_interface_ops. We only define here the basic
@@ -932,4 +1001,21 @@ when the kernel does not make so many accesses to the second data, and thus data
 being remote e.g. over a PCI bus is not a performance problem, and avoids
 filling the fast local memory with data which does not need the performance.
 
+In cases where the kernel is fine with some data being either local or in the
+main memory, ::STARPU_SPECIFIC_NODE_LOCAL_OR_CPU can be used. StarPU will then
+be free to leave the data in the main memory and let the kernel access it from
+accelerators, or to move it to the accelerator before starting the kernel, for
+instance:
+
+\code{.c}
+struct starpu_codelet cl =
+{
+	.cuda_funcs = { kernel },
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R},
+	.specific_nodes = 1,
+	.nodes = {STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_LOCAL_OR_CPU},
+};
+\endcode
+
 */

+ 8 - 0
doc/doxygen/chapters/440_fpga_support.doxy

@@ -217,6 +217,14 @@ In the <c>main</c> function, there are four important steps:
 
 The rest of the application (data registration, task submission, etc.) is as usual with StarPU
 
+\subsection FPGADataTransfers Data Transfers in StarPU/FPGA Applications
+
+The communication between the host and the DFE goes through the <c>Dynamic advanced interface</c>, which exchanges data between the main memory and the local memory of the DFE.
+For now, we use \ref STARPU_MAIN_RAM to send data to and retrieve data from the DFE's local memory. However, we aim to use a multiplexer to choose which memory node data will be read from or written to, so that the user can state, for example, whether the computational kernel takes its data from the main memory or from the DFE's local memory.
+
+In StarPU applications, when \ref starpu_codelet::specific_nodes is 1, this field specifies the memory node to which each piece of data should be sent for task execution.
+
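+For example, a codelet can request its two inputs in CPU-accessible memory and
+let StarPU decide where to put its output; a minimal sketch along the lines of
+<c>tests/perfmodels/max_fpga.c</c> (the kernel names are placeholders):
+
+\code{.c}
+static struct starpu_codelet cl =
+{
+	.cpu_funcs = {cpu_kernel},
+	.fpga_funcs = {fpga_kernel},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_W},
+	.specific_nodes = 1,
+	.nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL_OR_CPU},
+};
+\endcode
+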
 \subsection FPGAConfiguration FPGA Configuration
 
 To configure StarPU with FPGA accelerators, we can enable <c>FPGA</c> through the \c configure option <b>"--with-fpga"</b>.

+ 30 - 5
include/starpu_task.h

@@ -49,8 +49,7 @@ extern "C"
 */
 #define STARPU_NOWHERE	((1ULL)<<0)
 
-/**
-   To be used when setting the field starpu_codelet::where (or
+/**   To be used when setting the field starpu_codelet::where (or
    starpu_task::where) to specify the codelet (or the task) may be
    executed on a CPU processing unit.
 */
@@ -243,15 +242,41 @@ typedef starpu_mpi_ms_kernel_t (*starpu_mpi_ms_func_t)(void);
 #define STARPU_VARIABLE_NBUFFERS (-1)
 
 /**
-   Value to be set in the field starpu_codelet::nodes to request
-   StarPU to put the data in CPU-accessible memory (and let StarPU
-   choose the NUMA node).
+   Value to be set in the starpu_codelet::nodes field to request
+   StarPU to put the data in local memory of the worker running the task (this
+   is the default behavior).
 */
 #define STARPU_SPECIFIC_NODE_LOCAL (-1)
+
+/**
+    Value to be set in the starpu_codelet::nodes field to request
+    StarPU to put the data in CPU-accessible memory (and let StarPU
+    choose the NUMA node).
+*/
 #define STARPU_SPECIFIC_NODE_CPU (-2)
+
+/**
+    Value to be set in the starpu_codelet::nodes field to request
+    StarPU to put the data in some slow memory.
+*/
 #define STARPU_SPECIFIC_NODE_SLOW (-3)
+
+/**
+   Value to be set in the starpu_codelet::nodes field to request
+   StarPU to put the data in some fast memory.
+*/
 #define STARPU_SPECIFIC_NODE_FAST (-4)
 
+/**
+    Value to be set in the starpu_codelet::nodes field to let StarPU decide
+    whether to put the data in the local memory of the worker running the task,
+    or in CPU-accessible memory (and let StarPU choose the NUMA node).
+*/
+#define STARPU_SPECIFIC_NODE_LOCAL_OR_CPU (-5)
+
 struct starpu_task;
 
 /**

+ 1 - 1
src/core/task.c

@@ -795,7 +795,7 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 			/* Make sure handles are not partitioned */
 			STARPU_ASSERT_MSG(handle->nchildren == 0, "only unpartitioned data (or the pieces of a partitioned data) can be used in a task");
 			/* Make sure the specified node exists */
-			STARPU_ASSERT_MSG(node == STARPU_SPECIFIC_NODE_LOCAL || node == STARPU_SPECIFIC_NODE_CPU || node == STARPU_SPECIFIC_NODE_SLOW || (node >= 0 && node < (int) starpu_memory_nodes_get_count()), "The codelet-specified memory node does not exist");
+			STARPU_ASSERT_MSG(node == STARPU_SPECIFIC_NODE_LOCAL || node == STARPU_SPECIFIC_NODE_CPU || node == STARPU_SPECIFIC_NODE_SLOW || node == STARPU_SPECIFIC_NODE_LOCAL_OR_CPU || (node >= 0 && node < (int) starpu_memory_nodes_get_count()), "The codelet-specified memory node does not exist");
 			/* Provide the home interface for now if any,
 			 * for can_execute hooks */
 			if (handle->home_node != -1)

+ 19 - 0
src/core/topology.c

@@ -319,6 +319,16 @@ int _starpu_task_data_get_node_on_node(struct starpu_task *task, unsigned index,
 		// TODO: rather leave in DDR
 		node = local_node;
 		break;
+	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
+		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID)
+		{
+			/* It is here already, rather access it from here */
+			node = local_node;
+		}
+		else
+		{
+			/* It is not here already, do not bother moving it */
+			node = STARPU_MAIN_RAM;
+		}
+		break;
 	}
 	return node;
 }
@@ -343,6 +353,15 @@ int _starpu_task_data_get_node_on_worker(struct starpu_task *task, unsigned inde
 		// TODO: rather leave in DDR
 		node = local_node;
 		break;
+	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
+		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID)
+		{
+			/* It is here already, rather access it from here */
+			node = local_node;
+		}
+		else
+		{
+			/* It is not here already, do not bother moving it */
+			node = STARPU_MAIN_RAM;
+		}
+		break;
 	}
 	return node;
 }

+ 43 - 2
tests/datawizard/specific_node.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2015,2017                      CNRS
- * Copyright (C) 2010,2011,2013,2014,2016-2018            Université de Bordeaux
+ * Copyright (C) 2010,2011,2013,2014,2016-2019            Université de Bordeaux
  * Copyright (C) 2012,2017                                Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -32,6 +32,40 @@
 
 unsigned data, data2;
 
+void specific2_kernel(void *descr[], void *arg)
+{
+	(void)arg;
+	int node = starpu_task_get_current_data_node(0);
+	STARPU_ASSERT(node >= 0);
+	STARPU_ASSERT(starpu_node_get_kind(node) == STARPU_CPU_RAM);
+	unsigned *dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	if (node == STARPU_MAIN_RAM)
+		STARPU_ASSERT(dataptr == &data);
+
+	(*dataptr)++;
+
+	node = starpu_task_get_current_data_node(1);
+	STARPU_ASSERT(node >= 0);
+	STARPU_ASSERT(starpu_node_get_kind(node) == STARPU_CPU_RAM
+			|| (unsigned) node == starpu_worker_get_local_memory_node());
+	dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	if (node == STARPU_MAIN_RAM)
+		STARPU_ASSERT(dataptr == &data2);
+}
+
+static struct starpu_codelet specific2_cl =
+{
+	.cpu_funcs = {specific2_kernel},
+	.cuda_funcs = {specific2_kernel},
+	.opencl_funcs = {specific2_kernel},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW},
+	.specific_nodes = 1,
+	.nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL_OR_CPU},
+};
+
 void specific_kernel(void *descr[], void *arg)
 {
 	(void)arg;
@@ -42,7 +76,12 @@ void specific_kernel(void *descr[], void *arg)
 
 	if (node == STARPU_MAIN_RAM)
 		STARPU_ASSERT(dataptr == &data);
+
 	(*dataptr)++;
+
+	node = starpu_task_get_current_data_node(1);
+	STARPU_ASSERT((unsigned) node == starpu_worker_get_local_memory_node());
 }
 
 static struct starpu_codelet specific_cl =
@@ -123,8 +162,10 @@ int main(void)
 	for (i = 0; i < ntasks; i++)
 	{
 		struct starpu_task *task = starpu_task_create();
-		if (i%2)
+		if (i%3 == 0)
 			task->cl = &specific_cl;
+		else if (i%3 == 1)
+			task->cl = &specific2_cl;
 		else
 			task->cl = &cl;
 		task->handles[0] = data_handle;

+ 90 - 51
tests/perfmodels/max_fpga.c

@@ -1,53 +1,99 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011-2013,2015,2017                      CNRS
+ * Copyright (C) 2010,2011,2013,2014,2016-2018            Université de Bordeaux
+ * Copyright (C) 2012,2017                                Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Test using the specific_nodes field by forcing some data to main memory
+ * even if the task is run on the FPGA (the corresponding accesses are then
+ * performed by the CPU driving the FPGA). It mixes such accesses and normal
+ * accesses from the FPGA.
+ */
+
 #include <starpu.h>
 #include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
 #include <stdio.h>
 #include <starpu_scheduler.h>
 #include "../helper.h"
 
 #include "StreamFMA.h"
 #include "MaxSLiCInterface.h"
+
 #define SIZE 128
 
 static max_engine_t *engine ;
 static max_actions_t*act;
 static max_file_t *maxfile;
 
-void cpu_func(void *buffers[], void *cl_arg)
+void specific_kernel(void *descr[], void *arg)
 {
-    int *a = (int*) STARPU_VECTOR_GET_PTR(buffers[0]);
-    int *b = (int*) STARPU_VECTOR_GET_PTR(buffers[1]);
-    int *c = (int*) STARPU_VECTOR_GET_PTR(buffers[2]);
+    (void)arg;
+    int *a = (int*) STARPU_VECTOR_GET_PTR(descr[0]);
+    int *b = (int*) STARPU_VECTOR_GET_PTR(descr[1]);
+    int *c = (int*) STARPU_VECTOR_GET_PTR(descr[2]);
 
-    int size = STARPU_VECTOR_GET_NX(buffers[0]);
+    int node = starpu_task_get_current_data_node(0);
+    STARPU_ASSERT(node >= 0);
+    STARPU_ASSERT(starpu_node_get_kind(node) == STARPU_CPU_RAM);
 
-    (void)buffers;
-    (void)cl_arg;
+    int size = STARPU_VECTOR_GET_NX(descr[0]);
 
-    int i;
-    for (i = 0; i < size; i++)
-	c[i] = a[i] + b[i];
+    /* This is the CPU implementation, compute the result directly */
+    int i;
+    for (i = 0; i < size; i++)
+        c[i] = a[i] + b[i];
 }
 
-void fpga_mult(void *buffers[], void *cl_arg)
-{   
-    (void)cl_arg;
-    
-    int *a = (int*) STARPU_VECTOR_GET_PTR(buffers[0]);
-    int *b = (int*) STARPU_VECTOR_GET_PTR(buffers[1]);
-    int *c = (int*) STARPU_VECTOR_GET_PTR(buffers[2]);
+void fpga_mult(void *descr[], void *arg)
+{
+    (void)arg;
+
+    int *a = (int*) STARPU_VECTOR_GET_PTR(descr[0]);
+    int *b = (int*) STARPU_VECTOR_GET_PTR(descr[1]);
+    int *c = (int*) STARPU_VECTOR_GET_PTR(descr[2]);
 
-    int size = STARPU_VECTOR_GET_NX(buffers[0]);
+    int size = STARPU_VECTOR_GET_NX(descr[0]);
+
+    int node = starpu_task_get_current_data_node(2);
+
+    if (node == STARPU_MAIN_RAM)
+        printf("Output buffer is in the main memory\n");
+    else
+        printf("Output buffer is in the DFE local memory\n");
 
     //Actions to run on an engine
     act = max_actions_init(maxfile, NULL);
 
     //set the number of ticks for a kernel
-    max_set_ticks  (act, "StreamFMAKernel", size);
+    max_set_ticks (act, "StreamFMAKernel", size);
+
+    max_queue_input(act, "a", a, size *sizeof(a[0]));
 
-    max_queue_input(act, "a", a, size *sizeof(a[0])); 
     max_queue_input(act, "b", b, size*sizeof(b[0]));
     max_queue_output(act,"output", c, size*sizeof(c[0]));
 
+
     //run actions on the engine
     printf("Running on DFE using dynamic interface ...\n");
 
@@ -57,20 +103,19 @@ void fpga_mult(void *buffers[], void *cl_arg)
     max_run_t *run0= max_run_nonblock(engine, act);
 
     printf("*** wait for the actions on DFE to complete *** \n");
+    //wait for the actions to complete
     max_wait(run0);
-   // max_run(engine, act);
-    
-  }
 
-static struct starpu_codelet cl =
+}
+
+static struct starpu_codelet specific_cl =
 {
-    .cpu_funcs = {cpu_func},
-    .cpu_funcs_name = {"cpu_func"},
-//#ifdef STARPU_USE_FPGA
+    .cpu_funcs = {specific_kernel},
     .fpga_funcs = {fpga_mult},
     .fpga_funcs = {fpga_mult},
     .nbuffers = 3,
     .nbuffers = 3,
+    .modes = {STARPU_R,STARPU_R, STARPU_W},
+    .modes = {STARPU_R, STARPU_R, STARPU_W},
+    .nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL_OR_CPU},
 };
 };
 
 
@@ -95,7 +140,7 @@ int main(int argc, char **argv)
     STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
     //Implementation of a maxfile
-     maxfile = StreamFMA_init();
+    maxfile = StreamFMA_init();
 
     //Implementation of an engine
     engine = max_load(maxfile, "*");
@@ -115,46 +160,40 @@ int main(int argc, char **argv)
     starpu_vector_data_register(&handle_b, STARPU_MAIN_RAM, (uintptr_t) &b, SIZE, sizeof(int));
     starpu_vector_data_register(&handle_c, STARPU_MAIN_RAM, (uintptr_t) &c, SIZE, sizeof(int));
 
+
     struct starpu_task *task = starpu_task_create();
-    task->cl = &cl;
+
+    task->cl = &specific_cl;
     task->handles[0] = handle_a;
     task->handles[1] = handle_b;
     task->handles[2] = handle_c;
 
     task->synchronous = 1;
     task->destroy = 0;
-    /* submit the task to StarPU */
-
-    //starpu_task_destroy(task);
     ret = starpu_task_submit(task);
-    
+
     fprintf(stderr,"task submitted %d\n", ret);
 
     starpu_data_unregister(handle_a);
     starpu_data_unregister(handle_b);
     starpu_data_unregister(handle_c);
-  
-	int mysize = SIZE;
-	if (mysize > 10)
-		mysize = 10;
-	for (i = 0; i < mysize; ++i) 
-	{
-		printf("%d == %d\n", c[i], a[i] + b[i]);
-	}
-
-#if 1
-// -> main
-    //deallocate the set of actions
+
+    int mysize = SIZE;
+    if (mysize > 10)
+        mysize = 10;
+    for (i = 0; i < mysize; ++i)
+    {
+        printf("%d == %d\n", c[i], a[i] + b[i]);
+    }
+
     max_actions_free(act);
 
     //unload and deallocate an engine obtained by way of max_load
     max_unload(engine);
-#endif
 
     starpu_shutdown();
 
-    return EXIT_SUCCESS;
+    return EXIT_SUCCESS;
 
 }
-
-