mariem makni, 5 years ago
Commit 6ab7f29c36

+ 90 - 4
doc/doxygen/chapters/310_data_management.doxy

@@ -54,6 +54,13 @@ starpu_data_handle_t vector_handle;
 starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
 \endcode
 
+Vectors can be partitioned into pieces by using
+starpu_vector_filter_block(). They can also be partitioned with some overlapping
+by using starpu_vector_filter_block_shadow(). By default StarPU
+uses the same size for all pieces. If different sizes are desired,
+starpu_vector_filter_list() or starpu_vector_filter_list_long() can be used
+instead. To simply divide the vector in two pieces, starpu_vector_filter_divide_in_2() can be used.
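+
+For instance, a minimal sketch of splitting the vector registered above into
+four equal pieces, working on them, and gathering them back:
+
+\code{.c}
+struct starpu_data_filter f =
+{
+	.filter_func = starpu_vector_filter_block,
+	.nchildren = 4
+};
+starpu_data_partition(vector_handle, &f);
+
+/* Retrieve e.g. the first piece and submit tasks on it */
+starpu_data_handle_t sub_handle = starpu_data_get_sub_data(vector_handle, 1, 0);
+
+/* ... submit tasks accessing sub_handle ... */
+
+/* Gather the pieces back into the initial handle */
+starpu_data_unpartition(vector_handle, STARPU_MAIN_RAM);
+\endcode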
+
 \subsection MatrixDataInterface Matrix Data Interface
 
 To register 2-D matrices with a potential padding, one can use the
@@ -67,9 +74,15 @@ matrix = (float*)malloc(width * height * sizeof(float));
 starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix, width, width, height, sizeof(float));
 \endcode
 
+2D matrices can be partitioned into sub-matrices along the x dimension by
+using starpu_matrix_filter_block(), and along the y dimension by using
+starpu_matrix_filter_vertical_block(). They can also be partitioned
+with some overlapping by using starpu_matrix_filter_block_shadow() and
+starpu_matrix_filter_vertical_block_shadow().
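+
+For instance, a minimal sketch of cutting the matrix registered above into a
+grid of 2x3 tiles by combining the two filters:
+
+\code{.c}
+struct starpu_data_filter f_x =
+{
+	.filter_func = starpu_matrix_filter_block,
+	.nchildren = 2
+};
+struct starpu_data_filter f_y =
+{
+	.filter_func = starpu_matrix_filter_vertical_block,
+	.nchildren = 3
+};
+starpu_data_map_filters(matrix_handle, 2, &f_x, &f_y);
+
+/* Retrieve the tile at position (0, 1) in the filter application order */
+starpu_data_handle_t tile = starpu_data_get_sub_data(matrix_handle, 2, 0, 1);
+\endcode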
+
 \subsection BlockDataInterface Block Data Interface
 
-To register 3-D blocks with potential paddings on Y and Z dimensions,
+To register 3-D matrices with potential paddings on Y and Z dimensions,
 one can use the block data interface. Here an example of how to
 register a block data to StarPU by using starpu_block_data_register().
 
@@ -80,6 +93,14 @@ block = (float*)malloc(nx*ny*nz*sizeof(float));
 starpu_block_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
 \endcode
 
+3D matrices can be partitioned along the x dimension by
+using starpu_block_filter_block(), along the y dimension
+by using starpu_block_filter_vertical_block(), or along the
+z dimension by using starpu_block_filter_depth_block(). They
+can also be partitioned with some overlapping by using
+starpu_block_filter_block_shadow(), starpu_block_filter_vertical_block_shadow(),
+or starpu_block_filter_depth_block_shadow().
+
 \subsection BCSRDataInterface BCSR Data Interface
 
 BCSR (Blocked Compressed Sparse Row Representation) sparse matrix data
@@ -147,10 +168,16 @@ starpu_bcsr_data_register(&bcsr_handle,
 StarPU provides an example on how to deal with such matrices in
 <c>examples/spmv</c>.
 
+BCSR data handles can be partitioned into their dense matrix blocks by using
+starpu_bcsr_filter_canonical_block().
+
 \subsection CSRDataInterface CSR Data Interface
 
 TODO
 
+CSR data handles can be partitioned into vertical CSR matrices by using
+starpu_csr_filter_vertical_block().
+
 \subsection VariableSizeDataInterface Data Interface with Variable Size
 
 Tasks are actually allowed to change the size of data interfaces.
@@ -763,7 +790,11 @@ A full example may be found in <c>examples/basic_examples/multiformat.c</c>.
 
 \section DefiningANewDataInterface Defining A New Data Interface
 
-Let's define a new data interface to manage complex numbers.
+This section shows an example of how to define your own data interface, for
+cases where the interfaces provided by StarPU do not fit your needs. Here we
+take the simple example of an array of complex numbers represented by two
+arrays of double values.
+
+Let's thus define a new data interface to manage arrays of complex numbers:
 
 \code{.c}
 /* interface for complex numbers */
@@ -775,6 +806,15 @@ struct starpu_complex_interface
 };
 \endcode
 
+That structure stores enough information to describe <b>one</b> buffer of such
+data. It is used for the buffer stored in the main memory, another instance
+is used for the buffer stored in a GPU, etc. A <i>data handle</i> is thus a
+collection of such structures, to keep track of each buffer on each memory node.
+
+Note: one should not keep pointers to such structures, because StarPU needs
+to be able to copy their content over to various places, for instance to
+efficiently migrate a data buffer from one data handle to another data handle.
+
 Registering such a data to StarPU is easily done using the function
 starpu_data_register(). The last
 parameter of the function, <c>interface_complex_ops</c>, will be
@@ -800,12 +840,41 @@ void starpu_complex_data_register(starpu_data_handle_t *handle,
 }
 \endcode
 
-The <c>starpu_complex_interface</c> structure is here used just to store the
+The <c>struct starpu_complex_interface complex</c> variable is here used just to store the
 parameters that the user provided to <c>starpu_complex_data_register</c>.
 starpu_data_register() will first allocate the handle, and
 then pass the <c>starpu_complex_interface</c> structure to the
 starpu_data_interface_ops::register_data_handle method, which records them
-within the data handle (it is called once per node by starpu_data_register()).
+within the data handle (it is called once per node by starpu_data_register()):
+
+\code{.c}
+static void complex_register_data_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
+{
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;
+
+	unsigned node;
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		struct starpu_complex_interface *local_interface = (struct starpu_complex_interface *)
+			starpu_data_get_interface_on_node(handle, node);
+
+		local_interface->nx = complex_interface->nx;
+		if (node == home_node)
+		{
+			local_interface->real = complex_interface->real;
+			local_interface->imaginary = complex_interface->imaginary;
+		}
+		else
+		{
+			local_interface->real = NULL;
+			local_interface->imaginary = NULL;
+		}
+	}
+}
+\endcode
+
+If the application provided a home node, the corresponding pointers are
+recorded for that node. The other nodes do not have a buffer allocated yet.
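+
+For illustration, registering such a data from the application is then done as
+for any built-in interface (a minimal sketch, assuming the remaining parameters
+of starpu_complex_data_register() are the two arrays and their number of
+elements NX):
+
+\code{.c}
+double real[NX];
+double imaginary[NX];
+starpu_data_handle_t complex_handle;
+
+starpu_complex_data_register(&complex_handle, STARPU_MAIN_RAM, real, imaginary, NX);
+\endcode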
 
 Different operations need to be defined for a data interface through
 the type starpu_data_interface_ops. We only define here the basic
@@ -932,4 +1001,21 @@ when the kernel does not make so many accesses to the second data, and thus data
 being remote e.g. over a PCI bus is not a performance problem, and avoids
 filling the fast local memory with data which does not need the performance.
 
+In cases where the kernel is fine with some data being either local or in the
+main memory, ::STARPU_SPECIFIC_NODE_LOCAL_OR_CPU can be used. StarPU will then
+be free to leave the data in the main memory and let the kernel access it from
+accelerators, or to move it to the accelerator before starting the kernel, for
+instance:
+
+\code{.c}
+struct starpu_codelet cl =
+{
+	.cuda_funcs = { kernel },
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R},
+	.specific_nodes = 1,
+	.nodes = {STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_LOCAL_OR_CPU},
+};
+\endcode
+
 */

+ 8 - 0
doc/doxygen/chapters/440_fpga_support.doxy

@@ -217,6 +217,14 @@ In the <c>main</c> function, there are four important steps:
 
 The rest of the application (data registration, task submission, etc.) is as usual with StarPU
 
+\subsection FPGADataTransfers Data Transfers in StarPU/FPGA Applications
+
+The communication between the host and the DFE is done through the <c>advanced
+dynamic interface</c>, to exchange data between the main memory and the local
+memory of the DFE.
+
+For now, we use \ref STARPU_MAIN_RAM to send data to and store data from the
+DFE's local memory. However, we aim to use a multiplexer to choose which memory
+node to read/write data from/to, so that the user can specify, for example,
+whether the computational kernel should take its data from the main memory or
+from the DFE's local memory.
+
+In StarPU applications, when \ref starpu_codelet::specific_nodes is 1, it
+specifies the memory nodes where each data should be sent for task execution.
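+
+For example, a codelet can request that its two input buffers be accessible from
+the CPU-side main memory while letting StarPU decide where to put the output
+buffer. A minimal sketch (the kernel names are placeholders), following the
+pattern of <c>tests/perfmodels/max_fpga.c</c>:
+
+\code{.c}
+static struct starpu_codelet cl =
+{
+	.cpu_funcs = {cpu_kernel},
+	.fpga_funcs = {fpga_kernel},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_W},
+	.specific_nodes = 1,
+	.nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL_OR_CPU},
+};
+\endcode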
+
 \subsection FPGAConfiguration FPGA Configuration
 
 To configure StarPU with FPGA accelerators, we can enable <c>FPGA</c> through the \c configure option <b>"--with-fpga"</b>.

+ 30 - 5
include/starpu_task.h

@@ -49,8 +49,7 @@ extern "C"
 */
 #define STARPU_NOWHERE	((1ULL)<<0)
 
-/**
-   To be used when setting the field starpu_codelet::where (or
+/**
+   To be used when setting the field starpu_codelet::where (or
    starpu_task::where) to specify the codelet (or the task) may be
    executed on a CPU processing unit.
 */
@@ -243,15 +242,41 @@ typedef starpu_mpi_ms_kernel_t (*starpu_mpi_ms_func_t)(void);
 #define STARPU_VARIABLE_NBUFFERS (-1)
 
 /**
-   Value to be set in the field starpu_codelet::nodes to request
-   StarPU to put the data in CPU-accessible memory (and let StarPU
-   choose the NUMA node).
+   Value to be set in the starpu_codelet::nodes field to request
+   StarPU to put the data in local memory of the worker running the task (this
+   is the default behavior).
 */
 #define STARPU_SPECIFIC_NODE_LOCAL (-1)
+
+/**
+   Value to be set in the starpu_codelet::nodes field to request
+   StarPU to put the data in CPU-accessible memory (and let StarPU
+   choose the NUMA node).
+*/
+#define STARPU_SPECIFIC_NODE_CPU (-2)
+
+/**
+   Value to be set in the starpu_codelet::nodes field to request
+   StarPU to put the data in some slow memory.
+*/
+#define STARPU_SPECIFIC_NODE_SLOW (-3)
+
+/**
+   Value to be set in the starpu_codelet::nodes field to request
+   StarPU to put the data in some fast memory.
+*/
+#define STARPU_SPECIFIC_NODE_FAST (-4)
 
+/**
+   Value to be set in the starpu_codelet::nodes field to let StarPU decide
+   whether to put the data in the local memory of the worker running the task,
+   or in CPU-accessible memory (and let StarPU choose the NUMA node).
+*/
+#define STARPU_SPECIFIC_NODE_LOCAL_OR_CPU (-5)
+
 struct starpu_task;
 
 /**

+ 1 - 1
src/core/task.c

@@ -795,7 +795,7 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 			/* Make sure handles are not partitioned */
 			STARPU_ASSERT_MSG(handle->nchildren == 0, "only unpartitioned data (or the pieces of a partitioned data) can be used in a task");
 			/* Make sure the specified node exists */
-			STARPU_ASSERT_MSG(node == STARPU_SPECIFIC_NODE_LOCAL || node == STARPU_SPECIFIC_NODE_CPU || node == STARPU_SPECIFIC_NODE_SLOW || (node >= 0 && node < (int) starpu_memory_nodes_get_count()), "The codelet-specified memory node does not exist");
+			STARPU_ASSERT_MSG(node == STARPU_SPECIFIC_NODE_LOCAL || node == STARPU_SPECIFIC_NODE_CPU || node == STARPU_SPECIFIC_NODE_SLOW || node == STARPU_SPECIFIC_NODE_LOCAL_OR_CPU || (node >= 0 && node < (int) starpu_memory_nodes_get_count()), "The codelet-specified memory node does not exist");
 			/* Provide the home interface for now if any,
 			 * for can_execute hooks */
 			if (handle->home_node != -1)

+ 19 - 0
src/core/topology.c

@@ -319,6 +319,16 @@ int _starpu_task_data_get_node_on_node(struct starpu_task *task, unsigned index,
 		// TODO: rather leave in DDR
 		node = local_node;
 		break;
+	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
+		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID)
+		{
+			/* It is here already, rather access it from here */
+			node = local_node;
+		}
+		else
+		{
+			/* It is not here already, do not bother moving it */
+			node = STARPU_MAIN_RAM;
+		}
+		break;
 	}
 	return node;
 }
@@ -343,6 +353,15 @@ int _starpu_task_data_get_node_on_worker(struct starpu_task *task, unsigned inde
 		// TODO: rather leave in DDR
 		node = local_node;
 		break;
+	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
+		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID)
+		{
+			/* It is here already, rather access it from here */
+			node = local_node;
+		}
+		else
+		{
+			/* It is not here already, do not bother moving it */
+			node = STARPU_MAIN_RAM;
+		}
+		break;
 	}
 	return node;
 }

+ 43 - 2
tests/datawizard/specific_node.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2015,2017                      CNRS
- * Copyright (C) 2010,2011,2013,2014,2016-2018            Université de Bordeaux
+ * Copyright (C) 2010,2011,2013,2014,2016-2019            Université de Bordeaux
  * Copyright (C) 2012,2017                                Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -32,6 +32,40 @@
 
 unsigned data, data2;
 
+void specific2_kernel(void *descr[], void *arg)
+{
+	(void)arg;
+	int node = starpu_task_get_current_data_node(0);
+	STARPU_ASSERT(node >= 0);
+	STARPU_ASSERT(starpu_node_get_kind(node) == STARPU_CPU_RAM);
+	unsigned *dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	if (node == STARPU_MAIN_RAM)
+		STARPU_ASSERT(dataptr == &data);
+
+	(*dataptr)++;
+
+	node = starpu_task_get_current_data_node(1);
+	STARPU_ASSERT(node >= 0);
+	STARPU_ASSERT(starpu_node_get_kind(node) == STARPU_CPU_RAM
+			|| (unsigned) node == starpu_worker_get_local_memory_node());
+	dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	if (node == STARPU_MAIN_RAM)
+		STARPU_ASSERT(dataptr == &data2);
+}
+
+static struct starpu_codelet specific2_cl =
+{
+	.cpu_funcs = {specific2_kernel},
+	.cuda_funcs = {specific2_kernel},
+	.opencl_funcs = {specific2_kernel},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW},
+	.specific_nodes = 1,
+	.nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL_OR_CPU},
+};
+
 void specific_kernel(void *descr[], void *arg)
 {
 	(void)arg;
@@ -42,7 +76,12 @@ void specific_kernel(void *descr[], void *arg)
 
 	if (node == STARPU_MAIN_RAM)
 		STARPU_ASSERT(dataptr == &data);
+
 	(*dataptr)++;
+
+	node = starpu_task_get_current_data_node(1);
+	STARPU_ASSERT((unsigned) node == starpu_worker_get_local_memory_node());
 }
 
 static struct starpu_codelet specific_cl =
@@ -123,8 +162,10 @@ int main(void)
 	for (i = 0; i < ntasks; i++)
 	{
 		struct starpu_task *task = starpu_task_create();
-		if (i%2)
+		if (i%3 == 0)
 			task->cl = &specific_cl;
+		else if (i%3 == 1)
+			task->cl = &specific2_cl;
 		else
 			task->cl = &cl;
 		task->handles[0] = data_handle;

+ 90 - 51
tests/perfmodels/max_fpga.c

@@ -1,53 +1,99 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011-2013,2015,2017                      CNRS
+ * Copyright (C) 2010,2011,2013,2014,2016-2018            Université de Bordeaux
+ * Copyright (C) 2012,2017                                Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Test using the specific_nodes field by forcing some data to main memory
+ * even if the task is run on the FPGA (DFE), and actually doing part of the
+ * computation from the CPU driving the DFE. It mixes such accesses and normal
+ * accesses from the DFE.
+ */
+
 #include <starpu.h>
 #include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
 #include <stdio.h>
 #include <starpu_scheduler.h>
 #include "../helper.h"
 
 #include "StreamFMA.h"
 #include "MaxSLiCInterface.h"
+
 #define SIZE 128
 
 static max_engine_t *engine ;
 static max_actions_t*act;
 static max_file_t *maxfile;
 
-void cpu_func(void *buffers[], void *cl_arg)
+void specific_kernel(void *descr[], void *arg)
 {
-    int *a = (int*) STARPU_VECTOR_GET_PTR(buffers[0]);
-    int *b = (int*) STARPU_VECTOR_GET_PTR(buffers[1]);
-    int *c = (int*) STARPU_VECTOR_GET_PTR(buffers[2]);
+    (void)arg;
+    int *a = (int*) STARPU_VECTOR_GET_PTR(descr[0]);
+    int *b = (int*) STARPU_VECTOR_GET_PTR(descr[1]);
+    int *c = (int*) STARPU_VECTOR_GET_PTR(descr[2]);
 
-    int size = STARPU_VECTOR_GET_NX(buffers[0]);
+    int node = starpu_task_get_current_data_node(0);
+    STARPU_ASSERT(node >= 0);
+    STARPU_ASSERT(starpu_node_get_kind(node) == STARPU_CPU_RAM);
 
-    (void)buffers;
-    (void)cl_arg;
+    int size = STARPU_VECTOR_GET_NX(descr[0]);
 
-    int i;
-    for (i = 0; i < size; i++)
-	c[i] = a[i] + b[i];
+    /* Compute the reference result from the CPU-side pointers */
+    int i;
+    for (i = 0; i < size; i++)
+        c[i] = a[i] + b[i];
 }
 
-void fpga_mult(void *buffers[], void *cl_arg)
-{   
-    (void)cl_arg;
-    
-    int *a = (int*) STARPU_VECTOR_GET_PTR(buffers[0]);
-    int *b = (int*) STARPU_VECTOR_GET_PTR(buffers[1]);
-    int *c = (int*) STARPU_VECTOR_GET_PTR(buffers[2]);
+void fpga_mult(void *descr[], void *arg)
+{
+    (void)arg;
+
+    int *a = (int*) STARPU_VECTOR_GET_PTR(descr[0]);
+    int *b = (int*) STARPU_VECTOR_GET_PTR(descr[1]);
+    int *c = (int*) STARPU_VECTOR_GET_PTR(descr[2]);
 
-    int size = STARPU_VECTOR_GET_NX(buffers[0]);
+    int size = STARPU_VECTOR_GET_NX(descr[0]);
+
+    /* The output buffer may be either in main memory or in the DFE local memory */
+    int node = starpu_task_get_current_data_node(2);
+
+    if (node == STARPU_MAIN_RAM)
+        printf("Output buffer is in main memory\n");
+    else
+        printf("Output buffer is in the DFE local memory\n");
 
     //Actions to run on an engine
     act = max_actions_init(maxfile, NULL);
 
     //set the number of ticks for a kernel
-    max_set_ticks  (act, "StreamFMAKernel", size);
+    max_set_ticks (act, "StreamFMAKernel", size);
+
+    max_queue_input(act, "a", a, size *sizeof(a[0]));
 
-    max_queue_input(act, "a", a, size *sizeof(a[0])); 
     max_queue_input(act, "b", b, size*sizeof(b[0]));
     max_queue_output(act,"output", c, size*sizeof(c[0]));
 
+
     //run actions on the engine
     printf("Running on DFE using dynamic interface ...\n");
 
@@ -57,20 +103,19 @@ void fpga_mult(void *buffers[], void *cl_arg)
     max_run_t *run0= max_run_nonblock(engine, act);
 
     printf("*** wait for the actions on DFE to complete *** \n");
+    //wait for the actions to complete
     max_wait(run0);
-   // max_run(engine, act);
-    
-  }
 
-static struct starpu_codelet cl =
+}
+
+static struct starpu_codelet specific_cl =
 {
-    .cpu_funcs = {cpu_func},
-    .cpu_funcs_name = {"cpu_func"},
-//#ifdef STARPU_USE_FPGA
+    .cpu_funcs = {specific_kernel},
     .fpga_funcs = {fpga_mult},
-//#endif
     .nbuffers = 3,
-    .modes = {STARPU_R, STARPU_R, STARPU_W}
+    .modes = {STARPU_R, STARPU_R, STARPU_W},
+    .specific_nodes = 1,
+    .nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL_OR_CPU},
 };
 
 
@@ -95,7 +140,7 @@ int main(int argc, char **argv)
     STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
     //Implementation of a maxfile
-     maxfile = StreamFMA_init();
+    maxfile = StreamFMA_init();
 
     //Implementation of an engine
     engine = max_load(maxfile, "*");
@@ -115,46 +160,40 @@ int main(int argc, char **argv)
     starpu_vector_data_register(&handle_b, STARPU_MAIN_RAM, (uintptr_t) &b, SIZE, sizeof(int));
     starpu_vector_data_register(&handle_c, STARPU_MAIN_RAM, (uintptr_t) &c, SIZE, sizeof(int));
 
+
     struct starpu_task *task = starpu_task_create();
-    task->cl = &cl;
+
+    task->cl = &specific_cl;
     task->handles[0] = handle_a;
     task->handles[1] = handle_b;
     task->handles[2] = handle_c;
     
     task->synchronous = 1;
     task->destroy = 0;
-    /* submit the task to StarPU */
-
-    //starpu_task_destroy(task);
     ret = starpu_task_submit(task);
-    
+
     fprintf(stderr,"task submitted %d\n", ret);
 
     starpu_data_unregister(handle_a);
     starpu_data_unregister(handle_b);
     starpu_data_unregister(handle_c);
-  
-	int mysize = SIZE;
-	if (mysize > 10)
-		mysize = 10;
-	for (i = 0; i < mysize; ++i) 
-	{
-		printf("%d == %d\n", c[i], a[i] + b[i]);
-	}
-
-#if 1
-// -> main
-    //deallocate the set of actions
+    
+
+    int mysize = SIZE;
+    if (mysize > 10)
+        mysize = 10;
+    for (i = 0; i < mysize; ++i)
+    {
+        printf("%d == %d\n", c[i], a[i] + b[i]);
+    }
+
     max_actions_free(act);
 
     //unload and deallocate an engine obtained by way of max_load
     max_unload(engine);
-#endif
 
     starpu_shutdown();
 
-    return EXIT_SUCCESS;
+   return EXIT_SUCCESS;
 
 }
-
-