5 år sedan · 16ce791ff6
--- a/doc/doxygen/440_fpga_support.doxy
+++ b/doc/doxygen/440_fpga_support.doxy
@@ -1,48 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2019                          CNRS
			
 
				- * Copyright (C) 2019                          Inria
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-/*! \page FPGASupport FPGA Support
			
 
				-
			
 
				-\section FPGA FPGA 
			
 
				-
			
 
				-The use of specialized hardware such as accelerators or coprocessors offers an
			
 
				-interesting approach to overcome the physical limits encountered by processor
			
 
				-architects. As a result, many machines are now equipped with one or several
			
 
				-accelerators (e.g. a GPU), in addition to the usual processor(s). While a lot of
			
 
				-efforts have been devoted to offload computation onto such accelerators, very
			
 
				-little attention as been paid to portability concerns on the one hand, and to the
			
 
				-possibility of having heterogeneous accelerators and processors to interact on the other hand.
			
 
				-
			
 
				- In addition, StarPU comes with programming language support, in the form of an OpenCL front-end (\ref FPGAextensions).
			
 
				-
			
 
				-\section PortingApplicationsToFPGA Porting Applications To FPGA
			
 
				-
			
 
				-The way to port an application to FPGA is to set the field
			
 
				-starpu_codelet::fpga_funcs, to provide StarPU with the function
			
 
				-for FPGA implementation, so for instance:
			
 
				-
			
 
				-\verbatim
			
 
				-struct starpu_codelet cl =
			
 
				-{
			
 
				-    .fpga_funcs = {myfunc},
			
 
				-    .nbuffers = 1,
			
 
				-}
			
 
				-\endverbatim
			
 
				-
			
 
				-
			
 
				-
			
 
				-*/
			
--- a/doc/doxygen/chapters/440_fpga_support.doxy
+++ b/doc/doxygen/chapters/440_fpga_support.doxy
@@ -1,7 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2019                          CNRS
			
 
				- * Copyright (C) 2019                          Inria
			
 
				+ * Copyright (C) 2019-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/doc/doxygen/chapters/api/fpga_extensions.doxy
+++ b/doc/doxygen/chapters/api/fpga_extensions.doxy
@@ -1,7 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2019                                     CNRS
			
 
				- * Copyright (C) 2019                                     Inria
			
 
				+ * Copyright (C) 2019-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/examples/basic_examples/mult-fpga.c
+++ b/examples/basic_examples/mult-fpga.c
@@ -1,8 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2011, 2013, 2015  Université de Bordeaux
			
 
				- * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
			
 
				+ * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ * Copyright (C) 2010       Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -325,11 +325,9 @@ struct starpu_data_copy_methods
 
				 	   core.
			
 
				 	*/
			
 
				 
			
 
				-
			
 
				 	int (*ram_to_fpga_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				 	int (*fpga_to_ram_async)(void *src_interface, unsigned srd_node, void *dst_interface, unsigned dst_node);
			
 
				 
			
 
				-
			
 
				 	int (*mic_to_ram_async)(void *src_interface, unsigned srd_node, void *dst_interface, unsigned dst_node);
			
 
				 
			
 
				 	/**
			
--- a/include/starpu_driver.h
+++ b/include/starpu_driver.h
@@ -54,7 +54,7 @@ struct starpu_driver
 
				 		unsigned cpu_id;
			
 
				 		unsigned cuda_id;
			
 
				 #if defined(STARPU_USE_FPGA)
			
 
				-	  unsigned fpga_id;
			
 
				+		unsigned fpga_id;
			
 
				 #endif
			
 
				 #if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
			
 
				 		cl_device_id opencl_id;
			
--- a/include/starpu_fpga.h
+++ b/include/starpu_fpga.h
@@ -1,7 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2012, 2014  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2019  CNRS
			
 
				+ * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -66,8 +66,7 @@ extern "C"
 
				    starpu_task::where) to specify the codelet (or the task) may be
			
 
				    executed on a MAX FPGA.
			
 
				 */
			
 
				-
			
 
				-#define STARPU_FPGA	((1ULL)<<9)
			
 
				+#define STARPU_FPGA	((1ULL)<<4)
			
 
				 
			
 
				 /**
			
 
				    To be used when setting the field starpu_codelet::where (or
			
@@ -178,7 +177,6 @@ typedef void (*starpu_cuda_func_t)(void **, void*);
 
				 /**
			
 
				    FPGA implementation of a codelet.
			
 
				 */
			
 
				-
			
 
				 typedef void (*starpu_fpga_func_t)(void **, void*);
			
 
				 
			
 
				 /**
			
@@ -231,7 +229,6 @@ typedef starpu_mpi_ms_kernel_t (*starpu_mpi_ms_func_t)(void);
 
				    this macro is deprecated. One should always only define the field
			
 
				    starpu_codelet::fpga_funcs.
			
 
				 */
			
 
				-
			
 
				 #define STARPU_MULTIPLE_FPGA_IMPLEMENTATIONS   ((starpu_fpga_func_t) -1)
			
 
				 
			
 
				 /**
			
@@ -352,7 +349,6 @@ struct starpu_codelet
 
				 	   Optional field which has been made deprecated. One should
			
 
				 	   use instead the starpu_codelet::fpga_funcs field.
			
 
				 	*/
			
 
				-
			
 
				 	starpu_fpga_func_t fpga_func STARPU_DEPRECATED;
			
 
				 
			
 
				         /**
			
@@ -360,7 +356,6 @@ struct starpu_codelet
 
				 	   Optional field which has been made deprecated. One should
			
 
				 	   use instead the starpu_codelet::opencl_funcs field.
			
 
				 	*/
			
 
				-
			
 
				 	starpu_opencl_func_t opencl_func STARPU_DEPRECATED;
			
 
				 
			
 
				 	/**
			
@@ -410,7 +405,6 @@ struct starpu_codelet
 
				            ::STARPU_FPGA does not appear in the field
			
 
				            starpu_codelet::where, it must be non-<c>NULL</c> otherwise.
			
 
				         */
			
 
				-
			
 
				 	starpu_fpga_func_t fpga_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				 
			
 
				 	/**
			
@@ -482,10 +476,9 @@ struct starpu_codelet
 
				 	*/
			
 
				 	const char *cpu_funcs_name[STARPU_MAXIMPLEMENTATIONS];
			
 
				 
			
 
				-	/** 
			
 
				-	 fpga kernel type
			
 
				+	/**
			
 
				+	   fpga kernel type
			
 
				         */
			
 
				-
			
 
				 	char *fpga_kernel_type[STARPU_MAXIMPLEMENTATIONS];
			
 
				 
			
 
				 	/**
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -539,8 +539,7 @@ static inline int _starpu_get_next_cuda_gpuid(struct _starpu_machine_config *con
 
				 #endif
			
 
				 
			
 
				 #if defined(STARPU_USE_FPGA)
			
 
				-static void
			
 
				-_starpu_initialize_workers_fpga_fpgaid (struct _starpu_machine_config *config)
			
 
				+static void _starpu_initialize_workers_fpga_fpgaid (struct _starpu_machine_config *config)
			
 
				 {
			
 
				 	struct _starpu_machine_topology *topology = &config->topology;
			
 
				 	struct starpu_conf *uconf = &config->conf;
			
@@ -556,11 +555,9 @@ _starpu_initialize_workers_fpga_fpgaid (struct _starpu_machine_config *config)
 
				 		STARPU_FPGA_WORKER);
			
 
				 }
			
 
				 
			
 
				-static inline int
			
 
				-_starpu_get_next_fpga_fpgaid (struct _starpu_machine_config *config)
			
 
				+static inline int _starpu_get_next_fpga_fpgaid (struct _starpu_machine_config *config)
			
 
				 {
			
 
				-	unsigned i =
			
 
				-		((config->current_fpga_fpgaid++) % config->topology.nfpgafpgas);
			
 
				+	unsigned i = ((config->current_fpga_fpgaid++) % config->topology.nfpgafpgas);
			
 
				 
			
 
				 	return (int)config->topology.workers_fpga_fpgaid[i];
			
 
				 }
			
@@ -1180,7 +1177,7 @@ unsigned _starpu_topology_get_nhwpu(struct _starpu_machine_config *config)
 
				 		_starpu_init_cuda();
			
 
				 #endif
			
 
				 
			
 
				-#if defined(STARPU_USE_FPGA) 
			
 
				+#if defined(STARPU_USE_FPGA)
			
 
				 	_starpu_init_fpga();
			
 
				 #endif
			
 
				 	_starpu_init_topology(config);
			
@@ -1791,7 +1788,7 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config, in
 
				 #endif
			
 
				 
			
 
				 
			
 
				-#if defined(STARPU_USE_FPGA) 
			
 
				+#if defined(STARPU_USE_FPGA)
			
 
				 	int nfpga = config->conf.nfpga;
			
 
				 	if (nfpga != 0)
			
 
				 	{
			
@@ -2807,7 +2804,7 @@ static void _starpu_init_workers_binding_and_memory(struct _starpu_machine_confi
 
				 					fpga_init[devid] = 1;
			
 
				 					workerarg->bindid = fpga_bindid[devid] = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
			
 
				 
			
 
				-					memory_node = fpga_memory_nodes[devid] = _starpu_memory_node_register(STARPU_FPGA_RAM, devid, &_starpu_driver_fpga_node_ops); 
			
 
				+					memory_node = fpga_memory_nodes[devid] = _starpu_memory_node_register(STARPU_FPGA_RAM, devid, &_starpu_driver_fpga_node_ops);
			
 
				 					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
			
 
				 					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
			
 
				 
			
@@ -3099,7 +3096,7 @@ int _starpu_build_topology(struct _starpu_machine_config *config, int no_mp_conf
 
				 				else if (config->cpus_nodeid != (int) starpu_worker_get_memory_node(i))
			
 
				 					config->cpus_nodeid = -2;
			
 
				 				break;
			
 
				-                        
			
 
				+
			
 
				 			case STARPU_CUDA_WORKER:
			
 
				 				if (config->cuda_nodeid == -1)
			
 
				 					config->cuda_nodeid = starpu_worker_get_memory_node(i);
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -38,7 +38,7 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
 
				 int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
			
 
				 {
			
 
				         //fpga_msg("The new troublesome point is here");
			
 
				-	
			
 
				+
			
 
				         int src_node = -1;
			
 
				 	unsigned i;
			
 
				 
			
@@ -152,7 +152,7 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 
				 				i_ram = i;
			
 
				 			if (starpu_node_get_kind(i) == STARPU_DISK_RAM)
			
 
				 				i_disk = i;
			
 
				-                        if (starpu_node_get_kind(i) == STARPU_FPGA_RAM)			
			
 
				+                        if (starpu_node_get_kind(i) == STARPU_FPGA_RAM)
			
 
				 				i_fpga = i;
			
 
				 		}
			
 
				 	}
			
--- a/src/drivers/max/.old.tar.gz
+++ b/src/drivers/max/.old.tar.gz
--- a/src/drivers/max/driver_fpga.c
+++ b/src/drivers/max/driver_fpga.c
@@ -1,9 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2015, 2020  Université de Bordeaux
			
 
				- * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				- * Copyright (C) 2011  Télécom-SudParis
			
 
				+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ * Copyright (C) 2010       Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				+ * Copyright (C) 2011       Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -47,14 +46,15 @@
 
				 
			
 
				 //#define STARPU_MAXFPGADEVS 4
			
 
				 /* the number of FPGA devices */
			
 
				-static unsigned  nfpgafpgas = -1; 
			
 
				+static unsigned  nfpgafpgas = -1;
			
 
				 static fpgaDeviceProp props[STARPU_MAXFPGADEVS];
			
 
				 static size_t global_mem[STARPU_MAXFPGADEVS] = { 128ULL*1024*1024*1024 };
			
 
				 
			
 
				 static void _starpu_fpga_limit_global_mem(unsigned );
			
 
				 static size_t _starpu_fpga_get_global_mem_size(unsigned devid);
			
 
				 
			
 
				-void fpga_msg(char *msg){
			
 
				+void fpga_msg(char *msg)
			
 
				+{
			
 
				 	printf(FPGA_OK "%s\n" NORMAL, msg);
			
 
				 }
			
 
				 
			
@@ -66,43 +66,49 @@ void _starpu_init_fpga()
 
				 	STARPU_ASSERT( nfpgafpgas <= STARPU_MAXFPGADEVS);
			
 
				 
			
 
				         //LMemInterface addLMemInterface()
			
 
				-        //// pour récupérer l'accès à la LMem	
			
 
				+        //// pour récupérer l'accès à la LMem
			
 
				 }
			
 
				 
			
 
				 #if 0
			
 
				-int fpga_allocate_memory(fpga_mem *ptr, size_t size){
			
 
				+int fpga_allocate_memory(fpga_mem *ptr, size_t size)
			
 
				+{
			
 
				 //This allocates BYTES
			
 
				 	char *msg1="You asked to allocate ";
			
 
				 //	printf(KCYN "%s%d*%d\n" KBLU, msg1,size,sizeof(unsigned));
			
 
				 	printf(FPGA_OK "%s%lu bytes\n" NORMAL, msg1,size);
			
 
				 
			
 
				 	*ptr =(fpga_mem) malloc(size);
			
 
				-  
			
 
				+
			
 
				         if (*ptr == NULL)
			
 
				         	return 0;
			
 
				-       		else
			
 
				+	else
			
 
				 		return 1;
			
 
				-       			  }
			
 
				+}
			
 
				 #endif
			
 
				 
			
 
				-int fpgaGetDeviceProperties(fpgaDeviceProp *props,unsigned devid){
			
 
				+int fpgaGetDeviceProperties(fpgaDeviceProp *props,unsigned devid)
			
 
				+{
			
 
				 //TODO
			
 
				         props->totalGlobalMem=1*1024*1024;
			
 
				         props->concurrentKernels=4;
			
 
				         props->name="Fpga_Props_Name";
			
 
				         return 0;
			
 
				 }
			
 
				-void _starpu_fpga_discover_devices (struct _starpu_machine_config *config){
			
 
				+
			
 
				+void _starpu_fpga_discover_devices (struct _starpu_machine_config *config)
			
 
				+{
			
 
				 	//TODO: This is statically assigned, in the next round of integration
			
 
				 	// I will have to read from the struct fpga in fpga
			
 
				 	config->topology.nhwfpgafpgas = nfpgafpgas;
			
 
				 }
			
 
				 
			
 
				-unsigned _starpu_fpga_get_device_count(void){
			
 
				+unsigned _starpu_fpga_get_device_count(void)
			
 
				+{
			
 
				 	return nfpgafpgas;
			
 
				 }
			
 
				 
			
 
				-static void	_starpu_fpga_limit_global_mem(unsigned devid){
			
 
				+static void _starpu_fpga_limit_global_mem(unsigned devid)
			
 
				+{
			
 
				 	starpu_ssize_t limit=-1;
			
 
				 
			
 
				 //TODO
			
@@ -111,11 +117,13 @@ static void	_starpu_fpga_limit_global_mem(unsigned devid){
 
				 		global_mem[devid] = limit*1024*1024;
			
 
				 }
			
 
				 
			
 
				-static size_t _starpu_fpga_get_global_mem_size(unsigned devid){
			
 
				+static size_t _starpu_fpga_get_global_mem_size(unsigned devid)
			
 
				+{
			
 
				 	return global_mem[devid];
			
 
				 }
			
 
				 
			
 
				-static void init_fpga_worker_context(unsigned workerid){
			
 
				+static void init_fpga_worker_context(unsigned workerid)
			
 
				+{
			
 
				 //		starpu_fpgaStreamCreate(&streams[devid][i]);
			
 
				 }
			
 
				 
			
@@ -125,7 +133,7 @@ static void init_device_context(unsigned devid)
 
				 	unsigned i;
			
 
				 //TODO: fpgaSetDevice
			
 
				 	fpgaSetDevice(devid);
			
 
				-	
			
 
				+
			
 
				 //TODO: fpgaGetDeviceProperties
			
 
				 	fpgaGetDeviceProperties(&props[devid], devid);
			
 
				 //TODO: Do we need the streams? I think no
			
@@ -138,8 +146,8 @@ static void init_device_context(unsigned devid)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-
			
 
				-int _starpu_fpga_driver_init(struct _starpu_worker *worker){
			
 
				+int _starpu_fpga_driver_init(struct _starpu_worker *worker)
			
 
				+{
			
 
				 	int devid = worker->devid;
			
 
				 //fpga_msg("successful till here");
			
 
				 	_starpu_driver_start(worker, _STARPU_FUT_CPU_KEY, 1);
			
@@ -163,7 +171,8 @@ int _starpu_fpga_driver_init(struct _starpu_worker *worker){
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int execute_job_on_fpga(struct _starpu_job *j, struct starpu_task *worker_task, struct _starpu_worker *fpga_args, int rank, struct starpu_perfmodel_arch* perf_arch){
			
 
				+static int execute_job_on_fpga(struct _starpu_job *j, struct starpu_task *worker_task, struct _starpu_worker *fpga_args, int rank, struct starpu_perfmodel_arch* perf_arch)
			
 
				+{
			
 
				 	int ret;
			
 
				 	int profiling = starpu_profiling_status_get();
			
 
				 
			
@@ -220,9 +229,10 @@ int _starpu_fpga_driver_run_once(struct _starpu_worker *fpga_worker)
 
				 
			
 
				 	_STARPU_TRACE_START_PROGRESS(memnode);
			
 
				 	_starpu_datawizard_progress(1);
			
 
				-	if (memnode != STARPU_MAIN_RAM){
			
 
				+	if (memnode != STARPU_MAIN_RAM)
			
 
				+	{
			
 
				 		_starpu_datawizard_progress(1);
			
 
				-		}
			
 
				+	}
			
 
				 	_STARPU_TRACE_END_PROGRESS(memnode);
			
 
				 
			
 
				 	struct _starpu_job *j;
			
@@ -248,7 +258,7 @@ int _starpu_fpga_driver_run_once(struct _starpu_worker *fpga_worker)
 
				 	int is_parallel_task = (j->task_size > 1);
			
 
				 
			
 
				 	struct starpu_perfmodel_arch* perf_arch;
			
 
				-	
			
 
				+
			
 
				 	if (is_parallel_task)
			
 
				 	{
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
			
@@ -259,7 +269,7 @@ int _starpu_fpga_driver_run_once(struct _starpu_worker *fpga_worker)
 
				 		{
			
 
				 			struct _starpu_combined_worker *combined_worker;
			
 
				 			combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
			
 
				-			
			
 
				+
			
 
				 			fpga_worker->combined_workerid = j->combined_workerid;
			
 
				 			fpga_worker->worker_size = combined_worker->worker_size;
			
 
				 			fpga_worker->current_rank = rank;
			
@@ -315,7 +325,8 @@ int _starpu_fpga_driver_run_once(struct _starpu_worker *fpga_worker)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-int _starpu_fpga_driver_deinit(struct _starpu_worker *fpga_worker){
			
 
				+int _starpu_fpga_driver_deinit(struct _starpu_worker *fpga_worker)
			
 
				+{
			
 
				 	_STARPU_TRACE_WORKER_DEINIT_START;
			
 
				 
			
 
				 	unsigned memnode = fpga_worker->memory_node;
			
@@ -332,7 +343,8 @@ int _starpu_fpga_driver_deinit(struct _starpu_worker *fpga_worker){
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-void *_starpu_fpga_worker(void *_arg){
			
 
				+void *_starpu_fpga_worker(void *_arg)
			
 
				+{
			
 
				 	struct _starpu_worker* worker = _arg;
			
 
				          unsigned memnode = worker->memory_node;
			
 
				 
			
@@ -341,7 +353,7 @@ void *_starpu_fpga_worker(void *_arg){
 
				 	while (_starpu_machine_is_running())
			
 
				 	{
			
 
				 		_starpu_may_pause();
			
 
				-//fpga_msg("\tEntered the main loop\n");
			
 
				+		//fpga_msg("\tEntered the main loop\n");
			
 
				 		_starpu_fpga_driver_run_once(worker);
			
 
				 	}
			
 
				 	_STARPU_TRACE_END_PROGRESS(memnode);
			
@@ -350,7 +362,7 @@ void *_starpu_fpga_worker(void *_arg){
 
				 	return NULL;
			
 
				 }
			
 
				 
			
 
				-uintptr_t _starpu_fpga_allocate_memory(unsigned dst_node, size_t size, int flags) 
			
 
				+uintptr_t _starpu_fpga_allocate_memory(unsigned dst_node, size_t size, int flags)
			
 
				 {
			
 
				 	(void) flags;
			
 
				 	unsigned devid = starpu_memory_node_get_devid(dst_node);
			
@@ -363,19 +375,17 @@ uintptr_t _starpu_fpga_allocate_memory(unsigned dst_node, size_t size, int flags
 
				 // TODO: vérifier si current_address + size > taille de la LMEm
			
 
				  	addr = current_address;
			
 
				 	current_address += size;
			
 
				-printf("fpga mem returned from allocation @: %p - %p\n",addr, addr + size);
			
 
				-//success = 0
			
 
				+	printf("fpga mem returned from allocation @: %p - %p\n",addr, addr + size);
			
 
				+	//success = 0
			
 
				         return (uintptr_t) addr;
			
 
				 }
			
 
				 
			
 
				-
			
 
				-
			
 
				 int _starpu_fpga_copy_ram_to_fpga(void *src, void *dst, size_t size)
			
 
				 {
			
 
				-printf("ram to fpga, fpga @= %p\n",dst);
			
 
				+	printf("ram to fpga, fpga @= %p\n",dst);
			
 
				 	memcpy(dst,src,size);
			
 
				 	return 0;
			
 
				-  // LMemLoopback_writeLMem(dst, size, src);
			
 
				+	// LMemLoopback_writeLMem(dst, size, src);
			
 
				 }
			
 
				 
			
 
				 /* Transfert SIZE bytes from the address pointed by SRC in the SRC_NODE memory
			
@@ -383,52 +393,51 @@ printf("ram to fpga, fpga @= %p\n",dst);
 
				  *   */
			
 
				 void copy_ram_to_fpga(void *src, void *dst, size_t size)
			
 
				 {
			
 
				-printf("ram to fpga, fpga @= %p\n",dst);
			
 
				-
			
 
				-      // LMemLoopback_writeLMem(size, dst, src);
			
 
				-   
			
 
				+	printf("ram to fpga, fpga @= %p\n",dst);
			
 
				+	// LMemLoopback_writeLMem(size, dst, src);
			
 
				 }
			
 
				 
			
 
				 void copy_fpga_to_ram(void *src, void *dst, size_t size)
			
 
				 {
			
 
				-printf("ram to fpga, fpga @= %p\n",src);
			
 
				+	printf("ram to fpga, fpga @= %p\n",src);
			
 
				        //LMemLoopback_readLMem(size, src, dst);
			
 
				 
			
 
				 }
			
 
				+
			
 
				 /* Transfert SIZE bytes from the address pointed by SRC in the SRC_NODE memory
			
 
				  * node to the address pointed by DST in the DST_NODE memory node
			
 
				  */
			
 
				 int _starpu_fpga_copy_fpga_to_ram(void *src, void *dst, size_t size)
			
 
				 {
			
 
				-printf("fpga to ram, fpga @= %p\n",src);
			
 
				+	printf("fpga to ram, fpga @= %p\n",src);
			
 
				 	memcpy(dst,src,size);
			
 
				 	return 0;
			
 
				- //LMemLoopback_readLMem(src, size, dst);
			
 
				+	//LMemLoopback_readLMem(src, size, dst);
			
 
				 }
			
 
				+
			
 
				 /* Transfert SIZE bytes from the address pointed by SRC in the SRC_NODE memory
			
 
				  * node to the address pointed by DST in the DST_NODE memory node
			
 
				  */
			
 
				 int _starpu_fpga_copy_fpga_to_fpga(void *src, void *dst, size_t size)
			
 
				 {
			
 
				-printf("fpga to ram, fpga @= %p\n",src);
			
 
				+	printf("fpga to ram, fpga @= %p\n",src);
			
 
				 	memcpy(dst,src,size);
			
 
				 	return 0;
			
 
				- //LMemLoopback_XXXLMem(src, size, dst);
			
 
				+	//LMemLoopback_XXXLMem(src, size, dst);
			
 
				 }
			
 
				 
			
 
				 /* Asynchronous transfers */
			
 
				 int _starpu_fpga_copy_ram_to_fpga_async(void *src, void *dst, size_t size)
			
 
				 {
			
 
				-printf("ram to fpga, fpga @= %p\n",dst);
			
 
				+	printf("ram to fpga, fpga @= %p\n",dst);
			
 
				 	memcpy(dst,src,size);
			
 
				 	return 0;
			
 
				-
			
 
				- // Trouver dans la doc une version asynchrone de LMemLoopback_writeLMem();
			
 
				+	// Trouver dans la doc une version asynchrone de LMemLoopback_writeLMem();
			
 
				 }
			
 
				 
			
 
				 int _starpu_fpga_copy_fpga_to_ram_async(void *src, void *dst, size_t size)
			
 
				 {
			
 
				-printf("fpga to ram, fpga @= %p\n",src);
			
 
				+	printf("fpga to ram, fpga @= %p\n",src);
			
 
				 	memcpy(dst,src,size);
			
 
				 	return 0;
			
 
				 }
			
@@ -441,7 +450,8 @@ int _starpu_run_fpga(struct _starpu_worker *workerarg)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-void _starpu_fpga_transfer_data(void *buffers[], struct _starpu_job *j, int chnl){
			
 
				+void _starpu_fpga_transfer_data(void *buffers[], struct _starpu_job *j, int chnl)
			
 
				+{
			
 
				 	struct starpu_task *task = j->task;
			
 
				 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
			
 
				 
			
@@ -449,7 +459,7 @@ void _starpu_fpga_transfer_data(void *buffers[], struct _starpu_job *j, int chnl
 
				 	for (index = 0; index < nbuffers; index++)
			
 
				 	{
			
 
				 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
			
 
				-//		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, index);
			
 
				+		//		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, index);
			
 
				 		unsigned *interface_id = (unsigned *)malloc(sizeof(unsigned));
			
 
				 		*interface_id = handle->ops->interfaceid;
			
 
				 
			
@@ -560,7 +570,6 @@ int _starpu_fpga_copy_interface_from_cpu_to_fpga(starpu_data_handle_t handle, vo
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-
			
 
				 struct _starpu_driver_ops _starpu_driver_fpga_ops =
			
 
				 {
			
 
				 	.init = _starpu_fpga_driver_init,
			
@@ -587,7 +596,7 @@ struct _starpu_node_ops _starpu_driver_fpga_node_ops =
 
				 	.copy_interface_to[STARPU_DISK_RAM] = NULL,
			
 
				 	.copy_interface_to[STARPU_MIC_RAM] = NULL,
			
 
				 	.copy_interface_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				-        
			
 
				+
			
 
				         .wait_request_completion = NULL,
			
 
				 	.test_request_completion = NULL,
			
 
				 	.is_direct_access_supported = NULL,
			
--- a/src/drivers/max/driver_fpga.h
+++ b/src/drivers/max/driver_fpga.h
@@ -1,7 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2012-2014, 2020  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2012  CNRS
			
 
				+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -18,9 +17,9 @@
 
				 #ifndef __DRIVER_FPGA_H__
			
 
				 #define __DRIVER_FPGA_H__
			
 
				 //#ifdef NOT_DEFINED
			
 
				-	#ifdef STARPU_USE_FPGA
			
 
				-	#include <starpu_fpga.h>
			
 
				-	#endif
			
 
				+#ifdef STARPU_USE_FPGA
			
 
				+#include <starpu_fpga.h>
			
 
				+#endif
			
 
				 //#endif
			
 
				 #include <starpu.h>
			
 
				 #include <common/config.h>
			
@@ -37,7 +36,6 @@ typedef unsigned * fpga_mem;
 
				 extern struct _starpu_driver_ops _starpu_driver_fpga_ops;
			
 
				 extern struct _starpu_node_ops _starpu_driver_fpga_node_ops;
			
 
				 
			
 
				-
			
 
				 int fpgaSetDevice(unsigned devid);
			
 
				 void _starpu_init_fpga(void);
			
 
				 void _starpu_fpga_discover_devices (struct _starpu_machine_config *config);
			
@@ -61,8 +59,10 @@ int _starpu_fpga_copy_fpga_to_ram_async(void *src, void *dst, size_t size);
 
				 
			
 
				 int _starpu_fpga_copy_interface_from_cpu_to_fpga(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
			
 
				 int _starpu_fpga_copy_data_from_cpu_to_fpga(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t ssize, struct _starpu_async_channel *async_channel);
			
 
				+
			
 
				 #else
			
 
				 #define _starpu_fpga_discover_devices(config) ((void) (config))
			
 
				 #endif
			
 
				+
			
 
				 #endif //  __DRIVER_FPGA_H__
			
 
				 
			
--- a/tests/perfmodels/LMemLoopbackCpuCode.c
+++ b/tests/perfmodels/LMemLoopbackCpuCode.c
@@ -16,8 +16,10 @@
 
				 int check(int size, int32_t *outData, int32_t *inA, int32_t *inB)
			
 
				 {
			
 
				 	int status = 0;
			
 
				-	for (int i = 0; i < size; i++) {
			
 
				-		if (outData[i] != inA[i] + inB[i]) {
			
 
				+	for (int i = 0; i < size; i++)
			
 
				+	{
			
 
				+		if (outData[i] != inA[i] + inB[i])
			
 
				+		{
			
 
				 			fprintf(stderr, "[%d] Verification error, out: %u != expected: %u\n",
			
 
				 				i, outData[i], inA[i] + inB[i]);
			
 
				 			status = 1;
			
@@ -33,7 +35,8 @@ int main()
 
				 	int32_t *inA = (int32_t*) malloc(sizeBytes);
			
 
				 	int32_t *inB = (int32_t*) malloc(sizeBytes);
			
 
				 
			
 
				-	for (int i = 0; i < size; i++) {
			
 
				+	for (int i = 0; i < size; i++)
			
 
				+	{
			
 
				 		inA[i] = i;
			
 
				 		inB[i] = size - i;
			
 
				 	}
			
--- a/tests/perfmodels/max_fpga.c
+++ b/tests/perfmodels/max_fpga.c
@@ -1,3 +1,18 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2019-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				 #include <starpu.h>
			
 
				 #include <stdlib.h>
			
 
				 #include <stdio.h>
			
@@ -10,306 +25,280 @@
 
				 
			
 
				 
			
 
				 void fpga_impl(void *buffers[], void *cl_arg)
			
 
				-{   
			
 
				-    (void)cl_arg;
			
 
				-    
			
 
				-    int32_t *ptrA = (int32_t*) STARPU_VECTOR_GET_PTR(buffers[0]);
			
 
				-    int32_t *ptrB = (int32_t*) STARPU_VECTOR_GET_PTR(buffers[1]);
			
 
				-    int32_t *ptrC = (int32_t*) STARPU_VECTOR_GET_PTR(buffers[2]);
			
 
				-
			
 
				-int32_t *poubelle_cpu = malloc(SIZE * sizeof(int32_t));
			
 
				-
			
 
				-
			
 
				-    int size = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				-    
			
 
				-    int sizeBytes=SIZE *sizeof(int32_t);
			
 
				-    size_t LMemsize= SIZE *sizeof(int32_t);
			
 
				-    
			
 
				-    size_t poubelle = 0xc0000;
			
 
				-
			
 
				-    size_t ptrCT1 = 0x00000000000000c0;
			
 
				-
			
 
				-    size_t ptrAT2 = ptrCT1;
			
 
				-    size_t ptrBT2 = ptrCT1;
			
 
				-    size_t ptrCT2 = 0x0000000000000180;
			
 
				-
			
 
				-    size_t ptrAT3 = ptrCT2;
			
 
				-    size_t ptrBT3 = ptrCT2;
			
 
				-
			
 
				-    printf("Loading DFE memory.\n");
			
 
				-
			
 
				-/* C = A+B */
			
 
				-    StreamFMA(SIZE, ptrA, sizeBytes, ptrB, sizeBytes, poubelle_cpu, sizeBytes,
			
 
				-    poubelle, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    ptrCT1, LMemsize,
			
 
				-    poubelle, LMemsize);
			
 
				-printf("T1 finished\n");
			
 
				-
			
 
				-/* C = A*B */
			
 
				-    StreamFMA(SIZE, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes,
			
 
				-    ptrAT2, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    ptrBT2, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    ptrCT2, LMemsize);
			
 
				-printf("T2 finished\n");
			
 
				-
			
 
				-/* C = A+B */
			
 
				-    StreamFMA(SIZE, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes, ptrC, sizeBytes,
			
 
				-    poubelle, LMemsize,
			
 
				-    ptrAT3, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    ptrBT3, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    poubelle, LMemsize);
			
 
				-printf("T3 finished\n");
			
 
				-
			
 
				-    printf("Running DFE.\n");
			
 
				-              
			
 
				-  }
			
 
				+{
			
 
				+	(void)cl_arg;
			
 
				+
			
 
				+	int32_t *ptrA = (int32_t*) STARPU_VECTOR_GET_PTR(buffers[0]);
			
 
				+	int32_t *ptrB = (int32_t*) STARPU_VECTOR_GET_PTR(buffers[1]);
			
 
				+	int32_t *ptrC = (int32_t*) STARPU_VECTOR_GET_PTR(buffers[2]);
			
 
				+
			
 
				+	int32_t *poubelle_cpu = malloc(SIZE * sizeof(int32_t));
			
 
				+	int size = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				+
			
 
				+	int sizeBytes=SIZE *sizeof(int32_t);
			
 
				+	size_t LMemsize= SIZE *sizeof(int32_t);
			
 
				+
			
 
				+	size_t poubelle = 0xc0000;
			
 
				+
			
 
				+	size_t ptrCT1 = 0x00000000000000c0;
			
 
				+
			
 
				+	size_t ptrAT2 = ptrCT1;
			
 
				+	size_t ptrBT2 = ptrCT1;
			
 
				+	size_t ptrCT2 = 0x0000000000000180;
			
 
				+
			
 
				+	size_t ptrAT3 = ptrCT2;
			
 
				+	size_t ptrBT3 = ptrCT2;
			
 
				+
			
 
				+	printf("Loading DFE memory.\n");
			
 
				+
			
 
				+	/* C = A+B */
			
 
				+	StreamFMA(SIZE, ptrA, sizeBytes, ptrB, sizeBytes, poubelle_cpu, sizeBytes,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  ptrCT1, LMemsize,
			
 
				+		  poubelle, LMemsize);
			
 
				+	printf("T1 finished\n");
			
 
				+
			
 
				+	/* C = A*B */
			
 
				+	StreamFMA(SIZE, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes,
			
 
				+		  ptrAT2, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  ptrBT2, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  ptrCT2, LMemsize);
			
 
				+	printf("T2 finished\n");
			
 
				+
			
 
				+	/* C = A+B */
			
 
				+	StreamFMA(SIZE, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes, ptrC, sizeBytes,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  ptrAT3, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  ptrBT3, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  poubelle, LMemsize);
			
 
				+	printf("T3 finished\n");
			
 
				+
			
 
				+	printf("Running DFE.\n");
			
 
				 
			
 
				+}
			
 
				 
			
 
				 static struct starpu_codelet cl =
			
 
				 {
			
 
				-    .fpga_funcs = {fpga_impl},
			
 
				-  
			
 
				-    .nbuffers = 3,
			
 
				-    .modes = {STARPU_R, STARPU_R, STARPU_W}
			
 
				+ 	.fpga_funcs = {fpga_impl},
			
 
				+	.nbuffers = 3,
			
 
				+	.modes = {STARPU_R, STARPU_R, STARPU_W}
			
 
				 };
			
 
				 
			
 
				-
			
 
				-
			
 
				 void fpga_impl1(void *buffers[], void *cl_arg)
			
 
				-{   
			
 
				-    (void)cl_arg;
			
 
				-    
			
 
				-    int32_t *ptrA = (int32_t*) STARPU_VECTOR_GET_PTR(buffers[0]);
			
 
				-    int32_t *ptrB = (int32_t*) STARPU_VECTOR_GET_PTR(buffers[1]);
			
 
				-    size_t   ptrC = (size_t)   STARPU_VECTOR_GET_PTR(buffers[2]); /* FPGA */
			
 
				+{
			
 
				+	(void)cl_arg;
			
 
				+
			
 
				+	int32_t *ptrA = (int32_t*) STARPU_VECTOR_GET_PTR(buffers[0]);
			
 
				+	int32_t *ptrB = (int32_t*) STARPU_VECTOR_GET_PTR(buffers[1]);
			
 
				+	size_t   ptrC = (size_t)   STARPU_VECTOR_GET_PTR(buffers[2]); /* FPGA */
			
 
				 
			
 
				-int32_t *poubelle_cpu = malloc(SIZE * sizeof(int32_t));
			
 
				+	int32_t *poubelle_cpu = malloc(SIZE * sizeof(int32_t));
			
 
				+	int size = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				 
			
 
				+	int sizeBytes=SIZE *sizeof(int32_t);
			
 
				+	size_t LMemsize= SIZE *sizeof(int32_t);
			
 
				 
			
 
				-    int size = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				-    
			
 
				-    int sizeBytes=SIZE *sizeof(int32_t);
			
 
				-    size_t LMemsize= SIZE *sizeof(int32_t);
			
 
				-    
			
 
				-    size_t poubelle = 0xc0000;
			
 
				+	size_t poubelle = 0xc0000;
			
 
				 
			
 
				 #if 0
			
 
				-printf("T1 with %p %p %zu\n", ptrA, ptrB, ptrC);
			
 
				-//XXX
			
 
				-    ptrC = 0x00000000000000c0;
			
 
				+	printf("T1 with %p %p %zu\n", ptrA, ptrB, ptrC);
			
 
				+	//XXX
			
 
				+	ptrC = 0x00000000000000c0;
			
 
				 #endif
			
 
				 
			
 
				-printf("T1 with %p %p %zu\n", ptrA, ptrB, ptrC);
			
 
				-/* C = A+B */
			
 
				-    StreamFMA(SIZE, ptrA, sizeBytes, ptrB, sizeBytes, poubelle_cpu, sizeBytes,
			
 
				-    poubelle, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    ptrC, LMemsize,
			
 
				-    poubelle, LMemsize);
			
 
				-printf("T1 finished\n");
			
 
				-
			
 
				-  }
			
 
				-
			
 
				+	printf("T1 with %p %p %zu\n", ptrA, ptrB, ptrC);
			
 
				+	/* C = A+B */
			
 
				+	StreamFMA(SIZE, ptrA, sizeBytes, ptrB, sizeBytes, poubelle_cpu, sizeBytes,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  ptrC, LMemsize,
			
 
				+		  poubelle, LMemsize);
			
 
				+	printf("T1 finished\n");
			
 
				+}
			
 
				 
			
 
				 static struct starpu_codelet cl1 =
			
 
				 {
			
 
				-    .fpga_funcs = {fpga_impl1},
			
 
				-  
			
 
				-    .nbuffers = 3,
			
 
				-    .modes = {STARPU_R, STARPU_R, STARPU_W},
			
 
				-    .specific_nodes = 1,
			
 
				-    .nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL},
			
 
				+ 	.fpga_funcs = {fpga_impl1},
			
 
				+	.nbuffers = 3,
			
 
				+	.modes = {STARPU_R, STARPU_R, STARPU_W},
			
 
				+	.specific_nodes = 1,
			
 
				+	.nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL},
			
 
				 };
			
 
				 
			
 
				 void fpga_impl2(void *buffers[], void *cl_arg)
			
 
				-{   
			
 
				-    (void)cl_arg;
			
 
				-    
			
 
				-    size_t ptrA = (size_t) STARPU_VECTOR_GET_PTR(buffers[0]); /* FPGA */
			
 
				-    size_t ptrB = (size_t) STARPU_VECTOR_GET_PTR(buffers[1]); /* FPGA */
			
 
				-    size_t ptrC = (size_t) STARPU_VECTOR_GET_PTR(buffers[2]); /* FPGA */
			
 
				+{
			
 
				+	(void)cl_arg;
			
 
				 
			
 
				-int32_t *poubelle_cpu = malloc(SIZE * sizeof(int32_t));
			
 
				+	size_t ptrA = (size_t) STARPU_VECTOR_GET_PTR(buffers[0]); /* FPGA */
			
 
				+	size_t ptrB = (size_t) STARPU_VECTOR_GET_PTR(buffers[1]); /* FPGA */
			
 
				+	size_t ptrC = (size_t) STARPU_VECTOR_GET_PTR(buffers[2]); /* FPGA */
			
 
				 
			
 
				+	int32_t *poubelle_cpu = malloc(SIZE * sizeof(int32_t));
			
 
				+	int size = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				 
			
 
				-    int size = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				-    
			
 
				-    int sizeBytes=SIZE *sizeof(int32_t);
			
 
				-    size_t LMemsize= SIZE *sizeof(int32_t);
			
 
				-    
			
 
				-    size_t poubelle = 0xc0000;
			
 
				+	int sizeBytes=SIZE *sizeof(int32_t);
			
 
				+	size_t LMemsize= SIZE *sizeof(int32_t);
			
 
				+
			
 
				+	size_t poubelle = 0xc0000;
			
 
				 
			
 
				 #if 0
			
 
				-printf("T2 with %zu %zu %zu\n", ptrA, ptrB, ptrC);
			
 
				-//XXX
			
 
				-    ptrA = 0x00000000000000c0;
			
 
				-    ptrB = 0x00000000000000c0;
			
 
				-    ptrC = 0x0000000000000180;
			
 
				+	printf("T2 with %zu %zu %zu\n", ptrA, ptrB, ptrC);
			
 
				+	//XXX
			
 
				+	ptrA = 0x00000000000000c0;
			
 
				+	ptrB = 0x00000000000000c0;
			
 
				+	ptrC = 0x0000000000000180;
			
 
				 #endif
			
 
				 
			
 
				-printf("T2 with %zu %zu %zu\n", ptrA, ptrB, ptrC);
			
 
				-/* C = A*B */
			
 
				-    StreamFMA(SIZE, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes,
			
 
				-    ptrA, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    ptrB, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    ptrC, LMemsize);
			
 
				-printf("T2 finished\n");
			
 
				-              
			
 
				-  }
			
 
				+	printf("T2 with %zu %zu %zu\n", ptrA, ptrB, ptrC);
			
 
				+	/* C = A*B */
			
 
				+	StreamFMA(SIZE, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes,
			
 
				+		  ptrA, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  ptrB, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  ptrC, LMemsize);
			
 
				+	printf("T2 finished\n");
			
 
				+}
			
 
				 
			
 
				 static struct starpu_codelet cl2 =
			
 
				 {
			
 
				-    .fpga_funcs = {fpga_impl2},
			
 
				-  
			
 
				-    .nbuffers = 3,
			
 
				-    .modes = {STARPU_R, STARPU_R, STARPU_W}
			
 
				-    /* local by default */
			
 
				+ 	.fpga_funcs = {fpga_impl2},
			
 
				+	.nbuffers = 3,
			
 
				+	.modes = {STARPU_R, STARPU_R, STARPU_W}
			
 
				+	/* local by default */
			
 
				 };
			
 
				 
			
 
				 void fpga_impl3(void *buffers[], void *cl_arg)
			
 
				-{   
			
 
				-    (void)cl_arg;
			
 
				-    
			
 
				-    size_t   ptrA = (size_t)   STARPU_VECTOR_GET_PTR(buffers[0]); /* FPGA */
			
 
				-    size_t   ptrB = (size_t)   STARPU_VECTOR_GET_PTR(buffers[1]); /* FPGA */
			
 
				-    int32_t *ptrC = (int32_t*) STARPU_VECTOR_GET_PTR(buffers[2]);
			
 
				+{
			
 
				+	(void)cl_arg;
			
 
				+
			
 
				+	size_t   ptrA = (size_t)   STARPU_VECTOR_GET_PTR(buffers[0]); /* FPGA */
			
 
				+	size_t   ptrB = (size_t)   STARPU_VECTOR_GET_PTR(buffers[1]); /* FPGA */
			
 
				+	int32_t *ptrC = (int32_t*) STARPU_VECTOR_GET_PTR(buffers[2]);
			
 
				 
			
 
				-int32_t *poubelle_cpu = malloc(SIZE * sizeof(int32_t));
			
 
				+	int32_t *poubelle_cpu = malloc(SIZE * sizeof(int32_t));
			
 
				+	int size = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				 
			
 
				+	int sizeBytes=SIZE *sizeof(int32_t);
			
 
				+	size_t LMemsize= SIZE *sizeof(int32_t);
			
 
				 
			
 
				-    int size = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				-    
			
 
				-    int sizeBytes=SIZE *sizeof(int32_t);
			
 
				-    size_t LMemsize= SIZE *sizeof(int32_t);
			
 
				-    
			
 
				-    size_t poubelle = 0xc0000;
			
 
				+	size_t poubelle = 0xc0000;
			
 
				 
			
 
				 #if 0
			
 
				-printf("T3 with %zu %zu %p\n", ptrA, ptrB, ptrC);
			
 
				-//XXX
			
 
				-    ptrA = 0x0000000000000180;
			
 
				-    ptrB = 0x0000000000000180;
			
 
				+	printf("T3 with %zu %zu %p\n", ptrA, ptrB, ptrC);
			
 
				+	//XXX
			
 
				+	ptrA = 0x0000000000000180;
			
 
				+	ptrB = 0x0000000000000180;
			
 
				 #endif
			
 
				 
			
 
				-printf("T3 with %zu %zu %p\n", ptrA, ptrB, ptrC);
			
 
				-/* C = A+B */
			
 
				-    StreamFMA(SIZE, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes, ptrC, sizeBytes,
			
 
				-    poubelle, LMemsize,
			
 
				-    ptrA, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    ptrB, LMemsize,
			
 
				-    poubelle, LMemsize,
			
 
				-    poubelle, LMemsize);
			
 
				-printf("T3 finished\n");
			
 
				-  }
			
 
				+	printf("T3 with %zu %zu %p\n", ptrA, ptrB, ptrC);
			
 
				+	/* C = A+B */
			
 
				+	StreamFMA(SIZE, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes, ptrC, sizeBytes,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  ptrA, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  ptrB, LMemsize,
			
 
				+		  poubelle, LMemsize,
			
 
				+		  poubelle, LMemsize);
			
 
				+	printf("T3 finished\n");
			
 
				+}
			
 
				 
			
 
				 static struct starpu_codelet cl3 =
			
 
				 {
			
 
				-    .fpga_funcs = {fpga_impl3},
			
 
				-  
			
 
				-    .nbuffers = 3,
			
 
				-    .modes = {STARPU_R, STARPU_R, STARPU_W},
			
 
				-    .specific_nodes = 1,
			
 
				-    .nodes = {STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_CPU},
			
 
				+ 	.fpga_funcs = {fpga_impl3},
			
 
				+	.nbuffers = 3,
			
 
				+	.modes = {STARPU_R, STARPU_R, STARPU_W},
			
 
				+	.specific_nodes = 1,
			
 
				+	.nodes = {STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_CPU},
			
 
				 };
			
 
				 
			
 
				-
			
 
				-
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				+	/* Enable profiling */
			
 
				+	starpu_profiling_status_set(1);
			
 
				+
			
 
				+	struct starpu_conf conf;
			
 
				+	starpu_data_handle_t handle_a, handle_b, handle_ct1, handle_ct2, handle_c;
			
 
				+	int ret;
			
 
				+	int size=1234;
			
 
				+
			
 
				+	starpu_conf_init(&conf);
			
 
				+	conf.sched_policy_name = "eager";
			
 
				+	conf.calibrate = 0;
			
 
				+
			
 
				+	ret = starpu_initialize(&conf, &argc, &argv);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				+	int32_t a[SIZE];
			
 
				+	int32_t b[SIZE];
			
 
				+	int32_t c[SIZE];
			
 
				+
			
 
				+	int i;
			
 
				+	for(i = 0; i < SIZE; ++i)
			
 
				+	{
			
 
				+		a[i] = random() % 100;
			
 
				+		b[i] = random() % 100;
			
 
				+	}
			
 
				 
			
 
				-    /* Enable profiling */
			
 
				-    starpu_profiling_status_set(1);
			
 
				-
			
 
				-    struct starpu_conf conf;
			
 
				-    starpu_data_handle_t handle_a, handle_b, handle_ct1, handle_ct2, handle_c;
			
 
				-    int ret;
			
 
				-    int size=1234;
			
 
				-
			
 
				-    starpu_conf_init(&conf);
			
 
				-
			
 
				-    conf.sched_policy_name = "eager";
			
 
				-    conf.calibrate = 0;
			
 
				-
			
 
				-    ret = starpu_initialize(&conf, &argc, &argv);
			
 
				-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-
			
 
				-    
			
 
				-    int32_t a[SIZE];
			
 
				-    int32_t b[SIZE];
			
 
				-    int32_t c[SIZE];
			
 
				-
			
 
				-    int i;
			
 
				-    for(i = 0; i < SIZE; ++i)
			
 
				-    {
			
 
				-        a[i] = random() % 100;
			
 
				-        b[i] = random() % 100;
			
 
				-    }
			
 
				-
			
 
				-    starpu_vector_data_register(&handle_a, STARPU_MAIN_RAM, (uintptr_t) &a, SIZE, sizeof(a[0]));
			
 
				-    starpu_vector_data_register(&handle_b, STARPU_MAIN_RAM, (uintptr_t) &b, SIZE, sizeof(b[0]));
			
 
				+	starpu_vector_data_register(&handle_a, STARPU_MAIN_RAM, (uintptr_t) &a, SIZE, sizeof(a[0]));
			
 
				+	starpu_vector_data_register(&handle_b, STARPU_MAIN_RAM, (uintptr_t) &b, SIZE, sizeof(b[0]));
			
 
				 
			
 
				-    starpu_vector_data_register(&handle_ct1, -1, 0, SIZE, sizeof(c[0]));
			
 
				-    starpu_vector_data_register(&handle_ct2, -1, 0, SIZE, sizeof(c[0]));
			
 
				+	starpu_vector_data_register(&handle_ct1, -1, 0, SIZE, sizeof(c[0]));
			
 
				+	starpu_vector_data_register(&handle_ct2, -1, 0, SIZE, sizeof(c[0]));
			
 
				 
			
 
				-    starpu_vector_data_register(&handle_c, STARPU_MAIN_RAM, (uintptr_t) &c, SIZE, sizeof(c[0]));
			
 
				+	starpu_vector_data_register(&handle_c, STARPU_MAIN_RAM, (uintptr_t) &c, SIZE, sizeof(c[0]));
			
 
				 
			
 
				 #if 0
			
 
				-    ret = starpu_task_insert(&cl, STARPU_R, handle_a, STARPU_R, handle_b, STARPU_W, handle_c, STARPU_TASK_SYNCHRONOUS, 1, 0);
			
 
				-    fprintf(stderr,"task submitted %d\n", ret);
			
 
				+	ret = starpu_task_insert(&cl, STARPU_R, handle_a, STARPU_R, handle_b, STARPU_W, handle_c, STARPU_TASK_SYNCHRONOUS, 1, 0);
			
 
				+	fprintf(stderr,"task submitted %d\n", ret);
			
 
				 #else
			
 
				-    ret = starpu_task_insert(&cl1, STARPU_R, handle_a, STARPU_R, handle_b, STARPU_W, handle_ct1, 0);
			
 
				-    fprintf(stderr,"task submitted %d\n", ret);
			
 
				-    ret = starpu_task_insert(&cl2, STARPU_R, handle_ct1, STARPU_R, handle_ct1, STARPU_W, handle_ct2, 0);
			
 
				-    fprintf(stderr,"task submitted %d\n", ret);
			
 
				-    ret = starpu_task_insert(&cl3, STARPU_R, handle_ct2, STARPU_R, handle_ct2, STARPU_W, handle_c, 0);
			
 
				-    fprintf(stderr,"task submitted %d\n", ret);
			
 
				+	ret = starpu_task_insert(&cl1, STARPU_R, handle_a, STARPU_R, handle_b, STARPU_W, handle_ct1, 0);
			
 
				+	fprintf(stderr,"task submitted %d\n", ret);
			
 
				+	ret = starpu_task_insert(&cl2, STARPU_R, handle_ct1, STARPU_R, handle_ct1, STARPU_W, handle_ct2, 0);
			
 
				+	fprintf(stderr,"task submitted %d\n", ret);
			
 
				+	ret = starpu_task_insert(&cl3, STARPU_R, handle_ct2, STARPU_R, handle_ct2, STARPU_W, handle_c, 0);
			
 
				+	fprintf(stderr,"task submitted %d\n", ret);
			
 
				 #endif
			
 
				-    
			
 
				-    starpu_data_unregister(handle_a);
			
 
				-    starpu_data_unregister(handle_b);
			
 
				-    starpu_data_unregister(handle_c);
			
 
				-    
			
 
				-    ret = EXIT_SUCCESS;
			
 
				-
			
 
				-    for (i = 0; i < SIZE; ++i) 
			
 
				-    {
			
 
				-	int ct1 = a[i] + b[i];
			
 
				-	int ct2 = ct1 * ct1;
			
 
				-	int ct3 = ct2 + ct2;
			
 
				-
			
 
				-	if (c[i] != ct3)
			
 
				-	    ret = EXIT_FAILURE;
			
 
				-
			
 
				-	if (i < 10) {
			
 
				-	    printf("%d == %d\n", c[i], ct3);
			
 
				-	    if (c[i] != ct3)
			
 
				-		printf("OOOPS\n");
			
 
				-	}
			
 
				-    }
			
 
				 
			
 
				+	starpu_data_unregister(handle_a);
			
 
				+	starpu_data_unregister(handle_b);
			
 
				+	starpu_data_unregister(handle_c);
			
 
				 
			
 
				-    starpu_shutdown();
			
 
				+	ret = EXIT_SUCCESS;
			
 
				 
			
 
				-    if (ret == EXIT_SUCCESS)
			
 
				-	printf("OK!\n");
			
 
				+	for (i = 0; i < SIZE; ++i)
			
 
				+	{
			
 
				+		int ct1 = a[i] + b[i];
			
 
				+		int ct2 = ct1 * ct1;
			
 
				+		int ct3 = ct2 + ct2;
			
 
				 
			
 
				-    return ret;
			
 
				+		if (c[i] != ct3)
			
 
				+			ret = EXIT_FAILURE;
			
 
				 
			
 
				-}
			
 
				+		if (i < 10)
			
 
				+		{
			
 
				+			printf("%d == %d\n", c[i], ct3);
			
 
				+			if (c[i] != ct3)
			
 
				+				printf("OOOPS\n");
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				 
			
 
				+	if (ret == EXIT_SUCCESS)
			
 
				+		printf("OK!\n");
			
 
				 
			
 
				+	return ret;
			
 
				+}
			
--- a/tests/perfmodels/opencl_memset_01.c
+++ b/tests/perfmodels/opencl_memset_01.c
@@ -1,8 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012                                     Inria
			
 
				- * Copyright (C) 2012,2015-2017                           CNRS
			
 
				- * Copyright (C) 2014,2016                                Université de Bordeaux
			
 
				+ * Copyright (C) 2012-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/tests/perfmodels/opencl_memset_kernel_01.cl
+++ b/tests/perfmodels/opencl_memset_kernel_01.cl
@@ -1,7 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012                                     Inria
			
 
				- * Copyright (C) 2012,2015,2017                           CNRS
			
 
				+ * Copyright (C) 2012-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/tests/perfmodels/regression_based_01.c
+++ b/tests/perfmodels/regression_based_01.c
@@ -1,8 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012,2016                                Inria
			
 
				- * Copyright (C) 2010-2015,2017                           Université de Bordeaux
			
 
				- * Copyright (C) 2010-2013,2015,2017                      CNRS
			
 
				+ * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/tests/perfmodels/regression_based_02.c
+++ b/tests/perfmodels/regression_based_02.c
@@ -1,9 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011,2012,2014                           Inria
			
 
				- * Copyright (C) 2011-2016,2019                           Université de Bordeaux
			
 
				- * Copyright (C) 2011-2017                                CNRS
			
 
				- * Copyright (C) 2011                                     Télécom-SudParis
			
 
				+ * Copyright (C) 2011-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ * Copyright (C) 2011       Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/tests/perfmodels/regression_based_03.c
+++ b/tests/perfmodels/regression_based_03.c
@@ -1,9 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011,2012,2014                           Inria
			
 
				- * Copyright (C) 2011-2016,2019                           Université de Bordeaux
			
 
				- * Copyright (C) 2011-2017                                CNRS
			
 
				- * Copyright (C) 2011                                     Télécom-SudParis
			
 
				+ * Copyright (C) 2011-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ * Copyright (C) 2011       Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by