4 years ago · 80265025c0
--- a/configure.ac
+++ b/configure.ac
@@ -114,28 +114,25 @@ fi
 
				 ###############################################################################
			
 
				 
			
 
				 #with or without automatic data transfers
			
 
				-AC_MSG_CHECKING(Automatic Data transfers for Fpga driver)
			
 
				-AC_ARG_ENABLE(autofpga, [AS_HELP_STRING([--enable-autofpga=<number>],
			
 
				-			[set 0 to disable])],
			
 
				-			autofpga=$enableval, autofpga=1)
			
 
				+AC_MSG_CHECKING(automatic data transfers for FPGA driver)
			
 
				+AC_ARG_ENABLE(autofpga, [AS_HELP_STRING([--disable-autofpga],
			
 
				+			[disable automatic data transfers for FPGA driver])],
			
 
				+			autofpga=$enableval, autofpga=yes)
			
 
				 AC_MSG_RESULT($autofpga)
			
 
				-
			
 
				-AC_DEFINE_UNQUOTED(STARPU_AUTOFPGA, [$autofpga],
			
 
				-		[automatic data transfer for Fpga])
			
 
				+if test x$autofpga = xyes ; then
			
 
				+   AC_DEFINE(STARPU_AUTOFPGA, [1], [automatic data transfer for Fpga])
			
 
				+fi
			
 
				 
			
 
				 #NUMBER OF FPGA DEVICES
			
 
				-AC_MSG_CHECKING(maximum number of Fpga devices)
			
 
				+AC_MSG_CHECKING(maximum number of FPGA devices)
			
 
				 AC_ARG_ENABLE(maxfpgadev, [AS_HELP_STRING([--enable-maxfpgadev=<number>],
			
 
				 			[maximum number of FPGA devices])],
			
 
				 			nmaxfpgadev=$enableval, nmaxfpgadev=12)
			
 
				 AC_MSG_RESULT($nmaxfpga)
			
 
				+AC_DEFINE_UNQUOTED(STARPU_MAXFPGADEVS, [$nmaxfpgadev],[maximum number of FPGA devices])
			
 
				 
			
 
				-#nmaxfpgadev=60
			
 
				-AC_DEFINE_UNQUOTED(STARPU_MAXFPGADEVS, [$nmaxfpgadev],
			
 
				-		[maximum number of Fpga devices])
			
 
				-		
			
 
				 AC_ARG_WITH([fpga],
			
 
				-	[AS_HELP_STRING([--with-fpga=<path>],[specify where Fpga lib is installed])],
			
 
				+	[AS_HELP_STRING([--with-fpga=<path>],[specify where FPGA lib is installed])],
			
 
				 	[fpga_dir="$withval"
			
 
				 	enable_fpga=yes],
			
 
				 	[enable_fpga=no]
			
@@ -146,17 +143,8 @@ AC_ARG_ENABLE(link-with-riffa, [AS_HELP_STRING([--disable-link-with-riffa],
 
				 AC_ARG_ENABLE(link-with-maxeler, [AS_HELP_STRING([--disable-link-with-maxeler],
			
 
				 	      [link with MAXELER])], [link_with_maxeler=$enableval], [link_with_maxeler=yes])
			
 
				 
			
 
				-
			
 
				-AC_MSG_CHECKING(whether Fpga should be used)
			
 
				-AC_MSG_RESULT($enable_fpga)
			
 
				-AC_SUBST(STARPU_USE_FPGA,$enable_fpga)
			
 
				-AM_CONDITIONAL(STARPU_USE_FPGA,test x$enable_fpga = xyes)
			
 
				-if test x$enable_fpga = xyes; then
			
 
				-   	AC_DEFINE(STARPU_USE_FPGA,[1],[Trying to set STARPU_USE_FPGA])
			
 
				-fi
			
 
				-
			
 
				-if test x$enable_fpga = xyes; then
			
 
				-
			
 
				+if test x$enable_fpga = xyes
			
 
				+then
			
 
				    	fpga_include_dir="${fpga_dir}/include"
			
 
				 	fpga_lib_dir="${fpga_dir}/lib"
			
 
				 
			
@@ -170,7 +158,7 @@ if test x$enable_fpga = xyes; then
 
				 	   STARPU_FPGA_CPPFLAGS="`slic-config --cflags | sed s/\'//g | sed "s/-I /-I/"` $STARPU_FPGA_CPPFLAGS"
			
 
				    	   STARPU_FPGA_LDFLAGS="`slic-config --libs | sed s/\'//g | sed "s/-L /-L/" | sed "s/-L /-L/"`"
			
 
				 	else
			
 
				-   	   STARPU_FPGA_LDFLAGS="-L$fpga_lib_dir -lfpga -lrt -lm"	
			
 
				+   	   STARPU_FPGA_LDFLAGS="-L$fpga_lib_dir -lfpga -lrt -lm"
			
 
				 	fi
			
 
				 
			
 
				 	CPPFLAGS="${CPPFLAGS} ${STARPU_FPGA_CPPFLAGS} "
			
@@ -185,19 +173,32 @@ if test x$enable_fpga = xyes; then
 
				 			[have_valid_fpga="yes"],
			
 
				 			[have_valid_fpga="no"]
			
 
				 			)
			
 
				-		AC_MSG_CHECKING(whether Fpga is working)
			
 
				-		if test x$have_valid_fpga = xyes; then
			
 
				-	   	   	AC_MSG_RESULT([:-)])
			
 
				-	   	else
			
 
				-			AC_MSG_RESULT([;(])
			
 
				-		fi
			
 
				-
			
 
				-		LDFLAGS="${SAVED_LDFLAGS}"
			
 
				-		CPPFLAGS="${SAVED_CPPFLAGS}"
			
 
				+		AC_MSG_CHECKING(whether FPGA is working)
			
 
				+		AC_MSG_RESULT($have_valid_fpga)
			
 
				 
			
 
				 		AC_SUBST(STARPU_FPGA_CPPFLAGS)
			
 
				 		AC_SUBST(STARPU_FPGA_LDFLAGS)
			
 
				+	else
			
 
				+		AC_MSG_CHECKING(whether FPGA is installed)
			
 
				+		AC_MSG_RESULT([no])
			
 
				 	fi
			
 
				+	LDFLAGS="${SAVED_LDFLAGS}"
			
 
				+	CPPFLAGS="${SAVED_CPPFLAGS}"
			
 
				+
			
 
				+	# in case FPGA was explicitly required, but is not available, this is an error
			
 
				+	if test x$enable_fpga = xyes -a x$have_valid_fpga = xno; then
			
 
				+		AC_MSG_ERROR([cannot find FPGA])
			
 
				+    	fi
			
 
				+	# now we enable FPGA if and only if a proper setup is available
			
 
				+	enable_fpga=$have_valid_fpga
			
 
				+fi
			
 
				+
			
 
				+AC_MSG_CHECKING(whether FPGA should be used)
			
 
				+AC_MSG_RESULT($enable_fpga)
			
 
				+AC_SUBST(STARPU_USE_FPGA,$enable_fpga)
			
 
				+AM_CONDITIONAL(STARPU_USE_FPGA,test x$enable_fpga = xyes)
			
 
				+if test x$enable_fpga = xyes; then
			
 
				+   	AC_DEFINE(STARPU_USE_FPGA,[1],[FPGA support is activated])
			
 
				 fi
			
 
				 
			
 
				 
			
--- a/doc/doxygen/Makefile.am
+++ b/doc/doxygen/Makefile.am
@@ -106,7 +106,6 @@ chapters =	\
 
				 	chapters/code/disk_compute.c \
			
 
				 	chapters/code/nf_initexit.f90 \
			
 
				 	chapters/api/fft_support.doxy \
			
 
				-	chapters/api/fpga_extensions.doxy \
			
 
				 	chapters/api/versioning.doxy \
			
 
				 	chapters/api/threads.doxy
			
 
				 
			
--- a/doc/doxygen/chapters/440_fpga_support.doxy
+++ b/doc/doxygen/chapters/440_fpga_support.doxy
@@ -17,15 +17,34 @@
 
				 /*! \page FPGASupport FPGA Support
			
 
				 
			
 
				 \section Introduction Introduction
			
 
				-Maxeler provides hardware and software solutions for accelerating computing applications on dataflow engines (DFEs). DFEs are in-house designed accelerators that encapsulate reconfigurable high-end FPGAs at their core and are equipped with large amounts of DDR memory.
			
 
				-We extend the StarPU task programming library that initially targets heterogeneous architectures to support Field Programmable Gate Array (FPGA). 
			
 
				-To create <c>StarPU/FPGA</c> applications exploiting DFE configurations, MaxCompiler allows an application to be split into three parts:
			
 
				 
			
 
				-- <c>Kernel</c>, which implements the computational components of the application in hardware.
			
 
				-- <c>Manager configuration</c>, which connects Kernels to the CPU, engine RAM, other Kernels and other DFEs via MaxRing.
			
 
				-- <c>CPU application</c>, which interacts with the DFEs to read and write data to the Kernels and engine RAM.
			
 
				-
			
 
				-The Simple Live CPU interface (SLiC) is Maxeler’s application programming interface for seamless CPU-DFE integration. SLiC allows CPU applications to configure and load a number of DFEs as well as to subsequently schedule and run actions on those DFEs using simple function calls. In StarPU/FPGA applications, we use <c>Dynamic SLiC Interface</c> to exchange data streams between the CPU (Main Memory) and DFE (Local Memory).
			
 
				+Maxeler provides hardware and software solutions for accelerating
			
 
				+computing applications on dataflow engines (DFEs). DFEs are in-house
			
 
				+designed accelerators that encapsulate reconfigurable high-end FPGAs
			
 
				+at their core and are equipped with large amounts of DDR memory.
			
 
				+
			
 
				+We extend the StarPU task programming library that initially targets
			
 
				+heterogeneous architectures to support Field Programmable Gate Array
			
 
				+(FPGA).
			
 
				+
			
 
				+To create <c>StarPU/FPGA</c> applications exploiting DFE
			
 
				+configurations, MaxCompiler allows an application to be split into
			
 
				+three parts:
			
 
				+
			
 
				+- <c>Kernel</c>, which implements the computational components of the
			
 
				+  application in hardware.
			
 
				+- <c>Manager configuration</c>, which connects Kernels to the CPU,
			
 
				+  engine RAM, other Kernels and other DFEs via MaxRing.
			
 
				+- <c>CPU application</c>, which interacts with the DFEs to read and
			
 
				+  write data to the Kernels and engine RAM.
			
 
				+
			
 
				+The Simple Live CPU interface (SLiC) is Maxeler’s application
			
 
				+programming interface for seamless CPU-DFE integration. SLiC allows
			
 
				+CPU applications to configure and load a number of DFEs as well as to
			
 
				+subsequently schedule and run actions on those DFEs using simple
			
 
				+function calls. In StarPU/FPGA applications, we use <em>Dynamic SLiC
			
 
				+Interface</em> to exchange data streams between the CPU (Main Memory)
			
 
				+and DFE (Local Memory).
			
 
				 
			
 
				 \section PortingApplicationsToFPGA Porting Applications to FPGA
			
 
				 
			
@@ -43,12 +62,22 @@ struct starpu_codelet cl =
 
				 
			
 
				 \subsection FPGAExample StarPU/FPGA Application
			
 
				 
			
 
				-To give you an idea of the interface that we used to exchange data between <c>host</c> (CPU) and <c>FPGA</c> (DFE), here is an example, based on one of the examples of Maxeler (https://trac.version.fz-juelich.de/reconfigurable/wiki/Public). 
			
 
				-<c>StreamFMAKernel.maxj</c> represents the Java kernel code; it implements a very simple kernel (c=a+b), and <c>Test.c</c> starts it from the <c>fpga_add</c> function; it first sets streaming up from the CPU pointers, triggers execution and waits for the result. The API to interact with DFEs is called <c>SLiC</c> which then also involves the <c> MaxelerOS</c> runtime.
			
 
				+To give you an idea of the interface that we used to exchange data
			
 
				+between <c>host</c> (CPU) and <c>FPGA</c> (DFE), here is an example,
			
 
				+based on one of the examples of Maxeler
			
 
				+(https://trac.version.fz-juelich.de/reconfigurable/wiki/Public).
			
 
				 
			
 
				+<c>StreamFMAKernel.maxj</c> represents the Java kernel code; it
			
 
				+implements a very simple kernel (<c>c=a+b</c>), and <c>Test.c</c> starts it
			
 
				+from the <c>fpga_add</c> function; it first sets streaming up from the
			
 
				+CPU pointers, triggers execution and waits for the result. The API to
			
 
				+interact with DFEs is called <em>SLiC</em> which then also involves the
			
 
				+<c>MaxelerOS</c> runtime.
			
 
				 
			
 
				-- <c>StreamFMAKernel.maxj</c>: the DFE part is described in the MaxJ programming language which is a Java-based metaprogramming approach.
			
 
				-\code{.c}
			
 
				+- <c>StreamFMAKernel.maxj</c>: the DFE part is described in the MaxJ
			
 
				+  programming language which is a Java-based metaprogramming approach.
			
 
				+
			
 
				+\code{.java}
			
 
				 package tests;
			
 
				 
			
 
				 import com.maxeler.maxcompiler.v2.kernelcompiler.Kernel;
			
@@ -56,11 +85,13 @@ import com.maxeler.maxcompiler.v2.kernelcompiler.KernelParameters;
 
				 import com.maxeler.maxcompiler.v2.kernelcompiler.types.base.DFEType;
			
 
				 import com.maxeler.maxcompiler.v2.kernelcompiler.types.base.DFEVar;
			
 
				 
			
 
				-class StreamFMAKernel extends Kernel {
			
 
				+class StreamFMAKernel extends Kernel
			
 
				+{
			
 
				 
			
 
				    private static final DFEType type = dfeInt(32);
			
 
				 
			
 
				-   protected StreamFMAKernel(KernelParameters parameters) {
			
 
				+   protected StreamFMAKernel(KernelParameters parameters)
			
 
				+   {
			
 
				              super(parameters);
			
 
				 
			
 
				 	     DFEVar a = io.input("a", type);
			
@@ -70,25 +101,27 @@ class StreamFMAKernel extends Kernel {
 
				 	     c = a+b;
			
 
				 
			
 
				 	     io.output("output", c, type);
			
 
				-	}
			
 
				-
			
 
				+   }
			
 
				 }
			
 
				-
			
 
				 \endcode
			
 
				 
			
 
				-- <c>StreamFMAManager.maxj</c>: is also described in the MaxJ programming language and orchestrates data movement between the host and the DFE.
			
 
				-\code{.c}
			
 
				+- <c>StreamFMAManager.maxj</c>: is also described in the MaxJ
			
 
				+  programming language and orchestrates data movement between the host
			
 
				+  and the DFE.
			
 
				+
			
 
				+\code{.java}
			
 
				 package tests;
			
 
				 
			
 
				 import com.maxeler.maxcompiler.v2.build.EngineParameters;
			
 
				 import com.maxeler.maxcompiler.v2.managers.custom.blocks.KernelBlock;
			
 
				 import com.maxeler.platform.max5.manager.Max5LimaManager;
			
 
				 
			
 
				-class StreamFMAManager extends Max5LimaManager {
			
 
				-
			
 
				+class StreamFMAManager extends Max5LimaManager
			
 
				+{
			
 
				 	private static final String kernel_name = "StreamFMAKernel";
			
 
				 
			
 
				-	public StreamFMAManager(EngineParameters arg0) {
			
 
				+	public StreamFMAManager(EngineParameters arg0)
			
 
				+	{
			
 
				 		super(arg0);
			
 
				 		KernelBlock kernel = addKernel(new StreamFMAKernel(makeKernelParameters(kernel_name)));
			
 
				 		kernel.getInput("a") <== addStreamFromCPU("a");
			
@@ -96,41 +129,54 @@ class StreamFMAManager extends Max5LimaManager {
 
				 		addStreamToCPU("output") <== kernel.getOutput("output");
			
 
				 	}
			
 
				 
			
 
				-	public static void main(String[] args) {
			
 
				+	public static void main(String[] args)
			
 
				+	{
			
 
				 		StreamFMAManager manager = new StreamFMAManager(new EngineParameters(args));
			
 
				 		manager.build();
			
 
				 	}
			
 
				 }
			
 
				 \endcode
			
 
				 
			
 
				-Once <c>StreamFMAKernel.maxj</c> and <c>StreamFMAManager.maxj</c> are written, there are other steps to do:
			
 
				+Once <c>StreamFMAKernel.maxj</c> and <c>StreamFMAManager.maxj</c> are
			
 
				+written, there are other steps to do:
			
 
				 
			
 
				 - Building the JAVA program: (for Kernel and Manager (.maxj))
			
 
				 \verbatim
			
 
				 $ maxjc -1.7 -cp $MAXCLASSPATH streamfma/
			
 
				 \endverbatim
			
 
				 
			
 
				-- Running the Java program to generate a DFE implementation (a .max file) that can be called from a StarPU/FPGA application and slic headers (.h) for simulation:
			
 
				+- Running the Java program to generate a DFE implementation (a .max
			
 
				+  file) that can be called from a StarPU/FPGA application and slic
			
 
				+  headers (.h) for simulation:
			
 
				+
			
 
				 \verbatim
			
 
				 $ java -XX:+UseSerialGC -Xmx2048m -cp $MAXCLASSPATH:. streamfma.StreamFMAManager DFEModel=MAIA maxFileName=StreamFMA target=DFE_SIM
			
 
				 \endverbatim
			
 
				 
			
 
				-- Build the slic object file (simulation): 
			
 
				+- Build the slic object file (simulation):
			
 
				+
			
 
				 \verbatim
			
 
				 $ sliccompile StreamFMA.max
			
 
				 \endverbatim
			
 
				 
			
 
				 - <c>Test.c </c>:
			
 
				-to interface StarPU task-based runtime system with Maxeler's DFE devices, we use the advanced dynamic interface of <c>SLiC</c> in <b>non_blocking</b> mode.  
			
 
				-Test code must include <c>MaxSLiCInterface.h</c> and <c>MaxFile.h</c>. The .max file contains the bitstream. The StarPU/FPGA application can be written in C, C++, etc.
			
 
				+
			
 
				+To interface the StarPU task-based runtime system with Maxeler's DFE
			
 
				+devices, we use the advanced dynamic interface of <em>SLiC</em> in
			
 
				+<b>non_blocking</b> mode.
			
 
				+
			
 
				+Test code must include <c>MaxSLiCInterface.h</c> and <c>MaxFile.h</c>.
			
 
				+The .max file contains the bitstream. The StarPU/FPGA application can
			
 
				+be written in C, C++, etc.
			
 
				+
			
 
				 \code{.c}
			
 
				 #include "StreamFMA.h"
			
 
				 #include "MaxSLiCInterface.h"
			
 
				 
			
 
				 void fpga_add(void *buffers[], void *cl_arg)
			
 
				-{   
			
 
				+{
			
 
				     (void)cl_arg;
			
 
				-    
			
 
				+
			
 
				     int *a = (int*) STARPU_VECTOR_GET_PTR(buffers[0]);
			
 
				     int *b = (int*) STARPU_VECTOR_GET_PTR(buffers[1]);
			
 
				     int *c = (int*) STARPU_VECTOR_GET_PTR(buffers[2]);
			
@@ -142,11 +188,11 @@ void fpga_add(void *buffers[], void *cl_arg)
 
				 
			
 
				     /* set the number of ticks for a kernel */
			
 
				     max_set_ticks  (act, "StreamFMAKernel", size);
			
 
				-    
			
 
				+
			
 
				     /* send input streams */
			
 
				-    max_queue_input(act, "a", a, size *sizeof(a[0])); 
			
 
				+    max_queue_input(act, "a", a, size *sizeof(a[0]));
			
 
				     max_queue_input(act, "b", b, size*sizeof(b[0]));
			
 
				-    
			
 
				+
			
 
				     /* store output stream */
			
 
				     max_queue_output(act,"output", c, size*sizeof(c[0]));
			
 
				 
			
@@ -158,7 +204,6 @@ void fpga_add(void *buffers[], void *cl_arg)
 
				 
			
 
				     printf("*** wait for the actions on DFE to complete *** \n");
			
 
				     max_wait(run0);
			
 
				-     
			
 
				   }
			
 
				 
			
 
				   static struct starpu_codelet cl =
			
@@ -172,14 +217,13 @@ void fpga_add(void *buffers[], void *cl_arg)
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				- 
			
 
				     ...
			
 
				 
			
 
				     /* Implementation of a maxfile */
			
 
				     max_file_t *maxfile = StreamFMA_init();
			
 
				 
			
 
				     /* Implementation of an engine */
			
 
				-    max_engine_t *engine = max_load(maxfile, "*"); 
			
 
				+    max_engine_t *engine = max_load(maxfile, "*");
			
 
				 
			
 
				     starpu_init(NULL);
			
 
				 
			
@@ -192,19 +236,26 @@ int main(int argc, char **argv)
 
				 
			
 
				     /* unload and deallocate an engine obtained by way of max_load */
			
 
				     max_unload(engine);
			
 
				-    
			
 
				+
			
 
				     return 0;
			
 
				 }
			
 
				 \endcode
			
 
				 
			
 
				-To write the StarPU/FPGA application: first, the programmer must describe the codelet using StarPU’s C API. This codelet provides both a CPU implementation and an FPGA one. It also specifies that the task has two inputs and one output through the <c>nbuffers</c> and <c>modes</c> attributes.
			
 
				+To write the StarPU/FPGA application: first, the programmer must
			
 
				+describe the codelet using StarPU’s C API. This codelet provides both
			
 
				+a CPU implementation and an FPGA one. It also specifies that the task
			
 
				+has two inputs and one output through the starpu_codelet::nbuffers and
			
 
				+starpu_codelet::modes attributes.
			
 
				 
			
 
				-<c>fpga_add</c> function is the name of the FPGA implementation and is mainly divided in four steps:
			
 
				+<c>fpga_add</c> function is the name of the FPGA implementation and is
			
 
				+mainly divided into five steps:
			
 
				 
			
 
				 - Init actions to be run on DFE.
			
 
				 - Add data to an input stream for an action.
			
 
				 - Add data storage space for an output stream.
			
 
				-- Run actions on DFE in <b>non_blocking</b> mode; a non-blocking call returns immediately, allowing the calling code to do more CPU work in parallel while the actions are run.
			
 
				+- Run actions on DFE in <b>non_blocking</b> mode; a non-blocking call
			
 
				+  returns immediately, allowing the calling code to do more CPU work
			
 
				+  in parallel while the actions are run.
			
 
				 - Wait for the actions to complete.
			
 
				 
			
 
				 In the <c>main</c> function, there are four important steps:
			
@@ -214,31 +265,46 @@ In the <c>main</c> function, there are four important steps:
 
				 - Free actions.
			
 
				 - Unload and deallocate the DFE.
			
 
				 
			
 
				-The rest of the application (data registration, task submission, etc.) is as usual with StarPU
			
 
				+The rest of the application (data registration, task submission, etc.)
			
 
				+is as usual with StarPU.
			
 
				 
			
 
				 \subsection FPGADataTransfers Data Transfers in StarPU/FPGA Applications
			
 
				 
			
 
				-The communication between the host and the DFE is done through the <c>Dynamic advance interface</c> to exchange data between the main memory and the local memory of the DFE.
			
 
				-For instant, we use \ref STARPU_MAIN_RAM to send and store data to/from DFE's local memory. However, we aim to use a multiplexer to choose which memory node we will use to read/write data. So, the user can tell that the computational kernel will take data from the main memory or DFE's local memory for example.
			
 
				+The communication between the host and the DFE is done through the
			
 
				+<em>advanced dynamic interface</em> to exchange data between the main
			
 
				+memory and the local memory of the DFE.
			
 
				+
			
 
				+For the moment, we use \ref STARPU_MAIN_RAM to send and store data
			
 
				+to/from DFE's local memory. However, we aim to use a multiplexer to
			
 
				+choose which memory node we will use to read/write data. So, the user
			
 
				+can tell that the computational kernel will take data from the main
			
 
				+memory or DFE's local memory for example.
			
 
				 
			
 
				-In starPU applications, When \ref starpu_codelet::specific_nodes is 1, this specifies the memory nodes where each data should be sent to for task execution.
			
 
				-  
			
 
				+In StarPU applications, when \ref starpu_codelet::specific_nodes is
			
 
				+set to 1, this specifies the memory nodes where each data should be
			
 
				+sent to for task execution.
			
 
				 
			
 
				 \subsection FPGAConfiguration FPGA Configuration
			
 
				 
			
 
				-To configure StarPU with FPGA accelerators, we can enable <c>FPGA</c> through the \c configure option <b>"--with-fpga"</b>.
			
 
				+To configure StarPU with FPGA accelerators, we can enable <c>FPGA</c>
			
 
				+through the \c configure option \ref with-fpga "--with-fpga".
			
 
				+
			
 
				+Compiling and installing a StarPU/FPGA application is done following the
			
 
				+standard procedure:
			
 
				 
			
 
				-Compiling and installing StarPU/FPGA application is done following the standard procedure:
			
 
				 \verbatim
			
 
				 $ make
			
 
				 $ make install
			
 
				 \endverbatim
			
 
				 
			
 
				-
			
 
				 \subsection FPGALaunchingprograms  Launching Programs: Simulation
			
 
				 
			
 
				-Maxeler provides a simple tutorial to use MaxCompiler (https://trac.version.fz-juelich.de/reconfigurable/wiki/Public). Running the Java program to generate maxfile and slic headers (hardware) on Maxeler's DFE device, takes a VERY long time, approx. 2 hours even for this very small example. That's why we use the simulation.  
			
 
				-
			
 
				+Maxeler provides a simple tutorial to use MaxCompiler
			
 
				+(https://trac.version.fz-juelich.de/reconfigurable/wiki/Public).
			
 
				+Running the Java program to generate maxfile and slic headers
			
 
				+(hardware) on Maxeler's DFE device takes a VERY long time, approx. 2
			
 
				+hours even for this very small example. That's why we use the
			
 
				+simulation.
			
 
				 
			
 
				 - To start the simulation on Maxeler's DFE device:
			
 
				 \verbatim
			
@@ -256,8 +322,8 @@ cores by setting the \ref STARPU_NCPU environment variable to 0.
 
				 \verbatim
			
 
				 $ STARPU_NCPU=0 ./StreamFMA
			
 
				 \endverbatim
			
 
				- 
			
 
				-- To stop the simulation 
			
 
				+
			
 
				+- To stop the simulation:
			
 
				 \verbatim
			
 
				 $ maxcompilersim -c LIMA -n StreamFMA stop
			
 
				 \endverbatim
			
--- a/doc/doxygen/chapters/510_configure_options.doxy
+++ b/doc/doxygen/chapters/510_configure_options.doxy
@@ -370,6 +370,14 @@ the macro ::STARPU_MAXNODES. Reducing it allows to considerably reduce memory
 
				 used by StarPU data structures.
			
 
				 </dd>
			
 
				 
			
 
				+<dt>--with-fpga=<c>dir</c></dt>
			
 
				+<dd>
			
 
				+\anchor with-fpga
			
 
				+\addindex __configure__--with-fpga
			
 
				+Enable the FPGA driver support, and optionally specify the location of
			
 
				+the FPGA library.
			
 
				+</dd>
			
 
				+
			
 
				 </dl>
			
 
				 
			
 
				 \section ExtensionConfiguration Extension Configuration
			
--- a/doc/doxygen/chapters/api/fpga_extensions.doxy
+++ b/doc/doxygen/chapters/api/fpga_extensions.doxy
@@ -1,28 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2019-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-/*! \ingroup API_FPGA_Extensions
			
 
				-
			
 
				-\def STARPU_USE_FPGA
			
 
				-\ingroup API_FPGA_Extensions
			
 
				-Defined when StarPU has been installed with FPGA support.
			
 
				-It should be used in your code to detect the availability of FPGA.
			
 
				-
			
 
				-\def STARPU_MAXFPGADEVS
			
 
				-\ingroup API_FPGA_Extensions
			
 
				-Define the maximum number of FPGA devices that are supported by StarPU.
			
 
				-
			
 
				-*/
			
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -538,11 +538,6 @@ nobase_STARPU_OPENCL_DATA_DATA += 		\
 
				 	basic_examples/block_opencl_kernel.cl
			
 
				 endif
			
 
				 
			
 
				-if STARPU_USE_FPGA
			
 
				-basic_examples_mmult_SOURCES =                                    \
			
 
				-	basic_examples/mult-fpga.c
			
 
				-endif
			
 
				-
			
 
				 ####################
			
 
				 # Variable example #
			
 
				 ####################
			
--- a/examples/basic_examples/mult-fpga.c
+++ b/examples/basic_examples/mult-fpga.c
@@ -1,396 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				- * Copyright (C) 2010       Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-/*
			
 
				- * This example shows a simple implementation of a blocked matrix
			
 
				- * multiplication. Note that this is NOT intended to be an efficient
			
 
				- * implementation of sgemm! In this example, we show:
			
 
				- *  - how to declare dense matrices (starpu_matrix_data_register)
			
 
				- *  - how to manipulate matrices within codelets (eg. descr[0].blas.ld)
			
 
				- *  - how to use filters to partition the matrices into blocks
			
 
				- *    (starpu_data_partition and starpu_data_map_filters)
			
 
				- *  - how to unpartition data (starpu_data_unpartition) and how to stop
			
 
				- *    monitoring data (starpu_data_unregister)
			
 
				- *  - how to manipulate subsets of data (starpu_data_get_sub_data)
			
 
				- *  - how to construct an autocalibrated performance model (starpu_perfmodel)
			
 
				- *  - how to submit asynchronous tasks
			
 
				- */
			
 
				-
			
 
				-#include <string.h>
			
 
				-#include <math.h>
			
 
				-#include <sys/types.h>
			
 
				-#include <signal.h>
			
 
				-
			
 
				-#include <starpu.h>
			
 
				-
			
 
				-static float *A, *B, *C;
			
 
				-static starpu_data_handle_t A_handle, B_handle, C_handle;
			
 
				-
			
 
				-static unsigned nslicesx = 4;
			
 
				-static unsigned nslicesy = 4;
			
 
				-#ifdef STARPU_QUICK_CHECK
			
 
				-static unsigned xdim = 512;
			
 
				-static unsigned ydim = 512;
			
 
				-static unsigned zdim = 256;
			
 
				-#else
			
 
				-static unsigned xdim = 1024;
			
 
				-static unsigned ydim = 1024;
			
 
				-static unsigned zdim = 512;
			
 
				-#endif
			
 
				-
			
 
				-
			
 
				-/*
			
 
				- * That program should compute C = A * B 
			
 
				- * 
			
 
				- *   A of size (z,y)
			
 
				- *   B of size (x,z)
			
 
				- *   C of size (x,y)
			
 
				-
			
 
				-              |---------------|
			
 
				-            z |       B       |
			
 
				-              |---------------|
			
 
				-       z              x
			
 
				-     |----|   |---------------|
			
 
				-     |    |   |               |
			
 
				-     |    |   |               |
			
 
				-     | A  | y |       C       |
			
 
				-     |    |   |               |
			
 
				-     |    |   |               |
			
 
				-     |----|   |---------------|
			
 
				-
			
 
				- */
			
 
				-
			
 
				-/*
			
 
				- * The codelet is passed 3 matrices, the "descr" union-type field gives a
			
 
				- * description of the layout of those 3 matrices in the local memory (ie. RAM
			
 
				- * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
			
 
				- * registered data with the "matrix" data interface, we use the matrix macros.
			
 
				- */
			
 
				-
			
 
				-void cpu_mult(void *descr[], STARPU_ATTRIBUTE_UNUSED  void *arg)
			
 
				-{
			
 
				-	float *subA, *subB, *subC;
			
 
				-	uint32_t nxC, nyC, nyA;
			
 
				-	uint32_t ldA, ldB, ldC;
			
 
				-
			
 
				-	/* .blas.ptr gives a pointer to the first element of the local copy */
			
 
				-	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				-	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				-	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
			
 
				-
			
 
				-	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
			
 
				-	 * is the number of lines that are separated by .blas.ld elements (ld
			
 
				-	 * stands for leading dimension).
			
 
				-	 * NB: in case some filters were used, the leading dimension is not
			
 
				-	 * guaranteed to be the same in main memory (on the original matrix)
			
 
				-	 * and on the accelerator! */
			
 
				-	nxC = STARPU_MATRIX_GET_NX(descr[2]);
			
 
				-	nyC = STARPU_MATRIX_GET_NY(descr[2]);
			
 
				-	nyA = STARPU_MATRIX_GET_NY(descr[0]);
			
 
				-
			
 
				-	ldA = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				-	ldB = STARPU_MATRIX_GET_LD(descr[1]);
			
 
				-	ldC = STARPU_MATRIX_GET_LD(descr[2]);
			
 
				-
			
 
				-	/* we assume a FORTRAN-ordering! */
			
 
				-	unsigned i,j,k;
			
 
				-	for (i = 0; i < nyC; i++)
			
 
				-	{
			
 
				-		for (j = 0; j < nxC; j++)
			
 
				-		{
			
 
				-			float sum = 0.0;
			
 
				-
			
 
				-			for (k = 0; k < nyA; k++)
			
 
				-			{
			
 
				-				sum += subA[j+k*ldA]*subB[k+i*ldB];
			
 
				-			}
			
 
				-
			
 
				-			subC[j + i*ldC] = sum;
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-static void init_problem_data(void)
			
 
				-{
			
 
				-	unsigned i,j;
			
 
				-
			
 
				-	/* we initialize matrices A, B and C in the usual way */
			
 
				-
			
 
				-	A = (float *) malloc(zdim*ydim*sizeof(float));
			
 
				-	B = (float *) malloc(xdim*zdim*sizeof(float));
			
 
				-	C = (float *) malloc(xdim*ydim*sizeof(float));
			
 
				-
			
 
				-	/* fill the A and B matrices */
			
 
				-	srand(2009);
			
 
				-	for (j=0; j < ydim; j++)
			
 
				-	{
			
 
				-		for (i=0; i < zdim; i++)
			
 
				-		{
			
 
				-			A[j+i*ydim] = (float)(starpu_drand48());
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	for (j=0; j < zdim; j++)
			
 
				-	{
			
 
				-		for (i=0; i < xdim; i++)
			
 
				-		{
			
 
				-			B[j+i*zdim] = (float)(starpu_drand48());
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	for (j=0; j < ydim; j++)
			
 
				-	{
			
 
				-		for (i=0; i < xdim; i++)
			
 
				-		{
			
 
				-			C[j+i*ydim] = (float)(0);
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-static void partition_mult_data(void)
			
 
				-{
			
 
				-	/* note that we assume a FORTRAN ordering here! */
			
 
				-
			
 
				-	/* The BLAS data interface is described by 4 parameters: 
			
 
				-	 *  - the location of the first element of the matrix to monitor (3rd
			
 
				-	 *    argument)
			
 
				-	 *  - the number of elements between columns, aka leading dimension
			
 
				-	 *    (4th arg)
			
 
				-	 *  - the number of (contiguous) elements per column, ie. contiguous
			
 
				-	 *  elements (5th arg)
			
 
				-	 *  - the number of columns (6th arg)
			
 
				-	 * The first elements is a pointer to the data_handle that will be
			
 
				-	 * associated to the matrix, and the second elements gives the memory
			
 
				-	 * node in which resides the matrix: 0 means that the 3rd argument is
			
 
				-	 * an adress in main memory.
			
 
				-	 */
			
 
				-	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A, 
			
 
				-		ydim, ydim, zdim, sizeof(float));
			
 
				-	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B, 
			
 
				-		zdim, zdim, xdim, sizeof(float));
			
 
				-	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C, 
			
 
				-		ydim, ydim, xdim, sizeof(float));
			
 
				-
			
 
				-	/* A filter is a method to partition a data into disjoint chunks, it is
			
 
				-	 * described by the means of the "struct starpu_data_filter" structure that
			
 
				-	 * contains a function that is applied on a data handle to partition it
			
 
				-	 * into smaller chunks, and an argument that is passed to the function
			
 
				-	 * (eg. the number of blocks to create here).
			
 
				-	 */
			
 
				-
			
 
				-	/* StarPU supplies some basic filters such as the partition of a matrix
			
 
				-	 * into blocks, note that we are using a FORTRAN ordering so that the
			
 
				-	 * name of the filters are a bit misleading */
			
 
				-	struct starpu_data_filter vert =
			
 
				-	{
			
 
				-		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				-		.nchildren = nslicesx
			
 
				-	};
			
 
				-
			
 
				-	struct starpu_data_filter horiz =
			
 
				-	{
			
 
				-		.filter_func = starpu_matrix_filter_block,
			
 
				-		.nchildren = nslicesy
			
 
				-	};
			
 
				-
			
 
				-/*
			
 
				- *	Illustration with nslicex = 4 and nslicey = 2, it is possible to access
			
 
				- *	sub-data by using the "starpu_data_get_sub_data" method, which takes a data handle,
			
 
				- *	the number of filters to apply, and the indexes for each filters, for
			
 
				- *	instance:
			
 
				- *
			
 
				- *		A' handle is starpu_data_get_sub_data(A_handle, 1, 1); 
			
 
				- *		B' handle is starpu_data_get_sub_data(B_handle, 1, 2); 
			
 
				- *		C' handle is starpu_data_get_sub_data(C_handle, 2, 2, 1); 
			
 
				- *
			
 
				- *	Note that here we applied 2 filters recursively onto C.
			
 
				- *
			
 
				- *	"starpu_data_get_sub_data(C_handle, 1, 3)" would return a handle to the 4th column
			
 
				- *	of blocked matrix C for example.
			
 
				- *
			
 
				- *		              |---|---|---|---|
			
 
				- *		              |   |   | B'|   | B
			
 
				- *		              |---|---|---|---|
			
 
				- *		                0   1   2   3
			
 
				- *		     |----|   |---|---|---|---|
			
 
				- *		     |    |   |   |   |   |   |
			
 
				- *		     |    | 0 |   |   |   |   |
			
 
				- *		     |----|   |---|---|---|---|
			
 
				- *		     | A' |   |   |   | C'|   |
			
 
				- *		     |    |   |   |   |   |   |
			
 
				- *		     |----|   |---|---|---|---|
			
 
				- *		       A              C
			
 
				- *
			
 
				- *	IMPORTANT: applying filters is equivalent to partitionning a piece of
			
 
				- *	data in a hierarchical manner, so that memory consistency is enforced
			
 
				- *	for each of the elements independantly. The tasks should therefore NOT
			
 
				- *	access inner nodes (eg. one column of C or the whole C) but only the
			
 
				- *	leafs of the tree (ie. blocks here). Manipulating inner nodes is only
			
 
				- *	possible by disapplying the filters (using starpu_data_unpartition), to
			
 
				- *	enforce memory consistency.
			
 
				- */
			
 
				-
			
 
				-	starpu_data_partition(B_handle, &vert);
			
 
				-	starpu_data_partition(A_handle, &horiz);
			
 
				-
			
 
				-	/* starpu_data_map_filters is a variable-arity function, the first argument
			
 
				-	 * is the handle of the data to partition, the second argument is the
			
 
				-	 * number of filters to apply recursively. Filters are applied in the
			
 
				-	 * same order as the arguments.
			
 
				-	 * This would be equivalent to starpu_data_partition(C_handle, &vert) and
			
 
				-	 * then applying horiz on each sub-data (ie. each column of C)
			
 
				-	 */
			
 
				-	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
			
 
				-}
			
 
				-
			
 
				-static struct starpu_perfmodel mult_perf_model =
			
 
				-{
			
 
				-	.type = STARPU_HISTORY_BASED,
			
 
				-	.symbol = "mult_perf_model"
			
 
				-};
			
 
				-
			
 
				-static struct starpu_codelet cl =
			
 
				-{
			
 
				-        /* we can only execute that kernel on a CPU yet */
			
 
				-        /* CPU implementation of the codelet */
			
 
				-        .cpu_funcs = {cpu_mult},
			
 
				-        .cpu_funcs_name = {"cpu_mult"},
			
 
				-        /* the codelet manipulates 3 buffers that are managed by the
			
 
				-         * DSM */
			
 
				-        .nbuffers = 3,
			
 
				-	.modes = {STARPU_R, STARPU_R, STARPU_W},
			
 
				-        /* in case the scheduling policy may use performance models */
			
 
				-        .model = &mult_perf_model
			
 
				-};
			
 
				-
			
 
				-static int launch_tasks(void)
			
 
				-{
			
 
				-	int ret;
			
 
				-	/* partition the work into slices */
			
 
				-	unsigned taskx, tasky;
			
 
				-
			
 
				-	for (taskx = 0; taskx < nslicesx; taskx++) 
			
 
				-	{
			
 
				-		for (tasky = 0; tasky < nslicesy; tasky++)
			
 
				-		{
			
 
				-			/* C[taskx, tasky] = A[tasky] B[taskx] */
			
 
				-
			
 
				-			/* by default, starpu_task_create() returns an
			
 
				- 			 * asynchronous task (ie. task->synchronous = 0) */
			
 
				-			struct starpu_task *task = starpu_task_create();
			
 
				-
			
 
				-			/* this task implements codelet "cl" */
			
 
				-			task->cl = &cl;
			
 
				-
			
 
				-			/*
			
 
				-			 *              |---|---|---|---|
			
 
				-			 *              |   | * |   |   | B
			
 
				-			 *              |---|---|---|---|
			
 
				-			 *                    X 
			
 
				-			 *     |----|   |---|---|---|---|
			
 
				-			 *     |****| Y |   |***|   |   |
			
 
				-			 *     |****|   |   |***|   |   |
			
 
				-			 *     |----|   |---|---|---|---|
			
 
				-			 *     |    |   |   |   |   |   |
			
 
				-			 *     |    |   |   |   |   |   |
			
 
				-			 *     |----|   |---|---|---|---|
			
 
				-			 *       A              C
			
 
				-			 */
			
 
				-
			
 
				-			/* there was a single filter applied to matrices A
			
 
				-			 * (respectively B) so we grab the handle to the chunk
			
 
				-			 * identified by "tasky" (respectively "taskx). The "1"
			
 
				-			 * tells StarPU that there is a single argument to the
			
 
				-			 * variable-arity function starpu_data_get_sub_data */
			
 
				-			task->handles[0] = starpu_data_get_sub_data(A_handle, 1, tasky);
			
 
				-			task->handles[1] = starpu_data_get_sub_data(B_handle, 1, taskx);
			
 
				-
			
 
				-			/* 2 filters were applied on matrix C, so we give
			
 
				-			 * starpu_data_get_sub_data 2 arguments. The order of the arguments
			
 
				-			 * must match the order in which the filters were
			
 
				-			 * applied.
			
 
				-			 * NB: starpu_data_get_sub_data(C_handle, 1, k) would have returned
			
 
				-			 * a handle to the column number k of matrix C.
			
 
				-			 * NB2: starpu_data_get_sub_data(C_handle, 2, taskx, tasky) is
			
 
				-			 * equivalent to
			
 
				-			 * starpu_data_get_sub_data(starpu_data_get_sub_data(C_handle, 1, taskx), 1, tasky)*/
			
 
				-			task->handles[2] = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
			
 
				-
			
 
				-			/* this is not a blocking call since task->synchronous = 0 */
			
 
				-			ret = starpu_task_submit(task);
			
 
				-			if (ret == -ENODEV) return ret;
			
 
				-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				-		}
			
 
				-	}
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-int main(STARPU_ATTRIBUTE_UNUSED int argc, 
			
 
				-	 STARPU_ATTRIBUTE_UNUSED char **argv)
			
 
				-{
			
 
				-	int ret;
			
 
				-
			
 
				-	/* start the runtime */
			
 
				-	ret = starpu_init(NULL);
			
 
				-	if (ret == -ENODEV)
			
 
				-		return 77;
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-
			
 
				-	/* initialize matrices A, B and C and register them to StarPU */
			
 
				-	init_problem_data();
			
 
				-
			
 
				-	/* partition matrices into blocks that can be manipulated by the
			
 
				- 	 * codelets */
			
 
				-	partition_mult_data();
			
 
				-
			
 
				-	/* submit all tasks in an asynchronous fashion */
			
 
				-	ret = launch_tasks();
			
 
				-	if (ret == -ENODEV) goto enodev;
			
 
				-
			
 
				-	/* wait for termination */
			
 
				-        starpu_task_wait_for_all();
			
 
				-
			
 
				-	/* remove the filters applied by the means of starpu_data_map_filters; now
			
 
				- 	 * it's not possible to manipulate a subset of C using starpu_data_get_sub_data until
			
 
				-	 * starpu_data_map_filters is called again on C_handle.
			
 
				-	 * The second argument is the memory node where the different subsets
			
 
				-	 * should be reassembled, 0 = main memory (RAM) */
			
 
				-	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
			
 
				-	starpu_data_unpartition(B_handle, STARPU_MAIN_RAM);
			
 
				-	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
			
 
				-
			
 
				-	/* stop monitoring matrix C : after this, it is not possible to pass C 
			
 
				-	 * (or any subset of C) as a codelet input/output. This also implements
			
 
				-	 * a barrier so that the piece of data is put back into main memory in
			
 
				-	 * case it was only available on a GPU for instance. */
			
 
				-	starpu_data_unregister(A_handle);
			
 
				-	starpu_data_unregister(B_handle);
			
 
				-	starpu_data_unregister(C_handle);
			
 
				-
			
 
				-	free(A);
			
 
				-	free(B);
			
 
				-	free(C);
			
 
				-
			
 
				-	starpu_shutdown();
			
 
				-
			
 
				-	return 0;
			
 
				-
			
 
				-enodev:
			
 
				-	starpu_shutdown();
			
 
				-	return 77;
			
 
				-}
			
 
				-
			
--- a/examples/mlr/mlr.c
+++ b/examples/mlr/mlr.c
@@ -185,7 +185,7 @@ int main(void)
 
				 		vector_mn[1] = n;
			
 
				 		starpu_data_release(vector_mn_handle);
			
 
				 
			
 
				-		for (j = 0; j < 42; j++)
			
 
				+		for (j = 0; j < 1000; j++)
			
 
				 		{
			
 
				 			starpu_insert_task(&cl_init,
			
 
				 					   STARPU_R, vector_mn_handle,
			
@@ -202,5 +202,11 @@ int main(void)
 
				 	free(vector_mn);
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				+	ret = starpu_init(NULL);
			
 
				+	if (ret == -ENODEV)
			
 
				+		return 77;
			
 
				+	starpu_perfmodel_dump_xml(stdout, &cl_model_final);
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
--- a/include/starpu_clusters.h
+++ b/include/starpu_clusters.h
@@ -122,7 +122,7 @@ struct starpu_cluster_machine* starpu_cluster_machine(hwloc_obj_type_t cluster_l
 
				 int starpu_uncluster_machine(struct starpu_cluster_machine* clusters);
			
 
				 int starpu_cluster_print(struct starpu_cluster_machine* clusters);
			
 
				 
			
 
				-/* Prologue functions */
			
 
				+/** Prologue functions */
			
 
				 void starpu_openmp_prologue(void*);
			
 
				 #define starpu_intel_openmp_mkl_prologue starpu_openmp_prologue
			
 
				 #ifdef STARPU_MKL
			
--- a/include/starpu_config.h.in
+++ b/include/starpu_config.h.in
@@ -78,6 +78,12 @@
 
				    @ingroup API_MPI_Support
			
 
				 */
			
 
				 #undef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+
			
 
				+/**
			
 
				+   Defined when StarPU has been installed with FPGA support. It should
			
 
				+   be used in your code to detect the availability of FPGA.
			
 
				+   @ingroup API_FPGA_Extensions
			
 
				+*/
			
 
				 #undef STARPU_USE_FPGA
			
 
				 
			
 
				 /**
			
@@ -212,11 +218,16 @@
 
				 #undef STARPU_MAXNUMANODES
			
 
				 
			
 
				 /**
			
 
				- * Define the maximum number of CUDA devices that are supported by StarPU.
			
 
				- * @ingroup API_CUDA_Extensions
			
 
				- */
			
 
				+   Define the maximum number of CUDA devices that are supported by StarPU.
			
 
				+   @ingroup API_CUDA_Extensions
			
 
				+*/
			
 
				 #undef STARPU_MAXCUDADEVS
			
 
				 
			
 
				+/**
			
 
				+   Define the maximum number of FPGA devices that are supported by
			
 
				+   StarPU.
			
 
				+   @ingroup API_FPGA_Extensions
			
 
				+ */
			
 
				 #undef STARPU_MAXFPGADEVS
			
 
				 
			
 
				 /**
			
@@ -307,10 +318,10 @@ typedef ssize_t starpu_ssize_t;
 
				 #undef STARPU_PTHREAD_COND_INITIALIZER_ZERO
			
 
				 #undef STARPU_PTHREAD_RWLOCK_INITIALIZER_ZERO
			
 
				 
			
 
				-/* This is only for building examples */
			
 
				+/** This is only for building examples */
			
 
				 #undef STARPU_HAVE_HELGRIND_H
			
 
				 
			
 
				-/* Enable Fortran to C MPI interface */
			
 
				+/** Enable Fortran to C MPI interface */
			
 
				 #undef  HAVE_MPI_COMM_F2C
			
 
				 
			
 
				 #undef STARPU_HAVE_DARWIN
			
--- a/include/starpu_data.h
+++ b/include/starpu_data.h
@@ -115,7 +115,9 @@ enum starpu_data_access_mode
 
				 
			
 
				 struct starpu_data_interface_ops;
			
 
				 
			
 
				-/** Set the name of the data, to be shown in various profiling tools. */
			
 
				+/**
			
 
				+   Set the name of the data, to be shown in various profiling tools.
			
 
				+*/
			
 
				 void starpu_data_set_name(starpu_data_handle_t handle, const char *name);
			
 
				 
			
 
				 /**
			
--- a/include/starpu_fpga.h
+++ b/include/starpu_fpga.h
@@ -33,37 +33,36 @@ extern "C"
 
				 */
			
 
				 
			
 
				 //int starpu_fpga_allocate_memory(fpga_mem *addr, size_t size);
			
 
				-typedef struct data_exchange_log
			
 
				+
			
 
				+struct starpu_fpga_data_trans
			
 
				 {
			
 
				         int size;
			
 
				         float time;
			
 
				-} fpga_trans;
			
 
				+};
			
 
				 
			
 
				-typedef struct properties
			
 
				+struct starpu_fpga_device_properties
			
 
				 {
			
 
				         int totalGlobalMem;
			
 
				         int concurrentKernels;
			
 
				         char *name;
			
 
				-} fpgaDeviceProp;
			
 
				+};
			
 
				 
			
 
				 /**
			
 
				    get device properties
			
 
				  */
			
 
				+int starpu_fpga_get_device_properties(struct starpu_fpga_device_properties *properties, unsigned devid);
			
 
				 
			
 
				-int fpgaGetDeviceProperties(fpgaDeviceProp *,unsigned devid);
			
 
				-
			
 
				-void fpga_report_configuration(void);
			
 
				+void starpu_fpga_report_configuration(void);
			
 
				 
			
 
				 /**
			
 
				    set fpga device
			
 
				  */
			
 
				-int fpgaSetDevice(unsigned devid);
			
 
				+int starpu_fpga_set_device(unsigned devid);
			
 
				 
			
 
				 /**
			
 
				    fpga is silent
			
 
				  */
			
 
				-
			
 
				-int fpga_is_silent();
			
 
				+int starpu_fpga_is_silent();
			
 
				 
			
 
				 /** @} */
			
 
				 
			
--- a/src/core/perfmodel/multiple_regression.c
+++ b/src/core/perfmodel/multiple_regression.c
@@ -329,7 +329,7 @@ int _starpu_multiple_regression(struct starpu_perfmodel_history_list *ptr, doubl
 
				 		/* Basic validation of the model accuracy */
			
 
				 		starpu_validate_mlr(coeff, ncoeff, codelet_name);
			
 
				 #else
			
 
				-		_STARPU_DISP("Warning: StarPU was compiled with '--disable-mlr' option or on Windows machine, thus multiple linear regression model will not be computed.\n");
			
 
				+		_STARPU_DISP("Warning: StarPU was compiled without '--enable-mlr' option, thus multiple linear regression model will not be computed.\n");
			
 
				 		for(i=0; i<ncoeff; i++)
			
 
				 			coeff[i] = 0.;
			
 
				 #endif //STARPU_MLR_MODEL
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -1010,6 +1010,8 @@ static void dump_per_arch_model_xml(FILE *f, struct starpu_perfmodel *model, int
 
				 
			
 
				 void starpu_perfmodel_dump_xml(FILE *f, struct starpu_perfmodel *model)
			
 
				 {
			
 
				+	_starpu_init_and_load_perfmodel(model);
			
 
				+
			
 
				 	fprintf(f, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
			
 
				 	fprintf(f, "<!DOCTYPE StarPUPerfmodel SYSTEM \"starpu-perfmodel.dtd\">\n");
			
 
				 	fprintf(f, "<!-- symbol %s -->\n", model->symbol);
			
@@ -1496,6 +1498,8 @@ int starpu_perfmodel_load_file(const char *filename, struct starpu_perfmodel *mo
 
				 
			
 
				 	if (ret)
			
 
				 		starpu_perfmodel_unload_model(model);
			
 
				+	else
			
 
				+		model->is_loaded = 1;
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -1611,7 +1615,7 @@ docal:
 
				 		char archname[STR_SHORT_LENGTH];
			
 
				 
			
 
				 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
			
 
				-		_STARPU_DISP("Warning: model %s is not calibrated enough for %s size %lu (only %u measurements from size %lu to %lu), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, (unsigned long) size, regmodel?regmodel->nsample:0, regmodel?regmodel->minx:0, regmodel?regmodel->maxx:0);
			
 
				+		_STARPU_DISP("Warning: model %s is not calibrated enough for %s size %lu (only %u measurements from size %lu to %lu), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this. You probably need to run again to continue calibrating the model, until this warning disappears.\n", model->symbol, archname, (unsigned long) size, regmodel?regmodel->nsample:0, regmodel?regmodel->minx:0, regmodel?regmodel->maxx:0);
			
 
				 		_starpu_set_calibrate_flag(1);
			
 
				 		model->benchmarking = 1;
			
 
				 	}
			
@@ -1664,7 +1668,7 @@ docal:
 
				 			char archname[STR_SHORT_LENGTH];
			
 
				 
			
 
				 			starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
			
 
				-			_STARPU_DISP("Warning: model %s is not calibrated enough for %s size %lu (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, (unsigned long) size, entry && entry->history_entry ? entry->history_entry->nsample : 0);
			
 
				+			_STARPU_DISP("Warning: model %s is not calibrated enough for %s size %lu (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this. You probably need to run again to continue calibrating the model, until this warning disappears.\n", model->symbol, archname, (unsigned long) size, entry && entry->history_entry ? entry->history_entry->nsample : 0);
			
 
				 			_starpu_set_calibrate_flag(1);
			
 
				 			model->benchmarking = 1;
			
 
				 		}
			
@@ -1711,7 +1715,7 @@ docal:
 
				 		char archname[STR_SHORT_LENGTH];
			
 
				 
			
 
				 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
			
 
				-		_STARPU_DISP("Warning: model %s is not calibrated enough for %s, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname);
			
 
				+		_STARPU_DISP("Warning: model %s is not calibrated enough for %s, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this. You probably need to run again to continue calibrating the model, until this warning disappears.\n", model->symbol, archname);
			
 
				 		_starpu_set_calibrate_flag(1);
			
 
				 		model->benchmarking = 1;
			
 
				 	}
			
@@ -1794,7 +1798,7 @@ docal:
 
				 		char archname[STR_SHORT_LENGTH];
			
 
				 
			
 
				 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
			
 
				-		_STARPU_DISP("Warning: model %s is not calibrated enough for %s size %ld footprint %x (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, j->task?(long int)_starpu_job_get_data_size(model, arch, nimpl, j):-1, key, entry ? entry->nsample : 0);
			
 
				+		_STARPU_DISP("Warning: model %s is not calibrated enough for %s size %ld footprint %x (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this. You probably need to run again to continue calibrating the model, until this warning disappears.\n", model->symbol, archname, j->task?(long int)_starpu_job_get_data_size(model, arch, nimpl, j):-1, key, entry ? entry->nsample : 0);
			
 
				 		_starpu_set_calibrate_flag(1);
			
 
				 		model->benchmarking = 1;
			
 
				 	}
			
--- a/src/core/perfmodel/perfmodel_print.c
+++ b/src/core/perfmodel/perfmodel_print.c
@@ -172,6 +172,7 @@ void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmo
 
				 
			
 
				 int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output)
			
 
				 {
			
 
				+	_starpu_init_and_load_perfmodel(model);
			
 
				 	if (arch == NULL)
			
 
				 	{
			
 
				 		int comb, impl;
			
--- a/src/drivers/max/driver_fpga.c
+++ b/src/drivers/max/driver_fpga.c
@@ -47,7 +47,7 @@
 
				 //#define STARPU_MAXFPGADEVS 4
			
 
				 /* the number of FPGA devices */
			
 
				 static unsigned  nfpgafpgas = -1;
			
 
				-static fpgaDeviceProp props[STARPU_MAXFPGADEVS];
			
 
				+static struct starpu_fpga_device_properties props[STARPU_MAXFPGADEVS];
			
 
				 static size_t global_mem[STARPU_MAXFPGADEVS] = { 128ULL*1024*1024*1024 };
			
 
				 
			
 
				 static void _starpu_fpga_limit_global_mem(unsigned );
			
@@ -72,9 +72,9 @@ void _starpu_init_fpga()
 
				 #if 0
			
 
				 int fpga_allocate_memory(fpga_mem *ptr, size_t size)
			
 
				 {
			
 
				-//This allocates BYTES
			
 
				+	//This allocates BYTES
			
 
				 	char *msg1="You asked to allocate ";
			
 
				-//	printf(KCYN "%s%d*%d\n" KBLU, msg1,size,sizeof(unsigned));
			
 
				+	//printf(KCYN "%s%d*%d\n" KBLU, msg1,size,sizeof(unsigned));
			
 
				 	printf(FPGA_OK "%s%lu bytes\n" NORMAL, msg1,size);
			
 
				 
			
 
				 	*ptr =(fpga_mem) malloc(size);
			
@@ -86,9 +86,9 @@ int fpga_allocate_memory(fpga_mem *ptr, size_t size)
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-int fpgaGetDeviceProperties(fpgaDeviceProp *props,unsigned devid)
			
 
				+int starpu_fpga_get_device_properties(struct starpu_fpga_device_properties *props, unsigned devid)
			
 
				 {
			
 
				-//TODO
			
 
				+	//TODO
			
 
				         props->totalGlobalMem=1*1024*1024;
			
 
				         props->concurrentKernels=4;
			
 
				         props->name="Fpga_Props_Name";
			
@@ -111,7 +111,7 @@ static void _starpu_fpga_limit_global_mem(unsigned devid)
 
				 {
			
 
				 	starpu_ssize_t limit=-1;
			
 
				 
			
 
				-//TODO
			
 
				+	//TODO
			
 
				 	limit = starpu_get_env_number("STARPU_LIMIT_FPGA_MEM");
			
 
				 	if(limit != -1)
			
 
				 		global_mem[devid] = limit*1024*1024;
			
@@ -124,35 +124,33 @@ static size_t _starpu_fpga_get_global_mem_size(unsigned devid)
 
				 
			
 
				 static void init_fpga_worker_context(unsigned workerid)
			
 
				 {
			
 
				-//		starpu_fpgaStreamCreate(&streams[devid][i]);
			
 
				+	//starpu_fpgaStreamCreate(&streams[devid][i]);
			
 
				 }
			
 
				 
			
 
				 static void init_device_context(unsigned devid)
			
 
				 {
			
 
				-//	int workerid;
			
 
				 	unsigned i;
			
 
				-//TODO: fpgaSetDevice
			
 
				-	fpgaSetDevice(devid);
			
 
				-
			
 
				-//TODO: fpgaGetDeviceProperties
			
 
				-	fpgaGetDeviceProperties(&props[devid], devid);
			
 
				-//TODO: Do we need the streams? I think no
			
 
				-//	cures = starpu_fpgaStreamCreate(&in_transfer_streams[devid]);
			
 
				-//	cures = starpu_fpgaStreamCreate(&out_transfer_streams[devid]);
			
 
				+	//TODO: starpu_fpga_set_device
			
 
				+	starpu_fpga_set_device(devid);
			
 
				+
			
 
				+	starpu_fpga_get_device_properties(&props[devid], devid);
			
 
				+	//TODO: Do we need the streams? I think no
			
 
				+	//cures = starpu_fpgaStreamCreate(&in_transfer_streams[devid]);
			
 
				+	//cures = starpu_fpgaStreamCreate(&out_transfer_streams[devid]);
			
 
				 	for (i = 0; i < nfpgafpgas; i++)
			
 
				 	{
			
 
				-//		starpu_fpgaStreamCreate(&in_peer_transfer_streams[i][devid]);
			
 
				-//		starpu_fpgaStreamCreate(&out_peer_transfer_streams[devid][i]);
			
 
				+		//starpu_fpgaStreamCreate(&in_peer_transfer_streams[i][devid]);
			
 
				+		//starpu_fpgaStreamCreate(&out_peer_transfer_streams[devid][i]);
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				 int _starpu_fpga_driver_init(struct _starpu_worker *worker)
			
 
				 {
			
 
				 	int devid = worker->devid;
			
 
				-//fpga_msg("successful till here");
			
 
				+	//fpga_msg("successful till here");
			
 
				 	_starpu_driver_start(worker, _STARPU_FUT_CPU_KEY, 1);
			
 
				 	/* FIXME: when we have NUMA support, properly turn node number into NUMA node number */
			
 
				-// TODO: drop test when we allocated a memory node for fpga
			
 
				+	// TODO: drop test when we allocated a memory node for fpga
			
 
				 	if (worker->memory_node != STARPU_MAIN_RAM)
			
 
				 		_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_fpga_get_global_mem_size(worker->devid));
			
 
				 
			
@@ -199,7 +197,7 @@ static int execute_job_on_fpga(struct _starpu_job *j, struct starpu_task *worker
 
				 	{
			
 
				 		_starpu_cl_func_t func = _starpu_task_get_fpga_nth_implementation(cl, j->nimpl);
			
 
				 		//char *kernel_type = _starpu_task_get_fpga_kernel_type_nth_implementation(cl, j->nimpl);
			
 
				-//printf("chanel reserved: %d \n",chnl);
			
 
				+		//printf("chanel reserved: %d \n",chnl);
			
 
				 
			
 
				 		STARPU_ASSERT_MSG(func, "when STARPU_FPGA is defined in 'where', fpga_func or fpga_funcs has to be defined");
			
 
				 		if (_starpu_get_disable_kernels() <= 0)
			
@@ -369,10 +367,10 @@ uintptr_t _starpu_fpga_allocate_memory(unsigned dst_node, size_t size, int flags
 
				 	STARPU_ASSERT(devid == 0); // For now
			
 
				 
			
 
				 	/* 0 would be seen as NULL, i.e. allocation failed... */
			
 
				-// FIXME: Maxeler FPGAs want 192-byte alignment
			
 
				+	// FIXME: Maxeler FPGAs want 192-byte alignment
			
 
				 	static fpga_mem current_address = 8192*192;
			
 
				 	fpga_mem addr;
			
 
				-// TODO: vérifier si current_address + size > taille de la LMEm
			
 
				+	// TODO: vérifier si current_address + size > taille de la LMEm
			
 
				  	addr = current_address;
			
 
				 	current_address += size;
			
 
				 	printf("fpga mem returned from allocation @: %p - %p\n",addr, addr + size);
			
--- a/src/drivers/max/driver_fpga.h
+++ b/src/drivers/max/driver_fpga.h
@@ -36,7 +36,6 @@ typedef unsigned * fpga_mem;
 
				 extern struct _starpu_driver_ops _starpu_driver_fpga_ops;
			
 
				 extern struct _starpu_node_ops _starpu_driver_fpga_node_ops;
			
 
				 
			
 
				-int fpgaSetDevice(unsigned devid);
			
 
				 void _starpu_init_fpga(void);
			
 
				 #ifdef STARPU_USE_FPGA
			
 
				 void _starpu_fpga_discover_devices (struct _starpu_machine_config *config);