Browse Source

Merge branch 'fpga' of gitlab.inria.fr:starpu/starpu into fpga

Nathalie Furmento 4 years ago
parent
commit
80265025c0

+ 35 - 34
configure.ac

@@ -114,28 +114,25 @@ fi
 ###############################################################################
 
 #with or without automatic data transfers
-AC_MSG_CHECKING(Automatic Data transfers for Fpga driver)
-AC_ARG_ENABLE(autofpga, [AS_HELP_STRING([--enable-autofpga=<number>],
-			[set 0 to disable])],
-			autofpga=$enableval, autofpga=1)
+AC_MSG_CHECKING(automatic data transfers for FPGA driver)
+AC_ARG_ENABLE(autofpga, [AS_HELP_STRING([--disable-autofpga],
+			[disable automatic data transfers for FPGA driver])],
+			autofpga=$enableval, autofpga=yes)
 AC_MSG_RESULT($autofpga)
-
-AC_DEFINE_UNQUOTED(STARPU_AUTOFPGA, [$autofpga],
-		[automatic data transfer for Fpga])
+if test x$autofpga = xyes ; then
+   AC_DEFINE(STARPU_AUTOFPGA, [1], [automatic data transfer for Fpga])
+fi
 
 #NUMBER OF FPGA DEVICES
-AC_MSG_CHECKING(maximum number of Fpga devices)
+AC_MSG_CHECKING(maximum number of FPGA devices)
 AC_ARG_ENABLE(maxfpgadev, [AS_HELP_STRING([--enable-maxfpgadev=<number>],
 			[maximum number of FPGA devices])],
 			nmaxfpgadev=$enableval, nmaxfpgadev=12)
 AC_MSG_RESULT($nmaxfpgadev)
+AC_DEFINE_UNQUOTED(STARPU_MAXFPGADEVS, [$nmaxfpgadev],[maximum number of FPGA devices])
 
-#nmaxfpgadev=60
-AC_DEFINE_UNQUOTED(STARPU_MAXFPGADEVS, [$nmaxfpgadev],
-		[maximum number of Fpga devices])
-		
 AC_ARG_WITH([fpga],
-	[AS_HELP_STRING([--with-fpga=<path>],[specify where Fpga lib is installed])],
+	[AS_HELP_STRING([--with-fpga=<path>],[specify where FPGA lib is installed])],
 	[fpga_dir="$withval"
 	enable_fpga=yes],
 	[enable_fpga=no]
@@ -146,17 +143,8 @@ AC_ARG_ENABLE(link-with-riffa, [AS_HELP_STRING([--disable-link-with-riffa],
 AC_ARG_ENABLE(link-with-maxeler, [AS_HELP_STRING([--disable-link-with-maxeler],
 	      [link with MAXELER])], [link_with_maxeler=$enableval], [link_with_maxeler=yes])
 
-
-AC_MSG_CHECKING(whether Fpga should be used)
-AC_MSG_RESULT($enable_fpga)
-AC_SUBST(STARPU_USE_FPGA,$enable_fpga)
-AM_CONDITIONAL(STARPU_USE_FPGA,test x$enable_fpga = xyes)
-if test x$enable_fpga = xyes; then
-   	AC_DEFINE(STARPU_USE_FPGA,[1],[Trying to set STARPU_USE_FPGA])
-fi
-
-if test x$enable_fpga = xyes; then
-
+if test x$enable_fpga = xyes
+then
    	fpga_include_dir="${fpga_dir}/include"
 	fpga_lib_dir="${fpga_dir}/lib"
 
@@ -170,7 +158,7 @@ if test x$enable_fpga = xyes; then
 	   STARPU_FPGA_CPPFLAGS="`slic-config --cflags | sed s/\'//g | sed "s/-I /-I/"` $STARPU_FPGA_CPPFLAGS"
    	   STARPU_FPGA_LDFLAGS="`slic-config --libs | sed s/\'//g | sed "s/-L /-L/" | sed "s/-L /-L/"`"
 	else
-   	   STARPU_FPGA_LDFLAGS="-L$fpga_lib_dir -lfpga -lrt -lm"	
+   	   STARPU_FPGA_LDFLAGS="-L$fpga_lib_dir -lfpga -lrt -lm"
 	fi
 
 	CPPFLAGS="${CPPFLAGS} ${STARPU_FPGA_CPPFLAGS} "
@@ -185,19 +173,32 @@ if test x$enable_fpga = xyes; then
 			[have_valid_fpga="yes"],
 			[have_valid_fpga="no"]
 			)
-		AC_MSG_CHECKING(whether Fpga is working)
-		if test x$have_valid_fpga = xyes; then
-	   	   	AC_MSG_RESULT([:-)])
-	   	else
-			AC_MSG_RESULT([;(])
-		fi
-
-		LDFLAGS="${SAVED_LDFLAGS}"
-		CPPFLAGS="${SAVED_CPPFLAGS}"
+		AC_MSG_CHECKING(whether FPGA is working)
+		AC_MSG_RESULT($have_valid_fpga)
 
 		AC_SUBST(STARPU_FPGA_CPPFLAGS)
 		AC_SUBST(STARPU_FPGA_LDFLAGS)
+	else
+		AC_MSG_CHECKING(whether FPGA is installed)
+		AC_MSG_RESULT([no])
 	fi
+	LDFLAGS="${SAVED_LDFLAGS}"
+	CPPFLAGS="${SAVED_CPPFLAGS}"
+
+	# in case FPGA was explicitly required, but is not available, this is an error
+	if test x$enable_fpga = xyes -a x$have_valid_fpga = xno; then
+		AC_MSG_ERROR([cannot find FPGA])
+    	fi
+	# now we enable FPGA if and only if a proper setup is available
+	enable_fpga=$have_valid_fpga
+fi
+
+AC_MSG_CHECKING(whether FPGA should be used)
+AC_MSG_RESULT($enable_fpga)
+AC_SUBST(STARPU_USE_FPGA,$enable_fpga)
+AM_CONDITIONAL(STARPU_USE_FPGA,test x$enable_fpga = xyes)
+if test x$enable_fpga = xyes; then
+   	AC_DEFINE(STARPU_USE_FPGA,[1],[FPGA support is activated])
 fi
 
 

+ 0 - 1
doc/doxygen/Makefile.am

@@ -106,7 +106,6 @@ chapters =	\
 	chapters/code/disk_compute.c \
 	chapters/code/nf_initexit.f90 \
 	chapters/api/fft_support.doxy \
-	chapters/api/fpga_extensions.doxy \
 	chapters/api/versioning.doxy \
 	chapters/api/threads.doxy
 

+ 118 - 52
doc/doxygen/chapters/440_fpga_support.doxy

@@ -17,15 +17,34 @@
 /*! \page FPGASupport FPGA Support
 
 \section Introduction Introduction
-Maxeler provides hardware and software solutions for accelerating computing applications on dataflow engines (DFEs). DFEs are in-house designed accelerators that encapsulate reconfigurable high-end FPGAs at their core and are equipped with large amounts of DDR memory.
-We extend the StarPU task programming library that initially targets heterogeneous architectures to support Field Programmable Gate Array (FPGA). 
-To create <c>StarPU/FPGA</c> applications exploiting DFE configurations, MaxCompiler allows an application to be split into three parts:
 
-- <c>Kernel</c>, which implements the computational components of the application in hardware.
-- <c>Manager configuration</c>, which connects Kernels to the CPU, engine RAM, other Kernels and other DFEs via MaxRing.
-- <c>CPU application</c>, which interacts with the DFEs to read and write data to the Kernels and engine RAM.
-
-The Simple Live CPU interface (SLiC) is Maxeler’s application programming interface for seamless CPU-DFE integration. SLiC allows CPU applications to configure and load a number of DFEs as well as to subsequently schedule and run actions on those DFEs using simple function calls. In StarPU/FPGA applications, we use <c>Dynamic SLiC Interface</c> to exchange data streams between the CPU (Main Memory) and DFE (Local Memory).
+Maxeler provides hardware and software solutions for accelerating
+computing applications on dataflow engines (DFEs). DFEs are in-house
+designed accelerators that encapsulate reconfigurable high-end FPGAs
+at their core and are equipped with large amounts of DDR memory.
+
+We extend the StarPU task programming library, which initially targets
+heterogeneous architectures, to support Field Programmable Gate Arrays
+(FPGAs).
+
+To create <c>StarPU/FPGA</c> applications exploiting DFE
+configurations, MaxCompiler allows an application to be split into
+three parts:
+
+- <c>Kernel</c>, which implements the computational components of the
+  application in hardware.
+- <c>Manager configuration</c>, which connects Kernels to the CPU,
+  engine RAM, other Kernels and other DFEs via MaxRing.
+- <c>CPU application</c>, which interacts with the DFEs to read and
+  write data to the Kernels and engine RAM.
+
+The Simple Live CPU interface (SLiC) is Maxeler’s application
+programming interface for seamless CPU-DFE integration. SLiC allows
+CPU applications to configure and load a number of DFEs as well as to
+subsequently schedule and run actions on those DFEs using simple
+function calls. In StarPU/FPGA applications, we use <em>Dynamic SLiC
+Interface</em> to exchange data streams between the CPU (Main Memory)
+and DFE (Local Memory).
 
 \section PortingApplicationsToFPGA Porting Applications to FPGA
 
@@ -43,12 +62,22 @@ struct starpu_codelet cl =
 
 \subsection FPGAExample StarPU/FPGA Application
 
-To give you an idea of the interface that we used to exchange data between <c>host</c> (CPU) and <c>FPGA</c> (DFE), here is an example, based on one of the examples of Maxeler (https://trac.version.fz-juelich.de/reconfigurable/wiki/Public). 
-<c>StreamFMAKernel.maxj</c> represents the Java kernel code; it implements a very simple kernel (c=a+b), and <c>Test.c</c> starts it from the <c>fpga_add</c> function; it first sets streaming up from the CPU pointers, triggers execution and waits for the result. The API to interact with DFEs is called <c>SLiC</c> which then also involves the <c> MaxelerOS</c> runtime.
+To give you an idea of the interface that we used to exchange data
+between <c>host</c> (CPU) and <c>FPGA</c> (DFE), here is an example,
+based on one of the examples of Maxeler
+(https://trac.version.fz-juelich.de/reconfigurable/wiki/Public).
 
+<c>StreamFMAKernel.maxj</c> represents the Java kernel code; it
+implements a very simple kernel (<c>c=a+b</c>), and <c>Test.c</c> starts it
+from the <c>fpga_add</c> function; it first sets streaming up from the
+CPU pointers, triggers execution and waits for the result. The API to
+interact with DFEs is called <em>SLiC</em> which then also involves the
+<c>MaxelerOS</c> runtime.
 
-- <c>StreamFMAKernel.maxj</c>: the DFE part is described in the MaxJ programming language which is a Java-based metaprogramming approach.
-\code{.c}
+- <c>StreamFMAKernel.maxj</c>: the DFE part is described in the MaxJ
+  programming language which is a Java-based metaprogramming approach.
+
+\code{.java}
 package tests;
 
 import com.maxeler.maxcompiler.v2.kernelcompiler.Kernel;
@@ -56,11 +85,13 @@ import com.maxeler.maxcompiler.v2.kernelcompiler.KernelParameters;
 import com.maxeler.maxcompiler.v2.kernelcompiler.types.base.DFEType;
 import com.maxeler.maxcompiler.v2.kernelcompiler.types.base.DFEVar;
 
-class StreamFMAKernel extends Kernel {
+class StreamFMAKernel extends Kernel
+{
 
    private static final DFEType type = dfeInt(32);
 
-   protected StreamFMAKernel(KernelParameters parameters) {
+   protected StreamFMAKernel(KernelParameters parameters)
+   {
              super(parameters);
 
 	     DFEVar a = io.input("a", type);
@@ -70,25 +101,27 @@ class StreamFMAKernel extends Kernel {
 	     c = a+b;
 
 	     io.output("output", c, type);
-	}
-
+   }
 }
-
 \endcode
 
-- <c>StreamFMAManager.maxj</c>: is also described in the MaxJ programming language and orchestrates data movement between the host and the DFE.
-\code{.c}
+- <c>StreamFMAManager.maxj</c>: is also described in the MaxJ
+  programming language and orchestrates data movement between the host
+  and the DFE.
+
+\code{.java}
 package tests;
 
 import com.maxeler.maxcompiler.v2.build.EngineParameters;
 import com.maxeler.maxcompiler.v2.managers.custom.blocks.KernelBlock;
 import com.maxeler.platform.max5.manager.Max5LimaManager;
 
-class StreamFMAManager extends Max5LimaManager {
-
+class StreamFMAManager extends Max5LimaManager
+{
 	private static final String kernel_name = "StreamFMAKernel";
 
-	public StreamFMAManager(EngineParameters arg0) {
+	public StreamFMAManager(EngineParameters arg0)
+	{
 		super(arg0);
 		KernelBlock kernel = addKernel(new StreamFMAKernel(makeKernelParameters(kernel_name)));
 		kernel.getInput("a") <== addStreamFromCPU("a");
@@ -96,41 +129,54 @@ class StreamFMAManager extends Max5LimaManager {
 		addStreamToCPU("output") <== kernel.getOutput("output");
 	}
 
-	public static void main(String[] args) {
+	public static void main(String[] args)
+	{
 		StreamFMAManager manager = new StreamFMAManager(new EngineParameters(args));
 		manager.build();
 	}
 }
 \endcode
 
-Once <c>StreamFMAKernel.maxj</c> and <c>StreamFMAManager.maxj</c> are written, there are other steps to do:
+Once <c>StreamFMAKernel.maxj</c> and <c>StreamFMAManager.maxj</c> are
+written, there are other steps to do:
 
 - Building the JAVA program: (for Kernel and Manager (.maxj))
 \verbatim
 $ maxjc -1.7 -cp $MAXCLASSPATH streamfma/
 \endverbatim
 
-- Running the Java program to generate a DFE implementation (a .max file) that can be called from a StarPU/FPGA application and slic headers (.h) for simulation:
+- Running the Java program to generate a DFE implementation (a .max
+  file) that can be called from a StarPU/FPGA application and slic
+  headers (.h) for simulation:
+
 \verbatim
 $ java -XX:+UseSerialGC -Xmx2048m -cp $MAXCLASSPATH:. streamfma.StreamFMAManager DFEModel=MAIA maxFileName=StreamFMA target=DFE_SIM
 \endverbatim
 
-- Build the slic object file (simulation): 
+- Build the slic object file (simulation):
+
 \verbatim
 $ sliccompile StreamFMA.max
 \endverbatim
 
 - <c>Test.c </c>:
-to interface StarPU task-based runtime system with Maxeler's DFE devices, we use the advanced dynamic interface of <c>SLiC</c> in <b>non_blocking</b> mode.  
-Test code must include <c>MaxSLiCInterface.h</c> and <c>MaxFile.h</c>. The .max file contains the bitstream. The StarPU/FPGA application can be written in C, C++, etc.
+
+To interface the StarPU task-based runtime system with Maxeler's DFE
+devices, we use the advanced dynamic interface of <em>SLiC</em> in
+<b>non_blocking</b> mode.
+
+Test code must include <c>MaxSLiCInterface.h</c> and <c>MaxFile.h</c>.
+The .max file contains the bitstream. The StarPU/FPGA application can
+be written in C, C++, etc.
+
 \code{.c}
 #include "StreamFMA.h"
 #include "MaxSLiCInterface.h"
 
 void fpga_add(void *buffers[], void *cl_arg)
-{   
+{
     (void)cl_arg;
-    
+
     int *a = (int*) STARPU_VECTOR_GET_PTR(buffers[0]);
     int *b = (int*) STARPU_VECTOR_GET_PTR(buffers[1]);
     int *c = (int*) STARPU_VECTOR_GET_PTR(buffers[2]);
@@ -142,11 +188,11 @@ void fpga_add(void *buffers[], void *cl_arg)
 
     /* set the number of ticks for a kernel */
     max_set_ticks  (act, "StreamFMAKernel", size);
-    
+
     /* send input streams */
-    max_queue_input(act, "a", a, size *sizeof(a[0])); 
+    max_queue_input(act, "a", a, size *sizeof(a[0]));
     max_queue_input(act, "b", b, size*sizeof(b[0]));
-    
+
     /* store output stream */
     max_queue_output(act,"output", c, size*sizeof(c[0]));
 
@@ -158,7 +204,6 @@ void fpga_add(void *buffers[], void *cl_arg)
 
     printf("*** wait for the actions on DFE to complete *** \n");
     max_wait(run0);
-     
   }
 
   static struct starpu_codelet cl =
@@ -172,14 +217,13 @@ void fpga_add(void *buffers[], void *cl_arg)
 
 int main(int argc, char **argv)
 {
- 
     ...
 
     /* Implementation of a maxfile */
     max_file_t *maxfile = StreamFMA_init();
 
     /* Implementation of an engine */
-    max_engine_t *engine = max_load(maxfile, "*"); 
+    max_engine_t *engine = max_load(maxfile, "*");
 
     starpu_init(NULL);
 
@@ -192,19 +236,26 @@ int main(int argc, char **argv)
 
     /* unload and deallocate an engine obtained by way of max_load */
     max_unload(engine);
-    
+
     return 0;
 }
 \endcode
 
-To write the StarPU/FPGA application: first, the programmer must describe the codelet using StarPU’s C API. This codelet provides both a CPU implementation and an FPGA one. It also specifies that the task has two inputs and one output through the <c>nbuffers</c> and <c>modes</c> attributes.
+To write the StarPU/FPGA application: first, the programmer must
+describe the codelet using StarPU’s C API. This codelet provides both
+a CPU implementation and an FPGA one. It also specifies that the task
+has two inputs and one output through the starpu_codelet::nbuffers and
+starpu_codelet::modes attributes.
 
-<c>fpga_add</c> function is the name of the FPGA implementation and is mainly divided in four steps:
+The <c>fpga_add</c> function is the FPGA implementation; it is
+divided into five main steps:
 
 - Init actions to be run on DFE.
 - Add data to an input stream for an action.
 - Add data storage space for an output stream.
-- Run actions on DFE in <b>non_blocking</b> mode; a non-blocking call returns immediately, allowing the calling code to do more CPU work in parallel while the actions are run.
+- Run actions on DFE in <b>non_blocking</b> mode; a non-blocking call
+  returns immediately, allowing the calling code to do more CPU work
+  in parallel while the actions are run.
 - Wait for the actions to complete.
 
 In the <c>main</c> function, there are four important steps:
@@ -214,31 +265,46 @@ In the <c>main</c> function, there are four important steps:
 - Free actions.
 - Unload and deallocate the DFE.
 
-The rest of the application (data registration, task submission, etc.) is as usual with StarPU
+The rest of the application (data registration, task submission, etc.)
+is as usual with StarPU.
 
 \subsection FPGADataTransfers Data Transfers in StarPU/FPGA Applications
 
-The communication between the host and the DFE is done through the <c>Dynamic advance interface</c> to exchange data between the main memory and the local memory of the DFE.
-For instant, we use \ref STARPU_MAIN_RAM to send and store data to/from DFE's local memory. However, we aim to use a multiplexer to choose which memory node we will use to read/write data. So, the user can tell that the computational kernel will take data from the main memory or DFE's local memory for example.
+The communication between the host and the DFE is done through the
+<em>advanced dynamic interface</em> to exchange data between the main
+memory and the local memory of the DFE.
+
+For the moment, we use \ref STARPU_MAIN_RAM to send and store data
+to/from DFE's local memory. However, we aim to use a multiplexer to
+choose which memory node we will use to read/write data. So, the user
+can tell that the computational kernel will take data from the main
+memory or DFE's local memory for example.
 
-In starPU applications, When \ref starpu_codelet::specific_nodes is 1, this specifies the memory nodes where each data should be sent to for task execution.
-  
+In StarPU applications, when \ref starpu_codelet::specific_nodes is
+set to 1, this specifies the memory nodes where each data should be
+sent to for task execution.
 
 \subsection FPGAConfiguration FPGA Configuration
 
-To configure StarPU with FPGA accelerators, we can enable <c>FPGA</c> through the \c configure option <b>"--with-fpga"</b>.
+To configure StarPU with FPGA accelerators, we can enable <c>FPGA</c>
+through the \c configure option \ref with-fpga "--with-fpga".
+
+Compiling and installing StarPU/FPGA application is done following the
+standard procedure:
 
-Compiling and installing StarPU/FPGA application is done following the standard procedure:
 \verbatim
 $ make
 $ make install
 \endverbatim
 
-
 \subsection FPGALaunchingprograms  Launching Programs: Simulation
 
-Maxeler provides a simple tutorial to use MaxCompiler (https://trac.version.fz-juelich.de/reconfigurable/wiki/Public). Running the Java program to generate maxfile and slic headers (hardware) on Maxeler's DFE device, takes a VERY long time, approx. 2 hours even for this very small example. That's why we use the simulation.  
-
+Maxeler provides a simple tutorial to use MaxCompiler
+(https://trac.version.fz-juelich.de/reconfigurable/wiki/Public).
+Running the Java program to generate the maxfile and slic headers
+(hardware) on Maxeler's DFE device takes a VERY long time, approx. 2
+hours even for this very small example. That's why we use the
+simulation.
 
 - To start the simulation on Maxeler's DFE device:
 \verbatim
@@ -256,8 +322,8 @@ cores by setting the \ref STARPU_NCPU environment variable to 0.
 \verbatim
 $ STARPU_NCPU=0 ./StreamFMA
 \endverbatim
- 
-- To stop the simulation 
+
+- To stop the simulation
 \verbatim
 $ maxcompilersim -c LIMA -n StreamFMA stop
 \endverbatim

+ 8 - 0
doc/doxygen/chapters/510_configure_options.doxy

@@ -370,6 +370,14 @@ the macro ::STARPU_MAXNODES. Reducing it allows to considerably reduce memory
 used by StarPU data structures.
 </dd>
 
+<dt>--with-fpga=<c>dir</c></dt>
+<dd>
+\anchor with-fpga
+\addindex __configure__--with-fpga
+Enable the FPGA driver support, and optionally specify the location of
+the FPGA library.
+</dd>
+
 </dl>
 
 \section ExtensionConfiguration Extension Configuration

+ 0 - 28
doc/doxygen/chapters/api/fpga_extensions.doxy

@@ -1,28 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2019-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \ingroup API_FPGA_Extensions
-
-\def STARPU_USE_FPGA
-\ingroup API_FPGA_Extensions
-Defined when StarPU has been installed with FPGA support.
-It should be used in your code to detect the availability of FPGA.
-
-\def STARPU_MAXFPGADEVS
-\ingroup API_FPGA_Extensions
-Define the maximum number of FPGA devices that are supported by StarPU.
-
-*/

+ 0 - 5
examples/Makefile.am

@@ -538,11 +538,6 @@ nobase_STARPU_OPENCL_DATA_DATA += 		\
 	basic_examples/block_opencl_kernel.cl
 endif
 
-if STARPU_USE_FPGA
-basic_examples_mmult_SOURCES =                                    \
-	basic_examples/mult-fpga.c
-endif
-
 ####################
 # Variable example #
 ####################

+ 0 - 396
examples/basic_examples/mult-fpga.c

@@ -1,396 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- * Copyright (C) 2010       Mehdi Juhoor <mjuhoor@gmail.com>
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*
- * This example shows a simple implementation of a blocked matrix
- * multiplication. Note that this is NOT intended to be an efficient
- * implementation of sgemm! In this example, we show:
- *  - how to declare dense matrices (starpu_matrix_data_register)
- *  - how to manipulate matrices within codelets (eg. descr[0].blas.ld)
- *  - how to use filters to partition the matrices into blocks
- *    (starpu_data_partition and starpu_data_map_filters)
- *  - how to unpartition data (starpu_data_unpartition) and how to stop
- *    monitoring data (starpu_data_unregister)
- *  - how to manipulate subsets of data (starpu_data_get_sub_data)
- *  - how to construct an autocalibrated performance model (starpu_perfmodel)
- *  - how to submit asynchronous tasks
- */
-
-#include <string.h>
-#include <math.h>
-#include <sys/types.h>
-#include <signal.h>
-
-#include <starpu.h>
-
-static float *A, *B, *C;
-static starpu_data_handle_t A_handle, B_handle, C_handle;
-
-static unsigned nslicesx = 4;
-static unsigned nslicesy = 4;
-#ifdef STARPU_QUICK_CHECK
-static unsigned xdim = 512;
-static unsigned ydim = 512;
-static unsigned zdim = 256;
-#else
-static unsigned xdim = 1024;
-static unsigned ydim = 1024;
-static unsigned zdim = 512;
-#endif
-
-
-/*
- * That program should compute C = A * B 
- * 
- *   A of size (z,y)
- *   B of size (x,z)
- *   C of size (x,y)
-
-              |---------------|
-            z |       B       |
-              |---------------|
-       z              x
-     |----|   |---------------|
-     |    |   |               |
-     |    |   |               |
-     | A  | y |       C       |
-     |    |   |               |
-     |    |   |               |
-     |----|   |---------------|
-
- */
-
-/*
- * The codelet is passed 3 matrices, the "descr" union-type field gives a
- * description of the layout of those 3 matrices in the local memory (ie. RAM
- * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
- * registered data with the "matrix" data interface, we use the matrix macros.
- */
-
-void cpu_mult(void *descr[], STARPU_ATTRIBUTE_UNUSED  void *arg)
-{
-	float *subA, *subB, *subC;
-	uint32_t nxC, nyC, nyA;
-	uint32_t ldA, ldB, ldC;
-
-	/* .blas.ptr gives a pointer to the first element of the local copy */
-	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
-	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
-	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
-
-	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
-	 * is the number of lines that are separated by .blas.ld elements (ld
-	 * stands for leading dimension).
-	 * NB: in case some filters were used, the leading dimension is not
-	 * guaranteed to be the same in main memory (on the original matrix)
-	 * and on the accelerator! */
-	nxC = STARPU_MATRIX_GET_NX(descr[2]);
-	nyC = STARPU_MATRIX_GET_NY(descr[2]);
-	nyA = STARPU_MATRIX_GET_NY(descr[0]);
-
-	ldA = STARPU_MATRIX_GET_LD(descr[0]);
-	ldB = STARPU_MATRIX_GET_LD(descr[1]);
-	ldC = STARPU_MATRIX_GET_LD(descr[2]);
-
-	/* we assume a FORTRAN-ordering! */
-	unsigned i,j,k;
-	for (i = 0; i < nyC; i++)
-	{
-		for (j = 0; j < nxC; j++)
-		{
-			float sum = 0.0;
-
-			for (k = 0; k < nyA; k++)
-			{
-				sum += subA[j+k*ldA]*subB[k+i*ldB];
-			}
-
-			subC[j + i*ldC] = sum;
-		}
-	}
-}
-
-static void init_problem_data(void)
-{
-	unsigned i,j;
-
-	/* we initialize matrices A, B and C in the usual way */
-
-	A = (float *) malloc(zdim*ydim*sizeof(float));
-	B = (float *) malloc(xdim*zdim*sizeof(float));
-	C = (float *) malloc(xdim*ydim*sizeof(float));
-
-	/* fill the A and B matrices */
-	srand(2009);
-	for (j=0; j < ydim; j++)
-	{
-		for (i=0; i < zdim; i++)
-		{
-			A[j+i*ydim] = (float)(starpu_drand48());
-		}
-	}
-
-	for (j=0; j < zdim; j++)
-	{
-		for (i=0; i < xdim; i++)
-		{
-			B[j+i*zdim] = (float)(starpu_drand48());
-		}
-	}
-
-	for (j=0; j < ydim; j++)
-	{
-		for (i=0; i < xdim; i++)
-		{
-			C[j+i*ydim] = (float)(0);
-		}
-	}
-}
-
-static void partition_mult_data(void)
-{
-	/* note that we assume a FORTRAN ordering here! */
-
-	/* The BLAS data interface is described by 4 parameters: 
-	 *  - the location of the first element of the matrix to monitor (3rd
-	 *    argument)
-	 *  - the number of elements between columns, aka leading dimension
-	 *    (4th arg)
-	 *  - the number of (contiguous) elements per column, ie. contiguous
-	 *  elements (5th arg)
-	 *  - the number of columns (6th arg)
-	 * The first elements is a pointer to the data_handle that will be
-	 * associated to the matrix, and the second elements gives the memory
-	 * node in which resides the matrix: 0 means that the 3rd argument is
-	 * an adress in main memory.
-	 */
-	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A, 
-		ydim, ydim, zdim, sizeof(float));
-	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B, 
-		zdim, zdim, xdim, sizeof(float));
-	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C, 
-		ydim, ydim, xdim, sizeof(float));
-
-	/* A filter is a method to partition a data into disjoint chunks, it is
-	 * described by the means of the "struct starpu_data_filter" structure that
-	 * contains a function that is applied on a data handle to partition it
-	 * into smaller chunks, and an argument that is passed to the function
-	 * (eg. the number of blocks to create here).
-	 */
-
-	/* StarPU supplies some basic filters such as the partition of a matrix
-	 * into blocks, note that we are using a FORTRAN ordering so that the
-	 * name of the filters are a bit misleading */
-	struct starpu_data_filter vert =
-	{
-		.filter_func = starpu_matrix_filter_vertical_block,
-		.nchildren = nslicesx
-	};
-
-	struct starpu_data_filter horiz =
-	{
-		.filter_func = starpu_matrix_filter_block,
-		.nchildren = nslicesy
-	};
-
-/*
- *	Illustration with nslicex = 4 and nslicey = 2, it is possible to access
- *	sub-data by using the "starpu_data_get_sub_data" method, which takes a data handle,
- *	the number of filters to apply, and the indexes for each filters, for
- *	instance:
- *
- *		A' handle is starpu_data_get_sub_data(A_handle, 1, 1); 
- *		B' handle is starpu_data_get_sub_data(B_handle, 1, 2); 
- *		C' handle is starpu_data_get_sub_data(C_handle, 2, 2, 1); 
- *
- *	Note that here we applied 2 filters recursively onto C.
- *
- *	"starpu_data_get_sub_data(C_handle, 1, 3)" would return a handle to the 4th column
- *	of blocked matrix C for example.
- *
- *		              |---|---|---|---|
- *		              |   |   | B'|   | B
- *		              |---|---|---|---|
- *		                0   1   2   3
- *		     |----|   |---|---|---|---|
- *		     |    |   |   |   |   |   |
- *		     |    | 0 |   |   |   |   |
- *		     |----|   |---|---|---|---|
- *		     | A' |   |   |   | C'|   |
- *		     |    |   |   |   |   |   |
- *		     |----|   |---|---|---|---|
- *		       A              C
- *
- *	IMPORTANT: applying filters is equivalent to partitionning a piece of
- *	data in a hierarchical manner, so that memory consistency is enforced
- *	for each of the elements independantly. The tasks should therefore NOT
- *	access inner nodes (eg. one column of C or the whole C) but only the
- *	leafs of the tree (ie. blocks here). Manipulating inner nodes is only
- *	possible by disapplying the filters (using starpu_data_unpartition), to
- *	enforce memory consistency.
- */
-
-	starpu_data_partition(B_handle, &vert);
-	starpu_data_partition(A_handle, &horiz);
-
-	/* starpu_data_map_filters is a variable-arity function, the first argument
-	 * is the handle of the data to partition, the second argument is the
-	 * number of filters to apply recursively. Filters are applied in the
-	 * same order as the arguments.
-	 * This would be equivalent to starpu_data_partition(C_handle, &vert) and
-	 * then applying horiz on each sub-data (ie. each column of C)
-	 */
-	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
-}
-
-static struct starpu_perfmodel mult_perf_model =
-{
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "mult_perf_model"
-};
-
-static struct starpu_codelet cl =
-{
-        /* we can only execute that kernel on a CPU yet */
-        /* CPU implementation of the codelet */
-        .cpu_funcs = {cpu_mult},
-        .cpu_funcs_name = {"cpu_mult"},
-        /* the codelet manipulates 3 buffers that are managed by the
-         * DSM */
-        .nbuffers = 3,
-	.modes = {STARPU_R, STARPU_R, STARPU_W},
-        /* in case the scheduling policy may use performance models */
-        .model = &mult_perf_model
-};
-
-static int launch_tasks(void)
-{
-	int ret;
-	/* partition the work into slices */
-	unsigned taskx, tasky;
-
-	for (taskx = 0; taskx < nslicesx; taskx++) 
-	{
-		for (tasky = 0; tasky < nslicesy; tasky++)
-		{
-			/* C[taskx, tasky] = A[tasky] B[taskx] */
-
-			/* by default, starpu_task_create() returns an
- 			 * asynchronous task (ie. task->synchronous = 0) */
-			struct starpu_task *task = starpu_task_create();
-
-			/* this task implements codelet "cl" */
-			task->cl = &cl;
-
-			/*
-			 *              |---|---|---|---|
-			 *              |   | * |   |   | B
-			 *              |---|---|---|---|
-			 *                    X 
-			 *     |----|   |---|---|---|---|
-			 *     |****| Y |   |***|   |   |
-			 *     |****|   |   |***|   |   |
-			 *     |----|   |---|---|---|---|
-			 *     |    |   |   |   |   |   |
-			 *     |    |   |   |   |   |   |
-			 *     |----|   |---|---|---|---|
-			 *       A              C
-			 */
-
-			/* there was a single filter applied to matrices A
-			 * (respectively B) so we grab the handle to the chunk
-			 * identified by "tasky" (respectively "taskx). The "1"
-			 * tells StarPU that there is a single argument to the
-			 * variable-arity function starpu_data_get_sub_data */
-			task->handles[0] = starpu_data_get_sub_data(A_handle, 1, tasky);
-			task->handles[1] = starpu_data_get_sub_data(B_handle, 1, taskx);
-
-			/* 2 filters were applied on matrix C, so we give
-			 * starpu_data_get_sub_data 2 arguments. The order of the arguments
-			 * must match the order in which the filters were
-			 * applied.
-			 * NB: starpu_data_get_sub_data(C_handle, 1, k) would have returned
-			 * a handle to the column number k of matrix C.
-			 * NB2: starpu_data_get_sub_data(C_handle, 2, taskx, tasky) is
-			 * equivalent to
-			 * starpu_data_get_sub_data(starpu_data_get_sub_data(C_handle, 1, taskx), 1, tasky)*/
-			task->handles[2] = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
-
-			/* this is not a blocking call since task->synchronous = 0 */
-			ret = starpu_task_submit(task);
-			if (ret == -ENODEV) return ret;
-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-		}
-	}
-	return 0;
-}
-
-int main(STARPU_ATTRIBUTE_UNUSED int argc, 
-	 STARPU_ATTRIBUTE_UNUSED char **argv)
-{
-	int ret;
-
-	/* start the runtime */
-	ret = starpu_init(NULL);
-	if (ret == -ENODEV)
-		return 77;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-	/* initialize matrices A, B and C and register them to StarPU */
-	init_problem_data();
-
-	/* partition matrices into blocks that can be manipulated by the
- 	 * codelets */
-	partition_mult_data();
-
-	/* submit all tasks in an asynchronous fashion */
-	ret = launch_tasks();
-	if (ret == -ENODEV) goto enodev;
-
-	/* wait for termination */
-        starpu_task_wait_for_all();
-
-	/* remove the filters applied by the means of starpu_data_map_filters; now
- 	 * it's not possible to manipulate a subset of C using starpu_data_get_sub_data until
-	 * starpu_data_map_filters is called again on C_handle.
-	 * The second argument is the memory node where the different subsets
-	 * should be reassembled, 0 = main memory (RAM) */
-	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
-	starpu_data_unpartition(B_handle, STARPU_MAIN_RAM);
-	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
-
-	/* stop monitoring matrix C : after this, it is not possible to pass C 
-	 * (or any subset of C) as a codelet input/output. This also implements
-	 * a barrier so that the piece of data is put back into main memory in
-	 * case it was only available on a GPU for instance. */
-	starpu_data_unregister(A_handle);
-	starpu_data_unregister(B_handle);
-	starpu_data_unregister(C_handle);
-
-	free(A);
-	free(B);
-	free(C);
-
-	starpu_shutdown();
-
-	return 0;
-
-enodev:
-	starpu_shutdown();
-	return 77;
-}
-

+ 7 - 1
examples/mlr/mlr.c

@@ -185,7 +185,7 @@ int main(void)
 		vector_mn[1] = n;
 		starpu_data_release(vector_mn_handle);
 
-		for (j = 0; j < 42; j++)
+		for (j = 0; j < 1000; j++)
 		{
 			starpu_insert_task(&cl_init,
 					   STARPU_R, vector_mn_handle,
@@ -202,5 +202,11 @@ int main(void)
 	free(vector_mn);
 	starpu_shutdown();
 
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	starpu_perfmodel_dump_xml(stdout, &cl_model_final);
+	starpu_shutdown();
+
 	return 0;
 }

+ 1 - 1
include/starpu_clusters.h

@@ -122,7 +122,7 @@ struct starpu_cluster_machine* starpu_cluster_machine(hwloc_obj_type_t cluster_l
 int starpu_uncluster_machine(struct starpu_cluster_machine* clusters);
 int starpu_cluster_print(struct starpu_cluster_machine* clusters);
 
-/* Prologue functions */
+/** Prologue functions */
 void starpu_openmp_prologue(void*);
 #define starpu_intel_openmp_mkl_prologue starpu_openmp_prologue
 #ifdef STARPU_MKL

+ 16 - 5
include/starpu_config.h.in

@@ -78,6 +78,12 @@
    @ingroup API_MPI_Support
 */
 #undef STARPU_USE_MPI_MASTER_SLAVE
+
+/**
+   Defined when StarPU has been installed with FPGA support. It should
+   be used in your code to detect the availability of FPGA.
+   @ingroup API_FPGA_Extensions
+*/
 #undef STARPU_USE_FPGA
 
 /**
@@ -212,11 +218,16 @@
 #undef STARPU_MAXNUMANODES
 
 /**
- * Define the maximum number of CUDA devices that are supported by StarPU.
- * @ingroup API_CUDA_Extensions
- */
+   Define the maximum number of CUDA devices that are supported by StarPU.
+   @ingroup API_CUDA_Extensions
+*/
 #undef STARPU_MAXCUDADEVS
 
+/**
+   Define the maximum number of FPGA devices that are supported by
+   StarPU.
+   @ingroup API_FPGA_Extensions
+ */
 #undef STARPU_MAXFPGADEVS
 
 /**
@@ -307,10 +318,10 @@ typedef ssize_t starpu_ssize_t;
 #undef STARPU_PTHREAD_COND_INITIALIZER_ZERO
 #undef STARPU_PTHREAD_RWLOCK_INITIALIZER_ZERO
 
-/* This is only for building examples */
+/** This is only for building examples */
 #undef STARPU_HAVE_HELGRIND_H
 
-/* Enable Fortran to C MPI interface */
+/** Enable Fortran to C MPI interface */
 #undef  HAVE_MPI_COMM_F2C
 
 #undef STARPU_HAVE_DARWIN

+ 3 - 1
include/starpu_data.h

@@ -115,7 +115,9 @@ enum starpu_data_access_mode
 
 struct starpu_data_interface_ops;
 
-/** Set the name of the data, to be shown in various profiling tools. */
+/**
+   Set the name of the data, to be shown in various profiling tools.
+*/
 void starpu_data_set_name(starpu_data_handle_t handle, const char *name);
 
 /**

+ 9 - 10
include/starpu_fpga.h

@@ -33,37 +33,36 @@ extern "C"
 */
 
 //int starpu_fpga_allocate_memory(fpga_mem *addr, size_t size);
-typedef struct data_exchange_log
+
+struct starpu_fpga_data_trans
 {
         int size;
         float time;
-} fpga_trans;
+};
 
-typedef struct properties
+struct starpu_fpga_device_properties
 {
         int totalGlobalMem;
         int concurrentKernels;
         char *name;
-} fpgaDeviceProp;
+};
 
 /**
    get device properties
  */
+int starpu_fpga_get_device_properties(struct starpu_fpga_device_properties *properties, unsigned devid);
 
-int fpgaGetDeviceProperties(fpgaDeviceProp *,unsigned devid);
-
-void fpga_report_configuration(void);
+void starpu_fpga_report_configuration(void);
 
 /**
    set fpga device
  */
-int fpgaSetDevice(unsigned devid);
+int starpu_fpga_set_device(unsigned devid);
 
 /**
    fpga is silent
  */
-
-int fpga_is_silent();
+int starpu_fpga_is_silent();
 
 /** @} */
 

+ 1 - 1
src/core/perfmodel/multiple_regression.c

@@ -329,7 +329,7 @@ int _starpu_multiple_regression(struct starpu_perfmodel_history_list *ptr, doubl
 		/* Basic validation of the model accuracy */
 		starpu_validate_mlr(coeff, ncoeff, codelet_name);
 #else
-		_STARPU_DISP("Warning: StarPU was compiled with '--disable-mlr' option or on Windows machine, thus multiple linear regression model will not be computed.\n");
+		_STARPU_DISP("Warning: StarPU was compiled without '--enable-mlr' option, thus multiple linear regression model will not be computed.\n");
 		for(i=0; i<ncoeff; i++)
 			coeff[i] = 0.;
 #endif //STARPU_MLR_MODEL

+ 8 - 4
src/core/perfmodel/perfmodel_history.c

@@ -1010,6 +1010,8 @@ static void dump_per_arch_model_xml(FILE *f, struct starpu_perfmodel *model, int
 
 void starpu_perfmodel_dump_xml(FILE *f, struct starpu_perfmodel *model)
 {
+	_starpu_init_and_load_perfmodel(model);
+
 	fprintf(f, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
 	fprintf(f, "<!DOCTYPE StarPUPerfmodel SYSTEM \"starpu-perfmodel.dtd\">\n");
 	fprintf(f, "<!-- symbol %s -->\n", model->symbol);
@@ -1496,6 +1498,8 @@ int starpu_perfmodel_load_file(const char *filename, struct starpu_perfmodel *mo
 
 	if (ret)
 		starpu_perfmodel_unload_model(model);
+	else
+		model->is_loaded = 1;
 	return ret;
 }
 
@@ -1611,7 +1615,7 @@ docal:
 		char archname[STR_SHORT_LENGTH];
 
 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
-		_STARPU_DISP("Warning: model %s is not calibrated enough for %s size %lu (only %u measurements from size %lu to %lu), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, (unsigned long) size, regmodel?regmodel->nsample:0, regmodel?regmodel->minx:0, regmodel?regmodel->maxx:0);
+		_STARPU_DISP("Warning: model %s is not calibrated enough for %s size %lu (only %u measurements from size %lu to %lu), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this. You probably need to run again to continue calibrating the model, until this warning disappears.\n", model->symbol, archname, (unsigned long) size, regmodel?regmodel->nsample:0, regmodel?regmodel->minx:0, regmodel?regmodel->maxx:0);
 		_starpu_set_calibrate_flag(1);
 		model->benchmarking = 1;
 	}
@@ -1664,7 +1668,7 @@ docal:
 			char archname[STR_SHORT_LENGTH];
 
 			starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
-			_STARPU_DISP("Warning: model %s is not calibrated enough for %s size %lu (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, (unsigned long) size, entry && entry->history_entry ? entry->history_entry->nsample : 0);
+			_STARPU_DISP("Warning: model %s is not calibrated enough for %s size %lu (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this. You probably need to run again to continue calibrating the model, until this warning disappears.\n", model->symbol, archname, (unsigned long) size, entry && entry->history_entry ? entry->history_entry->nsample : 0);
 			_starpu_set_calibrate_flag(1);
 			model->benchmarking = 1;
 		}
@@ -1711,7 +1715,7 @@ docal:
 		char archname[STR_SHORT_LENGTH];
 
 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
-		_STARPU_DISP("Warning: model %s is not calibrated enough for %s, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname);
+		_STARPU_DISP("Warning: model %s is not calibrated enough for %s, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this. You probably need to run again to continue calibrating the model, until this warning disappears.\n", model->symbol, archname);
 		_starpu_set_calibrate_flag(1);
 		model->benchmarking = 1;
 	}
@@ -1794,7 +1798,7 @@ docal:
 		char archname[STR_SHORT_LENGTH];
 
 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
-		_STARPU_DISP("Warning: model %s is not calibrated enough for %s size %ld footprint %x (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, j->task?(long int)_starpu_job_get_data_size(model, arch, nimpl, j):-1, key, entry ? entry->nsample : 0);
+		_STARPU_DISP("Warning: model %s is not calibrated enough for %s size %ld footprint %x (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this. You probably need to run again to continue calibrating the model, until this warning disappears.\n", model->symbol, archname, j->task?(long int)_starpu_job_get_data_size(model, arch, nimpl, j):-1, key, entry ? entry->nsample : 0);
 		_starpu_set_calibrate_flag(1);
 		model->benchmarking = 1;
 	}

+ 1 - 0
src/core/perfmodel/perfmodel_print.c

@@ -172,6 +172,7 @@ void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmo
 
 int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output)
 {
+	_starpu_init_and_load_perfmodel(model);
 	if (arch == NULL)
 	{
 		int comb, impl;

+ 21 - 23
src/drivers/max/driver_fpga.c

@@ -47,7 +47,7 @@
 //#define STARPU_MAXFPGADEVS 4
 /* the number of FPGA devices */
 static unsigned  nfpgafpgas = -1;
-static fpgaDeviceProp props[STARPU_MAXFPGADEVS];
+static struct starpu_fpga_device_properties props[STARPU_MAXFPGADEVS];
 static size_t global_mem[STARPU_MAXFPGADEVS] = { 128ULL*1024*1024*1024 };
 
 static void _starpu_fpga_limit_global_mem(unsigned );
@@ -72,9 +72,9 @@ void _starpu_init_fpga()
 #if 0
 int fpga_allocate_memory(fpga_mem *ptr, size_t size)
 {
-//This allocates BYTES
+	//This allocates BYTES
 	char *msg1="You asked to allocate ";
-//	printf(KCYN "%s%d*%d\n" KBLU, msg1,size,sizeof(unsigned));
+	//printf(KCYN "%s%d*%d\n" KBLU, msg1,size,sizeof(unsigned));
 	printf(FPGA_OK "%s%lu bytes\n" NORMAL, msg1,size);
 
 	*ptr =(fpga_mem) malloc(size);
@@ -86,9 +86,9 @@ int fpga_allocate_memory(fpga_mem *ptr, size_t size)
 }
 #endif
 
-int fpgaGetDeviceProperties(fpgaDeviceProp *props,unsigned devid)
+int starpu_fpga_get_device_properties(struct starpu_fpga_device_properties *props, unsigned devid)
 {
-//TODO
+	//TODO
         props->totalGlobalMem=1*1024*1024;
         props->concurrentKernels=4;
         props->name="Fpga_Props_Name";
@@ -111,7 +111,7 @@ static void _starpu_fpga_limit_global_mem(unsigned devid)
 {
 	starpu_ssize_t limit=-1;
 
-//TODO
+	//TODO
 	limit = starpu_get_env_number("STARPU_LIMIT_FPGA_MEM");
 	if(limit != -1)
 		global_mem[devid] = limit*1024*1024;
@@ -124,35 +124,33 @@ static size_t _starpu_fpga_get_global_mem_size(unsigned devid)
 
 static void init_fpga_worker_context(unsigned workerid)
 {
-//		starpu_fpgaStreamCreate(&streams[devid][i]);
+	//starpu_fpgaStreamCreate(&streams[devid][i]);
 }
 
 static void init_device_context(unsigned devid)
 {
-//	int workerid;
 	unsigned i;
-//TODO: fpgaSetDevice
-	fpgaSetDevice(devid);
-
-//TODO: fpgaGetDeviceProperties
-	fpgaGetDeviceProperties(&props[devid], devid);
-//TODO: Do we need the streams? I think no
-//	cures = starpu_fpgaStreamCreate(&in_transfer_streams[devid]);
-//	cures = starpu_fpgaStreamCreate(&out_transfer_streams[devid]);
+	//TODO: starpu_fpga_set_device
+	starpu_fpga_set_device(devid);
+
+	starpu_fpga_get_device_properties(&props[devid], devid);
+	//TODO: Do we need the streams? I think not
+	//cures = starpu_fpgaStreamCreate(&in_transfer_streams[devid]);
+	//cures = starpu_fpgaStreamCreate(&out_transfer_streams[devid]);
 	for (i = 0; i < nfpgafpgas; i++)
 	{
-//		starpu_fpgaStreamCreate(&in_peer_transfer_streams[i][devid]);
-//		starpu_fpgaStreamCreate(&out_peer_transfer_streams[devid][i]);
+		//starpu_fpgaStreamCreate(&in_peer_transfer_streams[i][devid]);
+		//starpu_fpgaStreamCreate(&out_peer_transfer_streams[devid][i]);
 	}
 }
 
 int _starpu_fpga_driver_init(struct _starpu_worker *worker)
 {
 	int devid = worker->devid;
-//fpga_msg("successful till here");
+	//fpga_msg("successful till here");
 	_starpu_driver_start(worker, _STARPU_FUT_CPU_KEY, 1);
 	/* FIXME: when we have NUMA support, properly turn node number into NUMA node number */
-// TODO: drop test when we allocated a memory node for fpga
+	// TODO: drop test when we allocated a memory node for fpga
 	if (worker->memory_node != STARPU_MAIN_RAM)
 		_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_fpga_get_global_mem_size(worker->devid));
 
@@ -199,7 +197,7 @@ static int execute_job_on_fpga(struct _starpu_job *j, struct starpu_task *worker
 	{
 		_starpu_cl_func_t func = _starpu_task_get_fpga_nth_implementation(cl, j->nimpl);
 		//char *kernel_type = _starpu_task_get_fpga_kernel_type_nth_implementation(cl, j->nimpl);
-//printf("chanel reserved: %d \n",chnl);
+		//printf("chanel reserved: %d \n",chnl);
 
 		STARPU_ASSERT_MSG(func, "when STARPU_FPGA is defined in 'where', fpga_func or fpga_funcs has to be defined");
 		if (_starpu_get_disable_kernels() <= 0)
@@ -369,10 +367,10 @@ uintptr_t _starpu_fpga_allocate_memory(unsigned dst_node, size_t size, int flags
 	STARPU_ASSERT(devid == 0); // For now
 
 	/* 0 would be seen as NULL, i.e. allocation failed... */
-// FIXME: Maxeler FPGAs want 192-byte alignment
+	// FIXME: Maxeler FPGAs want 192-byte alignment
 	static fpga_mem current_address = 8192*192;
 	fpga_mem addr;
-// TODO: vérifier si current_address + size > taille de la LMEm
+	// TODO: check whether current_address + size exceeds the size of the LMem
  	addr = current_address;
 	current_address += size;
 	printf("fpga mem returned from allocation @: %p - %p\n",addr, addr + size);

+ 0 - 1
src/drivers/max/driver_fpga.h

@@ -36,7 +36,6 @@ typedef unsigned * fpga_mem;
 extern struct _starpu_driver_ops _starpu_driver_fpga_ops;
 extern struct _starpu_node_ops _starpu_driver_fpga_node_ops;
 
-int fpgaSetDevice(unsigned devid);
 void _starpu_init_fpga(void);
 #ifdef STARPU_USE_FPGA
 void _starpu_fpga_discover_devices (struct _starpu_machine_config *config);