Browse Source

Re-add fpga documentation

Samuel Thibault 5 years ago
parent
commit
f75a955923

+ 48 - 0
doc/doxygen/440_fpga_support.doxy

@@ -0,0 +1,48 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                          CNRS
+ * Copyright (C) 2019                          Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*! \page FPGASupport FPGA Support
+
+\section FPGA FPGA 
+
+The use of specialized hardware such as accelerators or coprocessors offers an
+interesting approach to overcome the physical limits encountered by processor
+architects. As a result, many machines are now equipped with one or several
+accelerators (e.g. a GPU), in addition to the usual processor(s). While a lot of
+effort has been devoted to offloading computation onto such accelerators, very
+little attention has been paid to portability concerns on the one hand, and to the
+possibility of having heterogeneous accelerators and processors interact on the other hand.
+
+ In addition, StarPU comes with programming language support, in the form of an OpenCL front-end (\ref FPGAextensions).
+
+\section PortingApplicationsToFPGA Porting Applications To FPGA
+
+The way to port an application to FPGA is to set the field
+starpu_codelet::fpga_funcs, to provide StarPU with the function
+for FPGA implementation, so for instance:
+
+\verbatim
+struct starpu_codelet cl =
+{
+    .fpga_funcs = {myfunc},
+    .nbuffers = 1,
+}
+\endverbatim
+
+
+
+*/

+ 5 - 2
doc/doxygen/Makefile.am

@@ -73,8 +73,9 @@ chapters =	\
 	chapters/415_fault_tolerance.doxy	\
 	chapters/420_fft_support.doxy		\
 	chapters/430_mic_support.doxy		\
-	chapters/450_native_fortran_support.doxy		\
-	chapters/460_socl_opencl_extensions.doxy		\
+	chapters/440_fpga_support.doxy		\
+	chapters/450_native_fortran_support.doxy              \
+        chapters/460_socl_opencl_extensions.doxy		\
 	chapters/470_simgrid.doxy		\
 	chapters/480_openmp_runtime_support.doxy		\
 	chapters/490_clustering_a_machine.doxy		\
@@ -97,6 +98,7 @@ chapters =	\
 	chapters/code/disk_compute.c \
 	chapters/code/nf_initexit.f90 \
 	chapters/api/fft_support.doxy \
+	chapters/api/fpga_extensions.doxy \
 	chapters/api/versioning.doxy \
 	chapters/api/threads.doxy
 
@@ -201,6 +203,7 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/include/starpu_fxt.h		\
 	$(top_srcdir)/include/starpu_hash.h		\
 	$(top_srcdir)/include/starpu_mic.h		\
+	$(top_srcdir)/include/starpu_fpga.h		\
 	$(top_srcdir)/include/starpu_mod.f90		\
 	$(top_srcdir)/include/starpu_opencl.h		\
 	$(top_srcdir)/include/starpu_openmp.h		\

+ 267 - 0
doc/doxygen/chapters/440_fpga_support.doxy

@@ -0,0 +1,267 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                          CNRS
+ * Copyright (C) 2019                          Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*! \page FPGASupport FPGA Support
+
+\section Introduction Introduction
+Maxeler provides hardware and software solutions for accelerating computing applications on dataflow engines (DFEs). DFEs are in-house designed accelerators that encapsulate reconfigurable high-end FPGAs at their core and are equipped with large amounts of DDR memory.
+We extend the StarPU task programming library, which initially targets heterogeneous architectures, to support Field Programmable Gate Arrays (FPGAs).
+To create <c>StarPU/FPGA</c> applications exploiting DFE configurations, MaxCompiler allows an application to be split into three parts:
+
+- <c>Kernel</c>, which implements the computational components of the application in hardware.
+- <c>Manager configuration</c>, which connects Kernels to the CPU, engine RAM, other Kernels and other DFEs via MaxRing.
+- <c>CPU application</c>, which interacts with the DFEs to read and write data to the Kernels and engine RAM.
+
+The Simple Live CPU interface (SLiC) is Maxeler’s application programming interface for seamless CPU-DFE integration. SLiC allows CPU applications to configure and load a number of DFEs as well as to subsequently schedule and run actions on those DFEs using simple function calls. In StarPU/FPGA applications, we use <c>Dynamic SLiC Interface</c> to exchange data streams between the CPU (Main Memory) and DFE (Local Memory).
+
+\section PortingApplicationsToFPGA Porting Applications to FPGA
+
+The way to port an application to FPGA is to set the field
+starpu_codelet::fpga_funcs, to provide StarPU with the function
+for FPGA implementation, so for instance:
+
+\verbatim
+struct starpu_codelet cl =
+{
+    .fpga_funcs = {myfunc},
+    .nbuffers = 1,
+}
+\endverbatim
+
+\subsection FPGAExample StarPU/FPGA Application
+
+To give you an idea of the interface that we used to exchange data between <c>host</c> (CPU) and <c>FPGA</c> (DFE), here is an example, based on one of the examples of Maxeler (https://trac.version.fz-juelich.de/reconfigurable/wiki/Public). 
+<c>StreamFMAKernel.maxj</c> represents the Java kernel code; it implements a very simple kernel (c=a+b), and <c>Test.c</c> starts it from the <c>fpga_add</c> function; it first sets streaming up from the CPU pointers, triggers execution and waits for the result. The API to interact with DFEs is called <c>SLiC</c>, which in turn involves the <c>MaxelerOS</c> runtime.
+
+
+- <c>StreamFMAKernel.maxj</c>: the DFE part is described in the MaxJ programming language which is a Java-based metaprogramming approach.
+\code{.c}
+package tests;
+
+import com.maxeler.maxcompiler.v2.kernelcompiler.Kernel;
+import com.maxeler.maxcompiler.v2.kernelcompiler.KernelParameters;
+import com.maxeler.maxcompiler.v2.kernelcompiler.types.base.DFEType;
+import com.maxeler.maxcompiler.v2.kernelcompiler.types.base.DFEVar;
+
+class StreamFMAKernel extends Kernel {
+
+   private static final DFEType type = dfeInt(32);
+
+   protected StreamFMAKernel(KernelParameters parameters) {
+             super(parameters);
+
+	     DFEVar a = io.input("a", type);
+	     DFEVar b = io.input("b", type);
+	     DFEVar c;
+
+	     c = a+b;
+
+	     io.output("output", c, type);
+	}
+
+}
+
+\endcode
+
+- <c>StreamFMAManager.maxj</c>: is also described in the MaxJ programming language and orchestrates data movement between the host and the DFE.
+\code{.c}
+package tests;
+
+import com.maxeler.maxcompiler.v2.build.EngineParameters;
+import com.maxeler.maxcompiler.v2.managers.custom.blocks.KernelBlock;
+import com.maxeler.platform.max5.manager.Max5LimaManager;
+
+class StreamFMAManager extends Max5LimaManager {
+
+	private static final String kernel_name = "StreamFMAKernel";
+
+	public StreamFMAManager(EngineParameters arg0) {
+		super(arg0);
+		KernelBlock kernel = addKernel(new StreamFMAKernel(makeKernelParameters(kernel_name)));
+		kernel.getInput("a") <== addStreamFromCPU("a");
+		kernel.getInput("b") <== addStreamFromCPU("b");
+		addStreamToCPU("output") <== kernel.getOutput("output");
+	}
+
+	public static void main(String[] args) {
+		StreamFMAManager manager = new StreamFMAManager(new EngineParameters(args));
+		manager.build();
+	}
+}
+\endcode
+
+Once <c>StreamFMAKernel.maxj</c> and <c>StreamFMAManager.maxj</c> are written, there are other steps to do:
+
+- Building the Java program (for the Kernel and Manager .maxj files):
+\verbatim
+$ maxjc -1.7 -cp $MAXCLASSPATH streamfma/
+\endverbatim
+
+- Running the Java program to generate a DFE implementation (a .max file) that can be called from a StarPU/FPGA application and slic headers (.h) for simulation:
+\verbatim
+$ java -XX:+UseSerialGC -Xmx2048m -cp $MAXCLASSPATH:. streamfma.StreamFMAManager DFEModel=MAIA maxFileName=StreamFMA target=DFE_SIM
+\endverbatim
+
+- Build the slic object file (simulation): 
+\verbatim
+$ sliccompile StreamFMA.max
+\endverbatim
+
+- <c>Test.c</c>:
+To interface the StarPU task-based runtime system with Maxeler's DFE devices, we use the advanced dynamic interface of <c>SLiC</c> in <b>non_blocking</b> mode.
+Test code must include <c>MaxSLiCInterface.h</c> and <c>MaxFile.h</c>. The .max file contains the bitstream. The StarPU/FPGA application can be written in C, C++, etc.
+\code{.c}
+#include "StreamFMA.h"
+#include "MaxSLiCInterface.h"
+
+void fpga_add(void *buffers[], void *cl_arg)
+{   
+    (void)cl_arg;
+    
+    int *a = (int*) STARPU_VECTOR_GET_PTR(buffers[0]);
+    int *b = (int*) STARPU_VECTOR_GET_PTR(buffers[1]);
+    int *c = (int*) STARPU_VECTOR_GET_PTR(buffers[2]);
+
+    int size = STARPU_VECTOR_GET_NX(buffers[0]);
+
+    /* actions to run on an engine */
+    max_actions_t *act = max_actions_init(maxfile, NULL);
+
+    /* set the number of ticks for a kernel */
+    max_set_ticks  (act, "StreamFMAKernel", size);
+    
+    /* send input streams */
+    max_queue_input(act, "a", a, size *sizeof(a[0])); 
+    max_queue_input(act, "b", b, size*sizeof(b[0]));
+    
+    /* store output stream */
+    max_queue_output(act,"output", c, size*sizeof(c[0]));
+
+    /* run actions on the engine */
+     printf("**** Run actions in non blocking mode **** \n");
+
+    /* run actions in non_blocking mode */
+    max_run_t *run0= max_run_nonblock(engine, act);
+
+    printf("*** wait for the actions on DFE to complete *** \n");
+    max_wait(run0);
+     
+  }
+
+  static struct starpu_codelet cl =
+  {
+    .cpu_funcs = {cpu_func},
+    .cpu_funcs_name = {"cpu_func"},
+    .fpga_funcs = {fpga_add},
+    .nbuffers = 3,
+    .modes = {STARPU_R, STARPU_R, STARPU_W}
+  };
+
+int main(int argc, char **argv)
+{
+ 
+    ...
+
+    /* Implementation of a maxfile */
+    max_file_t *maxfile = StreamFMA_init();
+
+    /* Implementation of an engine */
+    max_engine_t *engine = max_load(maxfile, "*"); 
+
+    starpu_init(NULL);
+
+    ... Task submission etc. ...
+
+    starpu_shutdown();
+
+    /* deallocate the set of actions */
+    max_actions_free(act);
+
+    /* unload and deallocate an engine obtained by way of max_load */
+    max_unload(engine);
+    
+    return 0;
+}
+\endcode
+
+To write the StarPU/FPGA application: first, the programmer must describe the codelet using StarPU’s C API. This codelet provides both a CPU implementation and an FPGA one. It also specifies that the task has two inputs and one output through the <c>nbuffers</c> and <c>modes</c> attributes.
+
+The <c>fpga_add</c> function is the FPGA implementation and is mainly divided into five steps:
+
+- Init actions to be run on DFE.
+- Add data to an input stream for an action.
+- Add data storage space for an output stream.
+- Run actions on DFE in <b>non_blocking</b> mode; a non-blocking call returns immediately, allowing the calling code to do more CPU work in parallel while the actions are run.
+- Wait for the actions to complete.
+
+In the <c>main</c> function, there are four important steps:
+
+- Implement a maxfile.
+- Load a DFE.
+- Free actions.
+- Unload and deallocate the DFE.
+
+The rest of the application (data registration, task submission, etc.) is as usual with StarPU.
+
+\subsection FPGADataTransfers Data Transfers in StarPU/FPGA Applications
+
+The communication between the host and the DFE is done through the <c>Dynamic advance interface</c> to exchange data between the main memory and the local memory of the DFE.
+For now, we use \ref STARPU_MAIN_RAM to send and store data to/from the DFE's local memory. However, we aim to use a multiplexer to choose which memory node we will use to read/write data. So, the user can tell that the computational kernel will take data from the main memory or the DFE's local memory, for example.
+
+In StarPU applications, when \ref starpu_codelet::specific_nodes is 1, this specifies the memory nodes where each piece of data should be sent for task execution.
+  
+
+\subsection FPGAConfiguration FPGA Configuration
+
+To configure StarPU with FPGA accelerators, we can enable <c>FPGA</c> through the \c configure option <b>"--with-fpga"</b>.
+
+Compiling and installing StarPU/FPGA application is done following the standard procedure:
+\verbatim
+$ make
+$ make install
+\endverbatim
+
+
+\subsection FPGALaunchingprograms  Launching Programs: Simulation
+
+Maxeler provides a simple tutorial to use MaxCompiler (https://trac.version.fz-juelich.de/reconfigurable/wiki/Public). Running the Java program to generate the maxfile and SLiC headers (hardware) on Maxeler's DFE device takes a VERY long time, approx. 2 hours even for this very small example. That's why we use the simulation.
+
+
+- To start the simulation on Maxeler's DFE device:
+\verbatim
+$ maxcompilersim -c LIMA -n StreamFMA restart
+\endverbatim
+
+- To run the binary (simulation)
+\verbatim
+$ export LD_LIBRARY_PATH=$MAXELEROSDIR/lib:$LD_LIBRARY_PATH
+$ export SLIC_CONF="use_simulation=StreamFMA"
+\endverbatim
+
+- To force tasks to be scheduled on the FPGA, one can disable the use of CPU
+cores by setting the \ref STARPU_NCPU environment variable to 0.
+\verbatim
+$ STARPU_NCPU=0 ./StreamFMA
+\endverbatim
+ 
+- To stop the simulation 
+\verbatim
+$ maxcompilersim -c LIMA -n StreamFMA stop
+\endverbatim
+
+
+*/

+ 29 - 0
doc/doxygen/chapters/api/fpga_extensions.doxy

@@ -0,0 +1,29 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     CNRS
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*! \ingroup API_FPGA_Extensions
+
+\def STARPU_USE_FPGA
+\ingroup API_FPGA_Extensions
+Defined when StarPU has been installed with FPGA support.
+It should be used in your code to detect the availability of FPGA.
+
+\def STARPU_MAXFPGADEVS
+\ingroup API_FPGA_Extensions
+Defines the maximum number of FPGA devices that are supported by StarPU.
+
+*/

+ 1 - 0
doc/doxygen/doxygen-config.cfg.in

@@ -32,6 +32,7 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 			 @top_srcdir@/include/starpu_disk.h \
 			 @top_srcdir@/include/starpu_driver.h \
 			 @top_srcdir@/include/starpu_expert.h \
+			 @top_srcdir@/include/starpu_fpga.h \
 			 @top_srcdir@/include/starpu_fxt.h \
 			 @top_srcdir@/include/starpu.h \
 			 @top_srcdir@/include/starpu_hash.h \

+ 1 - 0
doc/doxygen/doxygen.cfg

@@ -1616,6 +1616,7 @@ INCLUDE_FILE_PATTERNS  =
 PREDEFINED             = STARPU_USE_OPENCL=1 \
                          STARPU_USE_CUDA=1 \
                          STARPU_USE_MIC=1 \
+                         STARPU_USE_FPGA=1 \
 			 STARPU_USE_MPI=1 \
 			 STARPU_HAVE_HWLOC=1 \
 			 STARPU_USE_SC_HYPERVISOR=1 \

+ 7 - 0
doc/doxygen/refman.tex

@@ -163,6 +163,11 @@ Documentation License”.
 \hypertarget{MICSupport}{}
 \input{MICSupport}
 
+\chapter{FPGA Support}
+\label{FPGASupport}
+\hypertarget{FPGASupport}{}
+\input{FPGASupport}
+
 \chapter{Native Fortran Support}
 \label{NativeFortranSupport}
 \hypertarget{NativeFortranSupport}{}
@@ -238,6 +243,7 @@ Documentation License”.
 \input{group__API__OpenCL__Extensions}
 \input{group__API__OpenMP__Runtime__Support}
 \input{group__API__MIC__Extensions}
+\input{group__API__FPGA__Extensions}
 \input{group__API__Miscellaneous__Helpers}
 \input{group__API__FxT__Support}
 \input{group__API__FFT__Support}
@@ -283,6 +289,7 @@ Documentation License”.
 \input{starpu__disk_8h}
 \input{starpu__driver_8h}
 \input{starpu__expert_8h}
+%\input{starpu__fpga_8h}
 \input{starpu__fxt_8h}
 \input{starpu__hash_8h}
 \input{starpu__helper_8h}