瀏覽代碼

starpu maxeler

mariem makni 5 年之前
父節點
當前提交
0f396a0ef2

+ 2 - 2
configure.ac

@@ -166,8 +166,8 @@ if test x$enable_fpga = xyes; then
 	if test x$link_with_riffa = xyes; then
    	   STARPU_FPGA_LDFLAGS="-L$fpga_lib_dir -lfpga -lriffa -lrt -lm"
 	elif test x$link_with_maxeler = xyes; then
-	   STARPU_FPGA_CPPFLAGS=`slic-config --cflags`" $STARPU_FPGA_CPPFLAGS"
-   	   STARPU_FPGA_LDFLAGS=`slic-config --libs`" -lrt -lm"
+	   STARPU_FPGA_CPPFLAGS="-I/opt/Software/maxeler/maxcompiler-2018.2.1/include/slic $STARPU_FPGA_CPPFLAGS -I/home/jusers/makni1/jumax/makni/starpu.git/tests/perfmodels/simulation"
+   	   STARPU_FPGA_LDFLAGS=" -L/opt/Software/maxeler/maxcompiler-2018.2.1/lib -lslic -L/opt/Software/maxeler/maxcompiler-2018.2.1/lib/maxeleros-sim/lib -lmaxeleros -lpthread -lm -lrt"
 	else
    	   STARPU_FPGA_LDFLAGS="-L$fpga_lib_dir -lfpga -lrt -lm"	
 	fi

+ 1 - 99
examples/Makefile.am

@@ -275,15 +275,7 @@ endif
 
 if STARPU_HAVE_FC
 if !STARPU_SANITIZE
-STARPU_EXAMPLES +=				\
-	fortran90/f90_example			\
-	native_fortran/nf_vector		\
-	native_fortran/nf_matrix		\
-	native_fortran/nf_example		\
-	native_fortran/nf_dynbuf		\
-	native_fortran/nf_varbuf		\
-	native_fortran/nf_sched_ctx		\
-	native_fortran/nf_partition
+
 endif
 endif
 endif
@@ -437,41 +429,6 @@ fortran90_f90_example_SOURCES =	\
 	fortran90/marshalling.c		\
 	fortran90/f90_example.f90
 
-native_fortran_nf_vector_SOURCES =	\
-	native_fortran/nf_codelets.f90		\
-	$(top_srcdir)/include/fstarpu_mod.f90	\
-	native_fortran/nf_vector.f90
-
-native_fortran_nf_matrix_SOURCES =	\
-	native_fortran/nf_codelets.f90		\
-	$(top_srcdir)/include/fstarpu_mod.f90	\
-	native_fortran/nf_matrix.f90
-
-native_fortran_nf_example_SOURCES =	\
-	native_fortran/nf_types.f90		\
-	native_fortran/nf_compute.f90		\
-	$(top_srcdir)/include/fstarpu_mod.f90	\
-	native_fortran/nf_example.f90
-
-native_fortran_nf_dynbuf_SOURCES =	\
-	native_fortran/nf_dynbuf_cl.f90		\
-	$(top_srcdir)/include/fstarpu_mod.f90	\
-	native_fortran/nf_dynbuf.f90
-
-native_fortran_nf_varbuf_SOURCES =	\
-	native_fortran/nf_varbuf_cl.f90		\
-	$(top_srcdir)/include/fstarpu_mod.f90	\
-	native_fortran/nf_varbuf.f90
-
-native_fortran_nf_sched_ctx_SOURCES =	\
-	native_fortran/nf_sched_ctx_cl.f90		\
-	$(top_srcdir)/include/fstarpu_mod.f90	\
-	native_fortran/nf_sched_ctx.f90
-
-native_fortran_nf_partition_SOURCES =	\
-	native_fortran/nf_partition_cl.f90		\
-	$(top_srcdir)/include/fstarpu_mod.f90	\
-	native_fortran/nf_partition.f90
 endif
 
 #######################
@@ -1106,59 +1063,4 @@ mod_compute.o: $(top_srcdir)/examples/fortran90/mod_compute.f90 mod_types.mod mo
 f90_example.o: $(top_srcdir)/examples/fortran90/f90_example.f90 $(top_srcdir)/examples/fortran90/marshalling.c mod_types.mod mod_interface.mod mod_compute.mod starpu_mod.mod
 	$(AM_V_FC)$(FC) $(fortran90_f90_example_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'fortran90/f90_example.f90' || echo '$(srcdir)/'`fortran90/f90_example.f90
 
-# Native Fortran example
-# - list explicit dependences to control proper module files generation
-# - the overriding rule fully disables the corresponing default rule, thus
-#   the default rule body must be copied entirely
-nf_types.mod: nf_types.o
-nf_compute.mod: nf_compute.o
-fstarpu_mod.mod: fstarpu_mod.o
-nf_codelets.mod: nf_codelets.o
-nf_dynbuf_cl.mod: nf_dynbuf_cl.o
-nf_varbuf_cl.mod: nf_varbuf_cl.o
-nf_sched_ctx_cl.mod: nf_sched_ctx_cl.o
-nf_partition_cl.mod: nf_partition_cl.o
-
-fstarpu_mod.o: $(top_srcdir)/include/fstarpu_mod.f90
-	$(AM_V_FC)$(FC) $(native_fortran_nf_vector_FCFLAGS) $(FCFLAGS) -c -o $@ '$(top_srcdir)/'include/fstarpu_mod.f90
-
-nf_codelets.o: $(top_srcdir)/examples/native_fortran/nf_codelets.f90 fstarpu_mod.mod
-	$(AM_V_FC)$(FC) $(native_fortran_nf_vector_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_codelets.f90' || echo '$(srcdir)/'`native_fortran/nf_codelets.f90
-
-nf_vector.o: $(top_srcdir)/examples/native_fortran/nf_vector.f90 nf_codelets.mod fstarpu_mod.mod
-	$(AM_V_FC)$(FC) $(native_fortran_nf_vector_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_vector.f90' || echo '$(srcdir)/'`native_fortran/nf_vector.f90
-
-nf_matrix.o: $(top_srcdir)/examples/native_fortran/nf_matrix.f90 nf_codelets.mod fstarpu_mod.mod
-	$(AM_V_FC)$(FC) $(native_fortran_nf_matrix_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_matrix.f90' || echo '$(srcdir)/'`native_fortran/nf_matrix.f90
-
-nf_compute.o: $(top_srcdir)/examples/native_fortran/nf_compute.f90 nf_types.mod fstarpu_mod.mod
-	$(AM_V_FC)$(FC) $(native_fortran_nf_example_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_compute.f90' || echo '$(srcdir)/'`native_fortran/nf_compute.f90
-
-nf_example.o: $(top_srcdir)/examples/native_fortran/nf_example.f90 nf_types.mod nf_compute.mod fstarpu_mod.mod
-	$(AM_V_FC)$(FC) $(native_fortran_nf_example_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_example.f90' || echo '$(srcdir)/'`native_fortran/nf_example.f90
-
-nf_dynbuf_cl.o: $(top_srcdir)/examples/native_fortran/nf_dynbuf_cl.f90 nf_types.mod fstarpu_mod.mod
-	$(AM_V_FC)$(FC) $(native_fortran_nf_dynbuf_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_dynbuf_cl.f90' || echo '$(srcdir)/'`native_fortran/nf_dynbuf_cl.f90
-
-nf_dynbuf.o: $(top_srcdir)/examples/native_fortran/nf_dynbuf.f90 nf_types.mod nf_dynbuf_cl.mod fstarpu_mod.mod
-	$(AM_V_FC)$(FC) $(native_fortran_nf_dynbuf_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_dynbuf.f90' || echo '$(srcdir)/'`native_fortran/nf_dynbuf.f90
-
-nf_varbuf_cl.o: $(top_srcdir)/examples/native_fortran/nf_varbuf_cl.f90 nf_types.mod fstarpu_mod.mod
-	$(AM_V_FC)$(FC) $(native_fortran_nf_varbuf_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_varbuf_cl.f90' || echo '$(srcdir)/'`native_fortran/nf_varbuf_cl.f90
-
-nf_varbuf.o: $(top_srcdir)/examples/native_fortran/nf_varbuf.f90 nf_types.mod nf_varbuf_cl.mod fstarpu_mod.mod
-	$(AM_V_FC)$(FC) $(native_fortran_nf_varbuf_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_varbuf.f90' || echo '$(srcdir)/'`native_fortran/nf_varbuf.f90
-
-nf_sched_ctx_cl.o: $(top_srcdir)/examples/native_fortran/nf_sched_ctx_cl.f90 nf_types.mod fstarpu_mod.mod
-	$(AM_V_FC)$(FC) $(native_fortran_nf_sched_ctx_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_sched_ctx_cl.f90' || echo '$(srcdir)/'`native_fortran/nf_sched_ctx_cl.f90
-
-nf_sched_ctx.o: $(top_srcdir)/examples/native_fortran/nf_sched_ctx.f90 nf_types.mod nf_sched_ctx_cl.mod fstarpu_mod.mod
-	$(AM_V_FC)$(FC) $(native_fortran_nf_sched_ctx_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_sched_ctx.f90' || echo '$(srcdir)/'`native_fortran/nf_sched_ctx.f90
-
-nf_partition_cl.o: $(top_srcdir)/examples/native_fortran/nf_partition_cl.f90 nf_types.mod fstarpu_mod.mod
-	$(AM_V_FC)$(FC) $(native_fortran_nf_partition_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_partition_cl.f90' || echo '$(srcdir)/'`native_fortran/nf_partition_cl.f90
-
-nf_partition.o: $(top_srcdir)/examples/native_fortran/nf_partition.f90 nf_types.mod nf_partition_cl.mod fstarpu_mod.mod
-	$(AM_V_FC)$(FC) $(native_fortran_nf_partition_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_partition.f90' || echo '$(srcdir)/'`native_fortran/nf_partition.f90
-
 endif

+ 1 - 1
include/starpu_fpga.h

@@ -28,7 +28,7 @@ extern "C"
 {
 #endif
 
-int starpu_fpga_allocate_memory(int devid, fpga_mem *addr, size_t size);
+int starpu_fpga_allocate_memory(fpga_mem *addr, size_t size);
 #ifdef __cplusplus
 }
 #endif

+ 2 - 2
include/starpu_worker.h

@@ -45,7 +45,7 @@ enum starpu_node_kind
 	STARPU_UNUSED=0,
 	STARPU_CPU_RAM=1,
 	STARPU_CUDA_RAM=2,
-        STARPU_FPGA_RAM=9,
+        STARPU_FPGA_RAM=4,
 	STARPU_OPENCL_RAM=3,
 	STARPU_DISK_RAM=4,
 	STARPU_MIC_RAM=5,
@@ -63,9 +63,9 @@ enum starpu_worker_archtype
 {
 	STARPU_CPU_WORKER=0,        /**< CPU core */
 	STARPU_CUDA_WORKER=1,       /**< NVIDIA CUDA device */
-	STARPU_FPGA_WORKER=1,
 	STARPU_OPENCL_WORKER=2,     /**< OpenCL device */
 	STARPU_MIC_WORKER=3,        /**< Intel MIC device */
+	STARPU_FPGA_WORKER=4,       /**< FPGA device */
 	STARPU_MPI_MS_WORKER=5,     /**< MPI Slave device */
 	STARPU_ANY_WORKER=6         /**< any worker, used in the hypervisor */
 };

+ 2 - 1
src/core/task.c

@@ -458,7 +458,7 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 	}
 	if (some_impl && is_where_unset)
 	{
-		cl->where |= STARPU_FPGA;
+		where |= STARPU_FPGA;
 	}
 
 
@@ -552,6 +552,7 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 	{
 		where |= STARPU_MIC|STARPU_MPI_MS;
 	}
+
 	cl->where = where;
 }
 

+ 18 - 1
src/core/topology.c

@@ -2719,6 +2719,8 @@ static void _starpu_init_workers_binding_and_memory(struct _starpu_machine_confi
 
 #if defined(STARPU_USE_FPGA) || defined(STARPU_SIMGRID)
 		        case STARPU_FPGA_WORKER:
+			{
+				unsigned numa;
 #ifndef STARPU_SIMGRID
 				if (may_bind_automatically[STARPU_FPGA_WORKER])
 				{
@@ -2738,9 +2740,17 @@ static void _starpu_init_workers_binding_and_memory(struct _starpu_machine_confi
 				{
 					fpga_init[devid] = 1;
 					workerarg->bindid = fpga_bindid[devid] = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
-					memory_node = fpga_memory_nodes[devid] = _starpu_memory_node_register(STARPU_FPGA_RAM, devid, &_starpu_driver_cpu_node_ops); 
+
+#if 0
+// TODO: il faut activer ça pour que StarPU se mette à allouer de la mémoire FPGA
+					memory_node = fpga_memory_nodes[devid] = _starpu_memory_node_register(STARPU_FPGA_RAM, devid, &_starpu_driver_fpga_node_ops); 
 					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
 					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
+#else
+					// Ici, éventuellement pour l'instant
+					memory_node = STARPU_MAIN_RAM;
+#endif
+
 #ifdef STARPU_SIMGRID
 					snprintf(name, sizeof(name), "Fpga%d", devid);
 					host = _starpu_simgrid_get_host_by_name(name);
@@ -2749,7 +2759,14 @@ static void _starpu_init_workers_binding_and_memory(struct _starpu_machine_confi
 #endif /* SIMGRID */
 				}
 				_starpu_memory_node_add_nworkers(memory_node);
+
+				//This worker can manage transfers on NUMA nodes
+				for (numa = 0; numa < nb_numa_nodes; numa++)
+						_starpu_worker_drives_memory_node(workerarg, numa);
+
+				_starpu_worker_drives_memory_node(workerarg, memory_node);
 				break;
+			}
 #endif
 
 

+ 5 - 5
src/core/workers.c

@@ -274,11 +274,11 @@ static inline int _starpu_can_use_nth_implementation(enum starpu_worker_archtype
 		starpu_cuda_func_t func = _starpu_task_get_cuda_nth_implementation(cl, nimpl);
 		return func != NULL;
 	}
-       // case STARPU_FPGA_WORKER:
-	//{
-		//starpu_fpga_func_t func = _starpu_task_get_fpga_nth_implementation(cl, nimpl);
-		//return func != NULL;
-	//}
+        case STARPU_FPGA_WORKER:
+	{
+		starpu_fpga_func_t func = _starpu_task_get_fpga_nth_implementation(cl, nimpl);
+		return func != NULL;
+	}
 	case STARPU_OPENCL_WORKER:
 	{
 		starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, nimpl);

+ 4 - 4
src/datawizard/copy_driver.c

@@ -397,10 +397,10 @@ if (src_kind == STARPU_FPGA_RAM || dst_kind == STARPU_CPU_RAM)
        
 		if (async_data)
 		
-                        return _starpu_fpga_copy_fpga_to_ram_async((void*) (src + src_offset), src_node, (void*) (dst + dst_offset), dst_node, size);
+                        return _starpu_fpga_copy_fpga_to_ram_async((void*) (src + src_offset), (void*) (dst + dst_offset), size);
                       
 		else
-			{return _starpu_fpga_copy_fpga_to_ram((void*) (src + src_offset), src_node, (void*) (dst + dst_offset), dst_node, size); 
+			{return _starpu_fpga_copy_fpga_to_ram((void*) (src + src_offset), (void*) (dst + dst_offset), size); 
                           }  
       }             
                     
@@ -408,9 +408,9 @@ if (src_kind == STARPU_FPGA_RAM || dst_kind == STARPU_CPU_RAM)
         {
       
 		if (async_data)
-			{return _starpu_fpga_copy_ram_to_fpga_async((void*) (src + src_offset), src_node, (void*) (dst + dst_offset), dst_node,	size);}
+			{return _starpu_fpga_copy_ram_to_fpga_async((void*) (src + src_offset), (void*) (dst + dst_offset), size);}
 		else
-			{return _starpu_fpga_copy_ram_to_fpga((void*) (src + src_offset), src_node, (void*) (dst + dst_offset), dst_node, size);  }  
+			{return _starpu_fpga_copy_ram_to_fpga((void*) (src + src_offset), (void*) (dst + dst_offset), size);  }  
       }     
     
 #endif

+ 4 - 22
src/datawizard/malloc.c

@@ -586,26 +586,6 @@ static uintptr_t _starpu_malloc_on_node(unsigned dst_node, size_t size, int flag
 		starpu_memory_deallocate(dst_node, size);
 	}
 
-#if (defined(STARPU_USE_FPGA) || defined(STARPU_SIMGRID))
-	      
-         if (starpu_node_get_kind(dst_node) == STARPU_FPGA_RAM)  
-               {
-                                int ret;
-				fpga_mem ptr;
-
-				ret = starpu_fpga_allocate_memory(devid, &ptr, size);
-				if (ret)
-				{
-					addr = 0;
-				}
-				else
-				{
-					addr = (uintptr_t)ptr;
-				}
-				
-		}
-#endif
-
 	return addr;
 }
 
@@ -776,7 +756,8 @@ static int _starpu_malloc_should_suballoc(unsigned dst_node, size_t size, int fl
 		(starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM
 		 || (starpu_node_get_kind(dst_node) == STARPU_CPU_RAM
 		     && _starpu_malloc_should_pin(flags))
-		 );
+		 )
+	       || starpu_node_get_kind(dst_node) == STARPU_FPGA_RAM;
 }
 
 uintptr_t
@@ -950,7 +931,8 @@ starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int fl
 	{
 		/* This chunk is now empty, but avoid chunk free/alloc
 		 * ping-pong by keeping some of these.  */
-		if (nfreechunks[dst_node] >= CHUNKS_NFREE)
+		if (nfreechunks[dst_node] >= CHUNKS_NFREE && 
+                     starpu_node_get_kind(dst_node) != STARPU_FPGA_RAM)  
 		{
 			/* We already have free chunks, release this one */
 			_starpu_free_on_node_flags(dst_node, chunk->base, CHUNK_SIZE, flags);

+ 136 - 18
src/drivers/max/driver_fpga.c

@@ -29,22 +29,57 @@
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
-
+#include <MaxSLiCInterface.h>
+
+#define KNRM  "\x1B[0m"
+#define KRED  "\x1B[31m"
+#define KGRN  "\x1B[32m"
+#define KYEL  "\x1B[33m"
+#define KBLU  "\x1B[34m"
+#define KMAG  "\x1B[35m"
+#define KCYN  "\x1B[36m"
+#define KWHT  "\x1B[37m"
+
+#define FPGA_OK KGRN
+#define FPGA_ERROR KRED
+#define NORMAL KNRM
+#define FPGA_OK KGRN
 
 //#define STARPU_MAXFPGADEVS 4
 /* the number of FPGA devices */
 static unsigned  nfpgafpgas = -1; 
 static fpgaDeviceProp props[STARPU_MAXFPGADEVS];
-static size_t global_mem[STARPU_MAXFPGADEVS];
+/* Per-device memory size in bytes.  Only entry 0 gets the explicit 128 GiB
+ * value; the remaining entries are zero-initialized.  The constant is computed
+ * in unsigned long long because 128*1024*1024*1024 overflows a plain int. */
+static size_t global_mem[STARPU_MAXFPGADEVS] = { 128ULL*1024*1024*1024 };
+
+/* Print MSG on stdout, wrapped in the FPGA_OK and NORMAL colour escape
+ * sequences and followed by a newline. */
+void fpga_msg(char *msg)
+{
+	printf(FPGA_OK "%s\n" NORMAL, msg);
+}
 
 void _starpu_init_fpga()
 {
 	nfpgafpgas = starpu_get_env_number("STARPU_NUM_FPGA_FPGA"); /* device count requested through the environment */
 	if(nfpgafpgas == -1)
 		nfpgafpgas =1; /* fall back to a single device */
-	STARPU_ASSERT( nfpgafpgas <= STARPU_MAXFPGADEVS);	
+	STARPU_ASSERT( nfpgafpgas <= STARPU_MAXFPGADEVS);
+
+        //LMemInterface addLMemInterface()
+        //// to obtain access to the LMem
 }
 
+/* Allocate SIZE bytes with malloc() and store the resulting address in *PTR.
+ *
+ * Returns 1 when the allocation succeeded and 0 when malloc() returned NULL.
+ * Note that this is the opposite of the usual "0 means success" convention.
+ */
+int fpga_allocate_memory(fpga_mem *ptr, size_t size){
+	const char *msg1="You asked to allocate ";
+
+	/* SIZE is a size_t, so %zu is the matching conversion; the previous %d
+	 * was undefined behaviour. */
+	printf(FPGA_OK "%s%zu bytes\n" NORMAL, msg1, size);
+
+	*ptr =(fpga_mem) malloc(size);
+
+	if (*ptr == NULL)
+		return 0;
+	return 1;
+}
+
 void _starpu_fpga_discover_devices (struct _starpu_machine_config *config){
 	//TODO: This is statically assigned, in the next round of integration
 	// I will have to read from the struct fpga in fpga
@@ -58,6 +93,7 @@ unsigned _starpu_fpga_get_device_count(void){
 static void	_starpu_fpga_limit_global_mem(unsigned devid){
 	starpu_ssize_t limit=-1;
 
+//TODO
 	limit = starpu_get_env_number("STARPU_LIMIT_FPGA_MEM");
 	if(limit != -1)
 		global_mem[devid] = limit*1024*1024;
@@ -96,10 +132,13 @@ int _starpu_fpga_driver_init(struct _starpu_worker *worker){
 //fpga_msg("successful till here");
 	_starpu_driver_start(worker, _STARPU_FUT_CPU_KEY, 1);
 	/* FIXME: when we have NUMA support, properly turn node number into NUMA node number */
-	_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_fpga_get_global_mem_size(worker->devid));
+// TODO: drop test when we allocated a memory node for fpga
+	if (worker->memory_node != STARPU_MAIN_RAM)
+		_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_fpga_get_global_mem_size(worker->devid));
 
-	snprintf(worker->name, sizeof(worker->name), "CPU %d", devid);
-	snprintf(worker->short_name, sizeof(worker->short_name), "CPU %d", devid);
+	snprintf(worker->name, sizeof(worker->name), "FPGA %d", devid);
+	snprintf(worker->short_name, sizeof(worker->short_name), "FPGA %d", devid);
+	starpu_pthread_setname(worker->short_name);
 
 	_STARPU_TRACE_WORKER_INIT_END(worker->workerid);
 
@@ -169,9 +208,9 @@ static int execute_job_on_fpga(struct _starpu_job *j, struct starpu_task *worker
 		if (_starpu_get_disable_kernels() <= 0)
 		{
 			_STARPU_TRACE_START_EXECUTING();
-			int chnl = fpga_reserve_chanel_of_kernel_type(kernel_type);
-			_starpu_fpga_transfer_data(_STARPU_TASK_GET_INTERFACES(task), j, chnl);
-			fpga_release_chanel(chnl);
+			//int chnl = fpga_reserve_chanel_of_kernel_type(kernel_type);
+			//_starpu_fpga_transfer_data(_STARPU_TASK_GET_INTERFACES(task), j, chnl);
+			//fpga_release_chanel(chnl);
 			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 			_STARPU_TRACE_END_EXECUTING();
 		}
@@ -353,40 +392,85 @@ void *_starpu_fpga_worker(void *_arg){
 	return NULL;
 }
 
-int starpu_fpga_allocate_memory(int devid STARPU_ATTRIBUTE_UNUSED, fpga_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size) 
+int _starpu_fpga_allocate_memory(int devid, fpga_mem *addr, size_t size) 
 {
-	int err = fpga_allocate_memory(mem, size);
-printf("fpga mem returned from allocation @: %p\n",*mem);
+	static fpga_mem current_address = 0;
+// TODO: check whether current_address + size exceeds the size of the LMem
+ 	*addr = current_address;
+	current_address += size;
+printf("fpga mem returned from allocation @: %p\n",*addr);
 //success = 0
-        return !err;
+        return 0;
+}
+
+
+/* Driver-ops hook: initialise the FPGA driver for WORKER.
+ * _starpu_fpga_driver_init() takes a struct _starpu_worker * and reads
+ * worker->devid / worker->memory_node, so WORKER itself is forwarded
+ * (the previous code passed worker->set). */
+int _starpu_fpga_driver_init_from_worker(struct _starpu_worker *worker)
+{
+	return _starpu_fpga_driver_init(worker);
+}
+
+/* Driver-ops hook: run the FPGA driver loop for WORKER.
+ * _starpu_run_fpga() is declared as taking a struct _starpu_worker *, so
+ * WORKER itself is forwarded (the previous code passed worker->set). */
+int _starpu_fpga_run_from_worker(struct _starpu_worker *worker)
+{
+	return _starpu_run_fpga(worker);
+}
+
+/* Driver-ops hook: run one iteration of the FPGA driver for WORKER.
+ * NOTE(review): _starpu_fpga_driver_run_once() is assumed to take a
+ * struct _starpu_worker * like the init/run/deinit entry points, so WORKER
+ * itself is forwarded instead of worker->set -- confirm against its prototype. */
+int _starpu_fpga_driver_run_once_from_worker(struct _starpu_worker *worker)
+{
+	return _starpu_fpga_driver_run_once(worker);
+}
 
-int _starpu_fpga_copy_ram_to_fpga(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size)
+/* Driver-ops hook: shut the FPGA driver down for WORKER.
+ * _starpu_fpga_driver_deinit() is declared as taking a struct _starpu_worker *,
+ * so WORKER itself is forwarded (the previous code passed worker->set). */
+int _starpu_fpga_driver_deinit_from_worker(struct _starpu_worker *worker)
+{
+	return _starpu_fpga_driver_deinit(worker);
+}
+
+int _starpu_fpga_copy_ram_to_fpga(void *src, void *dst, size_t size)
 {
 printf("ram to fpga, fpga @= %p\n",dst);
 	memcpy(dst,src,size); /* plain memcpy for now; the LMem write below is still commented out */
 	return 0;
+  // LMemLoopback_writeLMem(dst, size, src);
+}
+
+/* Placeholder for a RAM -> FPGA transfer from SRC to the integer FPGA address
+ * DST.  It currently only logs DST; the LMemLoopback_writeLMem() call is
+ * still commented out, so SRC and SIZE are unused.
+ */
+void copy_ram_to_fpga(int32_t *src, int32_t dst, size_t size)
+{
+	/* DST is an integer, not a pointer: passing it to %p was undefined
+	 * behaviour, so print it as a hexadecimal integer instead. */
+	printf("ram to fpga, fpga @= 0x%lx\n", (unsigned long)(uint32_t)dst);
+
+	(void)src;
+	(void)size;
+	// LMemLoopback_writeLMem(size, dst, src);
 }
 
+/* Placeholder for an FPGA -> RAM transfer.  It currently only logs the
+ * integer argument DST; the LMemLoopback_readLMem() call is still commented
+ * out, so SRC and SIZE are unused.
+ */
+void copy_fpga_to_ram(int32_t *src, int32_t dst, size_t size)
+{
+	/* The message used to say "ram to fpga" (copy-paste) and passed an
+	 * int32_t to %p, which is undefined behaviour. */
+	printf("fpga to ram, fpga @= 0x%lx\n", (unsigned long)(uint32_t)dst);
+
+	(void)src;
+	(void)size;
+	//LMemLoopback_readLMem(size, src, dst);
+}
 /* Transfer SIZE bytes from the address pointed to by SRC to the address
  * pointed to by DST.  Currently a plain memcpy; the LMem read is commented out.
  */
-int _starpu_fpga_copy_fpga_to_ram(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size)
+int _starpu_fpga_copy_fpga_to_ram(void *src, void *dst, size_t size)
 {
 printf("fpga to ram, fpga @= %p\n",src);
 	memcpy(dst,src,size);
 	return 0;
+ //LMemLoopback_readLMem(src, size, dst);
 }
 
 /* Asynchronous transfers */
-int _starpu_fpga_copy_ram_to_fpga_async(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size)
+int _starpu_fpga_copy_ram_to_fpga_async(void *src, void *dst, size_t size)
 {
 printf("ram to fpga, fpga @= %p\n",dst);
 	memcpy(dst,src,size);
 	return 0;
+
+ // TODO: find an asynchronous version of LMemLoopback_writeLMem() in the documentation
 }
 
-int _starpu_fpga_copy_fpga_to_ram_async(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size)
+int _starpu_fpga_copy_fpga_to_ram_async(void *src, void *dst, size_t size)
 {
 printf("fpga to ram, fpga @= %p\n",src);
 	memcpy(dst,src,size);
@@ -419,7 +503,7 @@ void _starpu_fpga_transfer_data(void *buffers[], struct _starpu_job *j, int chnl
 			{
 				void *ptr = STARPU_VARIABLE_GET_PTR(buffers[index]);
 				size_t size = STARPU_VARIABLE_GET_ELEMSIZE(buffers[index]);
-				fpga_data_send(chnl,ptr,size);
+				//fpga_data_send(chnl,ptr,size);
 				printf("Driver Fpga @: %p, size %d \n",ptr,size);
 				break;
 			}
@@ -439,3 +523,37 @@ void _starpu_fpga_transfer_data(void *buffers[], struct _starpu_job *j, int chnl
 
 
 
+/* Driver operations table for the FPGA driver: every hook is bound to the
+ * corresponding *_from_worker wrapper defined in this file. */
+struct _starpu_driver_ops _starpu_driver_fpga_ops =
+{
+	.init     = _starpu_fpga_driver_init_from_worker,
+	.run      = _starpu_fpga_run_from_worker,
+	.run_once = _starpu_fpga_driver_run_once_from_worker,
+	.deinit   = _starpu_fpga_driver_deinit_from_worker,
+};
+
+// TODO: node_ops structure, as in driver_cuda.c, with starpu_fpga_allocate_memory, _starpu_fpga_copy_ram_to_fpga, etc.
+struct _starpu_node_ops _starpu_driver_fpga_node_ops =
+{
+	.copy_data_to[STARPU_UNUSED] = NULL,
+	.copy_data_to[STARPU_CPU_RAM] = _starpu_fpga_copy_fpga_to_ram,
+	.copy_data_to[STARPU_FPGA_RAM] = _starpu_fpga_copy_ram_to_fpga, /* NOTE(review): this change sets STARPU_FPGA_RAM and STARPU_DISK_RAM both to 4; if so, the STARPU_DISK_RAM entry below overrides this one -- verify */
+	.copy_data_to[STARPU_OPENCL_RAM] = NULL,
+	.copy_data_to[STARPU_DISK_RAM] = NULL,
+	.copy_data_to[STARPU_MIC_RAM] = NULL,
+	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
+
+	.copy_interface_to[STARPU_UNUSED] = NULL,
+	.copy_interface_to[STARPU_CPU_RAM] = NULL,
+	.copy_interface_to[STARPU_FPGA_RAM] = NULL,
+	.copy_interface_to[STARPU_OPENCL_RAM] = NULL,
+	.copy_interface_to[STARPU_DISK_RAM] = NULL,
+	.copy_interface_to[STARPU_MIC_RAM] = NULL,
+	.copy_interface_to[STARPU_MPI_MS_RAM] = NULL,
+        
+        .wait_request_completion = NULL,
+	.test_request_completion = NULL,
+	.is_direct_access_supported = NULL,
+	.malloc_on_node = _starpu_fpga_allocate_memory, /* NOTE(review): confirm this function's signature matches the malloc_on_node hook */
+	.free_on_node = NULL,
+	.name = "fpga driver"
+};

+ 11 - 8
src/drivers/max/driver_fpga.h

@@ -29,16 +29,18 @@
 #include <core/task.h>
 #include <datawizard/datawizard.h>
 #include <core/perfmodel/perfmodel.h>
-
 #include <common/fxt.h>
 
+extern struct _starpu_driver_ops _starpu_driver_fpga_ops;
+extern struct _starpu_node_ops _starpu_driver_fpga_node_ops;
+
 void _starpu_init_fpga(void);
 void _starpu_fpga_discover_devices (struct _starpu_machine_config *config);
 unsigned _starpu_fpga_get_device_count(void);
 
-static void	_starpu_fpga_limit_global_mem(unsigned );
+static void _starpu_fpga_limit_global_mem(unsigned );
 static size_t _starpu_fpga_get_global_mem_size(unsigned devid);
-
+int _starpu_fpga_allocate_memory(int devid, fpga_mem *addr, size_t size);
 void *_starpu_fpga_worker(void *);
 struct _starpu_worker;
 int _starpu_run_fpga(struct _starpu_worker *);
@@ -48,10 +50,11 @@ int _starpu_fpga_driver_deinit(struct _starpu_worker *);
 
 void _starpu_fpga_transfer_data(void *buffers[], struct _starpu_job *j, int );
 
-int _starpu_fpga_copy_ram_to_fpga(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size);
-int _starpu_fpga_copy_fpga_to_ram(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size);
-int _starpu_fpga_copy_ram_to_fpga_async(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size);
-int _starpu_fpga_copy_fpga_to_ram_async(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size);
-
+int _starpu_fpga_copy_fpga_to_ram(void *src, void *dst, size_t size);
+int _starpu_fpga_copy_ram_to_fpga(void *src, void *dst, size_t size);
+void copy_ram_to_fpga(int32_t *src, int32_t dst, size_t size);
+void copy_fpga_to_ram(int32_t *src, int32_t dst, size_t size); /* was a duplicate copy_ram_to_fpga declaration */
+int _starpu_fpga_copy_ram_to_fpga_async(void *src, void *dst, size_t size);
+int _starpu_fpga_copy_fpga_to_ram_async(void *src, void *dst, size_t size);
 #endif //  __DRIVER_FPGA_H__
 

+ 4 - 16
tests/Makefile.am

@@ -348,10 +348,8 @@ myPROGRAMS +=				\
 	perfmodels/regression_based		\
 	perfmodels/regression_based_01		\
 	perfmodels/regression_based_02		\
-	perfmodels/regression_based_03		\
-	perfmodels/regression_based_04		\
-	perfmodels/max_fpga              	\
-	perfmodels/max_riffa            	\
+	perfmodels/regression_based_03		\
+	perfmodels/max_fpga              	\
 	perfmodels/non_linear_regression_based	\
 	perfmodels/feed				\
 	perfmodels/user_base			\
@@ -966,27 +964,17 @@ perfmodels_regression_based_02_SOURCES=\
 perfmodels_regression_based_03_SOURCES=\
 	perfmodels/regression_based_03.c
 
-perfmodels_regression_based_04_SOURCES=\
-	perfmodels/regression_based_04.c
-
 perfmodels_max_fpga_SOURCES=\
 	perfmodels/max_fpga.c
-
-perfmodels_max_riffa_SOURCES=\
-	perfmodels/max_riffa.c
+perfmodels_max_fpga_LDADD = $(LDADD) \
+	$(srcdir)/perfmodels/slic_StreamFMA.o
 
 if STARPU_USE_OPENCL
 perfmodels_regression_based_SOURCES+=\
 	perfmodels/opencl_memset.c
 
-perfmodels_regression_based_04_SOURCES+=\
-	perfmodels/opencl_memset_01.c
-
 nobase_STARPU_OPENCL_DATA_DATA += \
 	perfmodels/opencl_memset_kernel.cl
-
-nobase_STARPU_OPENCL_DATA_DATA += \
-	perfmodels/opencl_memset_kernel_01.cl
 endif
 
 perfmodels_non_linear_regression_based_SOURCES=\

+ 59 - 0
tests/perfmodels/LMemLoopbackCpuCode.c

@@ -0,0 +1,59 @@
+/**
+ * Document: MaxCompiler Tutorial (maxcompiler-tutorial.pdf)
+ * Chapter: 13      Example: 2      Name: LMem Loopback
+ * MaxFile name: LMemLoopback
+ * Summary:
+ *        Adds two LMem input streams and writes the result to LMem.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "LMemLoopback.h"
+#include <MaxSLiCInterface.h>
+
+/**
+ * Compare each of the first `size` elements of outData with inA[i] + inB[i].
+ * Every mismatch is reported on stderr.
+ *
+ * Returns 0 when all elements match and 1 when at least one differs.
+ */
+int check(int size, int32_t *outData, int32_t *inA, int32_t *inB)
+{
+	int status = 0;
+	for (int i = 0; i < size; i++) {
+		int32_t expected = inA[i] + inB[i];
+		if (outData[i] != expected) {
+			/* The values are signed 32-bit integers, so print them
+			 * with %d; %u misreported negative values. */
+			fprintf(stderr, "[%d] Verification error, out: %d != expected: %d\n",
+				i, (int)outData[i], (int)expected);
+			status = 1;
+		}
+	}
+	return status;
+}
+
+/**
+ * Fill two host buffers, write them to the DFE LMem at element offsets 0 and
+ * `size`, run the LMemLoopback action, read `size` elements back from offset
+ * 2 * size and verify them with check().
+ *
+ * Returns 0 when verification passes, non-zero on a mismatch or when a host
+ * buffer cannot be allocated.
+ */
+int main()
+{
+	const int size = 384;
+	int sizeBytes = size * sizeof(int32_t);
+	int32_t *inA = (int32_t*) malloc(sizeBytes);
+	int32_t *inB = (int32_t*) malloc(sizeBytes);
+	int32_t *outData = (int32_t*) malloc(sizeBytes);
+
+	/* Do not touch the DFE if any host buffer is missing. */
+	if (inA == NULL || inB == NULL || outData == NULL) {
+		fprintf(stderr, "Out of memory.\n");
+		free(outData);
+		free(inB);
+		free(inA);
+		return 1;
+	}
+
+	for (int i = 0; i < size; i++) {
+		inA[i] = i;
+		inB[i] = size - i;
+	}
+
+	printf("Loading DFE memory.\n");
+	LMemLoopback_writeLMem(size, 0, inA);
+	LMemLoopback_writeLMem(size, size, inB);
+
+	printf("Running DFE.\n");
+	LMemLoopback(size);
+
+	printf("Reading DFE memory.\n");
+	LMemLoopback_readLMem(size, 2 * size, outData);
+
+	int status = check(size, outData, inA, inB);
+	if (status)
+		printf("Test failed.\n");
+	else
+		printf("Test passed OK!\n");
+
+	/* Release the host buffers; the original leaked all three. */
+	free(outData);
+	free(inB);
+	free(inA);
+
+	return status;
+}

+ 105 - 0
tests/perfmodels/LMemLoopbackMAX5CManager.maxj

@@ -0,0 +1,105 @@
+/**
+ * Document: MaxCompiler Tutorial (maxcompiler-tutorial.pdf)
+ * Chapter: 13      Example: 2      Name: LMem Loopback
+ * MaxFile name: LMemLoopback
+ * Summary:
+ *       Connects the Kernel's input / output streams to LMem. Also creates one
+ *  input and one output stream for accessing the LMem directly from the CPU
+ *  software.
+ */
+
+package perfmodels;
+
+import com.maxeler.maxcompiler.v2.build.EngineParameters;
+import com.maxeler.maxcompiler.v2.kernelcompiler.Kernel;
+import com.maxeler.maxcompiler.v2.managers.custom.DFELink;
+import com.maxeler.maxcompiler.v2.managers.custom.blocks.KernelBlock;
+import com.maxeler.maxcompiler.v2.managers.custom.stdlib.LMemCommandGroup;
+import com.maxeler.maxcompiler.v2.managers.custom.stdlib.LMemInterface;
+import com.maxeler.maxcompiler.v2.managers.engine_interfaces.CPUTypes;
+import com.maxeler.maxcompiler.v2.managers.engine_interfaces.EngineInterface;
+import com.maxeler.maxcompiler.v2.managers.engine_interfaces.EngineInterface.Direction;
+import com.maxeler.maxcompiler.v2.managers.engine_interfaces.InterfaceParam;
+import com.maxeler.platform.max5.manager.MAX5CManager;
+
+/**
+ * Manager that wires LMemLoopbackKernel to LMem and exposes three SLiC
+ * interfaces: the default one (runs the kernel), "writeLMem" and "readLMem".
+ */
+public class LMemLoopbackMAX5CManager extends MAX5CManager {
+
+	/** CPU-side element type used by every interface parameter and stream. */
+	private static final CPUTypes TYPE = CPUTypes.INT32;
+
+	/**
+	 * Builds the design: kernel block, CPU-to-LMem and LMem-to-CPU paths,
+	 * kernel inputs and output attached to LMem, then the SLiC interfaces.
+	 */
+	public LMemLoopbackMAX5CManager(EngineParameters params) {
+		super(params);
+
+		Kernel loopbackKernel = new LMemLoopbackKernel(makeKernelParameters("LMemLoopbackKernel"));
+		KernelBlock block = addKernel(loopbackKernel);
+
+		// Direct CPU <-> LMem access path.
+		LMemInterface lmem = addLMemInterface();
+		DFELink cpuToLMem = lmem.addStreamToLMem("cpu2lmem", LMemCommandGroup.MemoryAccessPattern.LINEAR_1D);
+		DFELink lmemToCpu = lmem.addStreamFromLMem("lmem2cpu", LMemCommandGroup.MemoryAccessPattern.LINEAR_1D);
+
+		DFELink fromCpu = addStreamFromCPU("fromcpu");
+		DFELink toCpu = addStreamToCPU("tocpu");
+
+		cpuToLMem <== fromCpu;
+		toCpu <== lmemToCpu;
+
+		// Kernel inputs are read from LMem.
+		DFELink streamA = lmem.addStreamFromLMem("inA", LMemCommandGroup.MemoryAccessPattern.LINEAR_1D);
+		DFELink streamB = lmem.addStreamFromLMem("inB", LMemCommandGroup.MemoryAccessPattern.LINEAR_1D);
+
+		block.getInput("inA") <== streamA;
+		block.getInput("inB") <== streamB;
+
+		// Kernel output is written back to LMem.
+		DFELink streamOut = lmem.addStreamToLMem("oData", LMemCommandGroup.MemoryAccessPattern.LINEAR_1D);
+		streamOut <== block.getOutput("oData");
+
+		createSlicInterface(interfaceDefault());
+		createSlicInterface(interfaceWrite("writeLMem"));
+		createSlicInterface(interfaceRead("readLMem"));
+	}
+
+	/** Entry point: builds the manager from the command-line engine parameters. */
+	public static void main(String[] args) {
+		EngineParameters engineParams = new EngineParameters(args);
+		LMemLoopbackMAX5CManager mgr = new LMemLoopbackMAX5CManager(engineParams);
+		mgr.build();
+	}
+
+	/**
+	 * Default interface: runs the kernel for N ticks with inA, inB and oData
+	 * laid out as three consecutive N * TYPE.sizeInBytes() regions from address 0.
+	 */
+	private static EngineInterface interfaceDefault() {
+		EngineInterface engineIface = new EngineInterface();
+
+		InterfaceParam N    = engineIface.addParam("N", TYPE);
+		InterfaceParam zero = engineIface.addConstant(0L);
+		engineIface.setTicks("LMemLoopbackKernel", N);
+		InterfaceParam regionBytes = N * TYPE.sizeInBytes();
+
+		engineIface.setLMemLinear("inA", zero, regionBytes);
+		engineIface.setLMemLinear("inB", regionBytes, regionBytes);
+		engineIface.setLMemLinear("oData", 2 * regionBytes, regionBytes);
+		engineIface.ignoreAll(Direction.IN_OUT);
+		return engineIface;
+	}
+
+	/**
+	 * Write interface: streams `size` elements from the CPU ("fromcpu") into
+	 * LMem starting at element offset `start`.
+	 */
+	private static EngineInterface interfaceWrite(String name) {
+		EngineInterface engineIface = new EngineInterface(name);
+
+		InterfaceParam size  = engineIface.addParam("size", TYPE);
+		InterfaceParam start = engineIface.addParam("start", TYPE);
+		InterfaceParam byteCount = size * TYPE.sizeInBytes();
+
+		engineIface.setStream("fromcpu", TYPE, byteCount );
+		engineIface.setLMemLinear("cpu2lmem", start * TYPE.sizeInBytes(), byteCount);
+		engineIface.ignoreAll(Direction.IN_OUT);
+		return engineIface;
+	}
+
+	/**
+	 * Read interface: streams `size` elements from LMem, starting at element
+	 * offset `start`, back to the CPU ("tocpu").
+	 */
+	private static EngineInterface interfaceRead(String name) {
+		EngineInterface engineIface = new EngineInterface(name);
+
+		InterfaceParam size  = engineIface.addParam("size", TYPE);
+		InterfaceParam start = engineIface.addParam("start", TYPE);
+		InterfaceParam byteCount = size * TYPE.sizeInBytes();
+
+		engineIface.setLMemLinear("lmem2cpu", start * TYPE.sizeInBytes(), byteCount);
+		engineIface.setStream("tocpu", TYPE, byteCount);
+		engineIface.ignoreAll(Direction.IN_OUT);
+		return engineIface;
+	}
+
+}

文件差異過大導致無法顯示
+ 1478 - 1478
tests/perfmodels/StreamFMA.max


+ 26 - 0
tests/perfmodels/StreamFMAKernel.maxj

@@ -0,0 +1,26 @@
+package perfmodels;
+
+import com.maxeler.maxcompiler.v2.kernelcompiler.Kernel;
+import com.maxeler.maxcompiler.v2.kernelcompiler.KernelParameters;
+import com.maxeler.maxcompiler.v2.kernelcompiler.types.base.DFEType;
+import com.maxeler.maxcompiler.v2.kernelcompiler.types.base.DFEVar;
+
+/**
+ * Kernel that reads the streams "a" and "b" and writes their element-wise sum
+ * to the stream "output". Despite the FMA name, only an addition is performed.
+ */
+class StreamFMAKernel extends Kernel {
+
+	/** Type shared by both input streams and the output stream. */
+	private static final DFEType ELEMENT_TYPE = dfeInt(32);
+
+	protected StreamFMAKernel(KernelParameters parameters) {
+		super(parameters);
+
+		DFEVar lhs = io.input("a", ELEMENT_TYPE);
+		DFEVar rhs = io.input("b", ELEMENT_TYPE);
+
+		io.output("output", lhs + rhs, ELEMENT_TYPE);
+	}
+
+}
+
+

+ 24 - 0
tests/perfmodels/StreamFMAManager.maxj

@@ -0,0 +1,24 @@
+package perfmodels;
+
+import com.maxeler.maxcompiler.v2.build.EngineParameters;
+import com.maxeler.maxcompiler.v2.managers.custom.blocks.KernelBlock;
+import com.maxeler.platform.max5.manager.Max5LimaManager;
+
+/**
+ * Manager that instantiates StreamFMAKernel and connects its "a" and "b"
+ * inputs to CPU input streams and its "output" to a CPU output stream.
+ */
+class StreamFMAManager extends Max5LimaManager {
+
+	/** Name under which the kernel is registered in the design. */
+	private static final String KERNEL_NAME = "StreamFMAKernel";
+
+	public StreamFMAManager(EngineParameters engineParameters) {
+		super(engineParameters);
+
+		KernelBlock fmaBlock = addKernel(new StreamFMAKernel(makeKernelParameters(KERNEL_NAME)));
+
+		// Host -> kernel
+		fmaBlock.getInput("a") <== addStreamFromCPU("a");
+		fmaBlock.getInput("b") <== addStreamFromCPU("b");
+
+		// Kernel -> host
+		addStreamToCPU("output") <== fmaBlock.getOutput("output");
+	}
+
+	/** Entry point: builds the manager from the command-line engine parameters. */
+	public static void main(String[] args) {
+		new StreamFMAManager(new EngineParameters(args)).build();
+	}
+}
+

+ 89 - 45
tests/perfmodels/max_fpga.c

@@ -5,52 +5,60 @@
 #include <starpu_scheduler.h>
 #include "../helper.h"
 
-//#include "StreamFMA.h"
+#include "StreamFMA.max"
 #include "MaxSLiCInterface.h"
+#include "StreamFMA.h"
 
+#define SIZE 128
+
+static max_engine_t *engine ;
+static max_actions_t*act;
+static max_file_t *maxfile;
 
 void cpu_func(void *buffers[], void *cl_arg)
 {
+    int *a = (int*) STARPU_VECTOR_GET_PTR(buffers[0]); /* CPU implementation of the codelet: element-wise sum; a = vector in buffers[0] */
+    int *b = (int*) STARPU_VECTOR_GET_PTR(buffers[1]); /* b = vector in buffers[1] */
+    int *c = (int*) STARPU_VECTOR_GET_PTR(buffers[2]); /* c = vector in buffers[2], receives the result */
+
+    int size = STARPU_VECTOR_GET_NX(buffers[0]); /* element count is taken from the first vector only */
+
     (void)buffers;
     (void)cl_arg;
 
-    printf("///******Hello world**********//////\n");
+    int i;
+    for (i = 0; i < size; i++)
+	c[i] = a[i] + b[i]; /* nothing here checks that buffers[1] and buffers[2] hold at least `size` elements */
 }
 
 void fpga_mult(void *buffers[], void *cl_arg)
 {   
-    (void)buffers;
     (void)cl_arg;
+    
+    int *a = (int*) STARPU_VECTOR_GET_PTR(buffers[0]);
+    int *b = (int*) STARPU_VECTOR_GET_PTR(buffers[1]);
+    int *c = (int*) STARPU_VECTOR_GET_PTR(buffers[2]);
 
-    const int size = 384;
-    int sizeBytes = size * sizeof(int32_t);
-    int32_t *a = (int32_t*) malloc(sizeBytes);
-    int32_t *b = (int32_t*) malloc(sizeBytes);
-    int32_t *c = (int32_t*) malloc(sizeBytes);
-
-    // TODO Generate input data
-    for(int i = 0; i < size; ++i)
-    {
-        a[i] = random() % 100;
-        b[i] = random() % 100;
-    }
-
-    //Implementation of a maxfile
-    //max_file_t *maxfile = StreamFMA_init();
-
-    //Implementation of an engine
-    //max_engine_t *engine = max_load(maxfile, "*");
+    int size = STARPU_VECTOR_GET_NX(buffers[0]);
 
     //Actions to run on an engine
-    //max_actions_t* act = max_actions_init(maxfile, NULL);
+     act = max_actions_init(maxfile, NULL);
 
     //set the number of ticks for a kernel
-    //max_set_ticks  (act, "StreamFMAKernel", size);
+    max_set_ticks  (act, "StreamFMAKernel", size);
 
+#if 1
+// will be replaced by the transfer performed by copy_fpga_to_ram/ram_to_fpga
     //add data to an input stream
-    //max_queue_input(act, "a", a, sizeBytes);
-    //max_queue_input(act, "b", b, sizeBytes);
-   //max_queue_output(act,"output", c, sizeBytes);
+    //max_set_param_uint64t(act, "address", 0);
+    //max_set_param_uint64t(act, "nbytes", size *sizeof(a[0]));
+    max_queue_input(act, "a", a, size *sizeof(a[0]));
+    
+    max_queue_input(act, "b", b, size*sizeof(b[0]));
+    max_queue_output(act,"output", c, size*sizeof(c[0]));
+
+// and instead, here we fetch the pointer into FPGA memory with STARPU_VECTOR_GET_PTR(descr[0]), and that is what we pass to the fpga implementation
+#endif
 
     //run actions on the engine
     printf("Running on DFE using dynamic interface ...\n");
@@ -58,29 +66,24 @@ void fpga_mult(void *buffers[], void *cl_arg)
     printf("**** Run actions in non blocking mode **** \n");
 
     //run actions in non_blocking mode
-    //max_run_t *run0= max_run_nonblock(engine, act);
+    max_run_t *run0= max_run_nonblock(engine, act);
 
     printf("*** wait for the actions on DFE to complete *** \n");
     //wait for the actions to complete
-    //max_wait(run0);
-
-    //deallocate the set of actions
-    //max_actions_free(act);
-
-    //unload and deallocate an engine obtained by way of max_load
-    //max_unload(engine);
-}
+    max_wait(run0);
+    
+  }
 
 static struct starpu_codelet cl =
 {
     .cpu_funcs = {cpu_func},
     .cpu_funcs_name = {"cpu_func"},
-#ifdef STARPU_USE_FPGA
+//#ifdef STARPU_USE_FPGA
     .fpga_funcs = {fpga_mult},
     .fpga_funcs_name={"fpga_mult"},
-#endif
-    .nbuffers = 1,
-    .modes = {STARPU_W}
+//#endif
+    .nbuffers = 3,
+    .modes = {STARPU_R, STARPU_R, STARPU_W}
 };
 
 
@@ -91,7 +94,7 @@ int main(int argc, char **argv)
     starpu_profiling_status_set(1);
 
     struct starpu_conf conf;
-    starpu_data_handle_t handle;
+    starpu_data_handle_t handle_a, handle_b, handle_c;
     int ret;
     int size=1234;
 
@@ -104,20 +107,62 @@ int main(int argc, char **argv)
     if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
     STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-    starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
+    //Implementation of a maxfile
+     maxfile = StreamFMA_init();
+
+    //Implementation of an engine
+    engine = max_load(maxfile, "*");
+
+    int a[SIZE];
+    int b[SIZE];
+    int c[SIZE];
+
+    int i;
+    for(i = 0; i < SIZE; ++i)
+    {
+        a[i] = random() % 100;
+        b[i] = random() % 100;
+    }
+
+    starpu_vector_data_register(&handle_a, STARPU_MAIN_RAM, (uintptr_t) &a, SIZE, sizeof(int));
+    starpu_vector_data_register(&handle_b, STARPU_MAIN_RAM, (uintptr_t) &b, SIZE, sizeof(int));
+    starpu_vector_data_register(&handle_c, STARPU_MAIN_RAM, (uintptr_t) &c, SIZE, sizeof(int));
+
     struct starpu_task *task = starpu_task_create();
     task->cl = &cl;
-    task->handles[0] = handle;
+    task->handles[0] = handle_a;
+    task->handles[1] = handle_b;
+    task->handles[2] = handle_c;
     
     task->synchronous = 1;
     task->destroy = 0;
     /* submit the task to StarPU */
 
     //starpu_task_destroy(task);
-    starpu_task_submit(task);
+    ret = starpu_task_submit(task);
+    
+    fprintf(stderr,"task submitted %d\n", ret);
 
-    starpu_data_unregister(handle);
+    starpu_data_unregister(handle_a);
+    starpu_data_unregister(handle_b);
+    starpu_data_unregister(handle_c);
   
+	int mysize = SIZE;
+	if (mysize > 10)
+		mysize = 10;
+	for (i = 0; i < mysize; ++i) 
+	{
+		printf("%d == %d\n", c[i], a[i] + b[i]);
+	}
+
+#if 1
+// -> main
+    //deallocate the set of actions
+    max_actions_free(act);
+
+    //unload and deallocate an engine obtained by way of max_load
+    max_unload(engine);
+#endif
 
     starpu_shutdown();
 
@@ -126,4 +171,3 @@ int main(int argc, char **argv)
 }
 
 
-

+ 0 - 131
tests/perfmodels/max_riffa.c

@@ -1,131 +0,0 @@
-#include <starpu.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <fpga.h>
-#include <starpu_scheduler.h>
-#include "../helper.h"
-
-
-void cpu_func(void *buffers[], void *cl_arg)
-{
-    (void)buffers;
-    (void)cl_arg;
-
-    printf("Hello world\n");
-}
-
-void fpga_mult(void *d[])
-{
-    /* Ask Fpga for a channel, or
-    * equivalently for a hardware task
-    */
-    int chnl = fpga_reserve_a_chanel();
-
-    /* Get inputs from STARPU */
-    int* subA = STARPU_MATRIX_GET_PTR(d[0]);
-    int* subB = STARPU_MATRIX_GET_PTR(d[1]);
-    int* subC = STARPU_MATRIX_GET_PTR(d[2]);
-
-    /* Get info on which part of the
-    * inputs the task must operate
-    */
-    uint32_t nyA= STARPU_MATRIX_GET_NY(d[0]);
-    uint32_t ldA= STARPU_MATRIX_GET_LD(d[0]);
-
-    uint32_t nyB= STARPU_MATRIX_GET_NY(d[1]);
-    uint32_t ldB= STARPU_MATRIX_GET_LD(d[1]);
-
-    uint32_t nyC= STARPU_MATRIX_GET_NY(d[2]);
-    uint32_t ldC= STARPU_MATRIX_GET_LD(d[2]);
-    uint32_t nxC= STARPU_MATRIX_GET_NX(d[2]);
-
-
-    /* Send A and B */
-    int buf_s[nyA], buf_r[nxC*nyC];
-
-    fpga_trans sent, recv;
-
-    for (uint32_t j = 0; j < nxC; j++)
-    {
-        for (uint32_t k = 0; k < nyA; k++)
-
-            buf_s[k] = subA[j+k*ldA];
-        fpga_data_send(chnl, buf_s, nyA);
-    }
-    for (uint32_t i = 0; i < nyC; i++)
-    {
-        for (uint32_t k = 0; k < nyA; k++)
-            buf_s[k] = subB[k+i*ldB];
-        fpga_data_send(chnl, buf_s, nyA);
-    }
-
-    /* Receive C. This is blocking */
-    fpga_data_recv(chnl, buf_r, nxC*nyC);
-    for (uint32_t i = 0; i < nxC; i++)
-    {
-        for (uint32_t j = 0; j < nyC; j++)
-            subC[j + i*ldC] = buf_r[i*nyC+j];
-    }
-    fpga_release_chanel(chnl);
-}
-
-static struct starpu_codelet cl =
-{
-    .cpu_funcs = {cpu_func},
-    .cpu_funcs_name = {"cpu_func"},
-    //.fpga_funcs = {fpga_mult},
-    .fpga_funcs = {fpga_mult},
-    .fpga_funcs_name={"fpga_mult"},
-    .nbuffers = 3,
-    .modes = {STARPU_R, STARPU_R, STARPU_W}
-};
-
-
-int main(int argc, char **argv)
-{
-
-    starpu_profiling_status_set(1);
-
-    struct starpu_conf conf;
-    starpu_data_handle_t A_handle, B_handle, C_handle;
-    int ret;
-
-    starpu_conf_init(&conf);
-
-    ret = starpu_initialize(&conf, &argc, &argv);
-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-
-    /* initialize StarPU */
-    starpu_init(NULL);
-
-    for (uint32_t x = 0; x < 9; x++)
-    {
-        for (uint32_t y = 0; y < 9; y++)
-        {
-            struct starpu_task *task = starpu_task_create();
-            task->cl = &cl; /* Pointer to the codelet defined above */
-            /* Get handlers for each block */
-            task->handles[0] = starpu_data_get_sub_data( A_handle, 1, y);
-            task->handles[1] = starpu_data_get_sub_data( B_handle, 1, x);
-            task->handles[2] = starpu_data_get_sub_data( C_handle, 2, x, y);
-            /* submit the task to StarPU */
-            starpu_task_submit(task);
-        }
-    }
-
-    starpu_data_unregister(A_handle);
-    starpu_data_unregister(B_handle);
-    starpu_data_unregister(C_handle);
-
-    starpu_task_wait_for_all();
-
-
-    /* terminate StarPU */
-    starpu_shutdown();
-    return 0;
-}
-
-
-

+ 1 - 1
tests/perfmodels/regression_based_02.c

@@ -47,7 +47,7 @@ void memset0_cpu(void *descr[], void *arg)
     unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
     int i;
 
-    usleep(100);
+    //usleep(100);
 
     for (i=0; i<n ; i++)
     {

+ 1 - 1
tests/perfmodels/regression_based_03.c

@@ -50,7 +50,7 @@ void memset0_cpu(void *descr[], void *arg)
     int i;
 
     //usleep () function
-    usleep(100);
+    //usleep(100);
 
     for (i=0; i<n ; i++)
     {

+ 0 - 405
tests/perfmodels/regression_based_04.c

@@ -1,405 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2011,2012,2014                           Inria
- * Copyright (C) 2011-2016,2019                           Université de Bordeaux
- * Copyright (C) 2011-2017                                CNRS
- * Copyright (C) 2011                                     Télécom-SudParis
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <starpu.h>
-#include <starpu_scheduler.h>
-#include "../helper.h"
-
-/*
- * A multi-implementation benchmark with dmda scheduler
- * we aim to test OPENCL workers and calculate the estimated time for each type of worker (CPU or OPENCL or CUDA)
- * dmda choose OPENCL workers for lage size (variable size of compare_performance) size=1234567
- * dmda choose CPU workers for small size (size=1234)
- */
-
-#define STARTlin (512*1024)
-#define START 1024
-#ifdef STARPU_QUICK_CHECK
-#define END 1048576
-#else
-#define END 16777216
-#endif
-
-#ifdef STARPU_USE_CUDA
-
-static void memset_cuda(void *descr[], void *arg)
-{
-    (void)arg;
-    STARPU_SKIP_IF_VALGRIND;
-
-    int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
-    unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
-
-    cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream());
-}
-
-#endif
-
-int ret;
-
-#ifdef STARPU_USE_OPENCL
-extern void memset0_opencl(void *buffers[], void *args);
-extern void memset_opencl(void *buffers[], void *args);
-#endif
-
-void memset0_cpu(void *descr[], void *arg)
-{
-    (void)arg;
-    STARPU_SKIP_IF_VALGRIND;
-
-    int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
-    unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
-
-    //starpu_usleep(100);
-    unsigned i;
-
-    for (i = 0; i < n; i++)
-
-        ptr[0] += i;
-}
-
-void memset_cpu(void *descr[], void *arg)
-{
-    (void)arg;
-    STARPU_SKIP_IF_VALGRIND;
-
-    int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
-    unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
-
-    //starpu_usleep(10);
-    memset(ptr, 42, n * sizeof(*ptr));
-}
-
-static struct starpu_perfmodel model =
-{
-    .type = STARPU_REGRESSION_BASED,
-    .symbol = "memset_regression_based"
-};
-
-static struct starpu_perfmodel nl_model =
-{
-    .type = STARPU_NL_REGRESSION_BASED,
-    .symbol = "non_linear_memset_regression_based"
-};
-
-static struct starpu_codelet memset_cl =
-{
-#ifdef STARPU_USE_CUDA
-    .cuda_funcs = {memset_cuda},
-    .cuda_flags = {STARPU_CUDA_ASYNC},
-#endif
-#ifdef STARPU_USE_OPENCL
-    .opencl_funcs = {memset0_opencl, memset_opencl},
-    .opencl_flags = {STARPU_OPENCL_ASYNC},
-#endif
-    .cpu_funcs = {memset0_cpu, memset_cpu},
-    .cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
-    .model = &model,
-    .nbuffers = 1,
-    .modes = {STARPU_W}
-};
-
-static struct starpu_codelet nl_memset_cl =
-{
-#ifdef STARPU_USE_CUDA
-    .cuda_funcs = {memset_cuda},
-    .cuda_flags = {STARPU_CUDA_ASYNC},
-#endif
-#ifdef STARPU_USE_OPENCL
-    .opencl_funcs = {memset0_opencl, memset_opencl},
-    .opencl_flags = {STARPU_OPENCL_ASYNC},
-#endif
-    .cpu_funcs = {memset0_cpu, memset_cpu},
-    .cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
-    .model = &nl_model,
-    .nbuffers = 1,
-    .modes = {STARPU_W}
-};
-
-static void test_memset(int nelems, struct starpu_codelet *codelet)
-{
-    int nloops = 100;
-    int loop;
-    starpu_data_handle_t handle;
-
-    void *dummy_buffer = malloc(nelems*sizeof(int));
-    STARPU_ASSERT(dummy_buffer != NULL);
-    starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)dummy_buffer, nelems, sizeof(int));
-    for (loop = 0; loop < nloops; loop++)
-    {
-        struct starpu_task *task = starpu_task_create();
-
-        task->cl = codelet;
-        task->handles[0] = handle;
-
-        int ret = starpu_task_submit(task);
-        if (ret == -ENODEV)
-            exit(STARPU_TEST_SKIPPED);
-        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-    }
-
-    starpu_data_unregister(handle);
-    free(dummy_buffer);
-}
-
-static void compare_performance(int size, struct starpu_codelet *codelet, struct starpu_task *task)
-{
-    unsigned i;
-    int niter = 100;
-    starpu_data_handle_t handle;
-
-    void *dummy_buffer = malloc(size*sizeof(int));
-    STARPU_ASSERT(dummy_buffer != NULL);
-    starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)dummy_buffer, size, sizeof(int));
-
-    struct starpu_task **tasks = (struct starpu_task **) malloc(niter*sizeof(struct starpu_task *));
-    assert(tasks);
-
-    for (i = 0; i < niter; i++)
-    {
-        //fabriquer la tache
-        struct starpu_task *task = starpu_task_create();
-
-        task->cl = codelet;
-        task->handles[0] = handle;
-
-        task->synchronous = 1;
-
-        /* We will destroy the task structure by hand so that we can
-         * query the profiling info before the task is destroyed. */
-        task->destroy = 0;
-
-        tasks[i] = task;
-
-        //soumettre la tache
-        ret = starpu_task_submit(task);
-
-        if (STARPU_UNLIKELY(ret == -ENODEV))
-        {
-            FPRINTF(stderr, "No worker may execute this task\n");
-            exit(0);
-        }
-    }
-
-    starpu_data_unregister(handle);
-    free(dummy_buffer);
-
-    starpu_task_wait_for_all();
-
-    double length_cpu_sum = 0.0;
-    double length_gpu_sum = 0.0;
-
-    enum starpu_worker_archtype archi;
-
-    for (i = 0; i < niter; i++)
-    {
-        struct starpu_task *task = tasks[i];
-
-        struct starpu_profiling_task_info *info = task->profiling_info;
-
-        //archi=starpu_worker_get_type(0);
-        archi=starpu_worker_get_type(info->workerid);
-
-        switch (archi)
-        {
-        case STARPU_CPU_WORKER:
-            FPRINTF(stdout, "cpuuu\n");
-            /* How long was the task execution ? */
-            length_cpu_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
-            break;
-
-        case STARPU_OPENCL_WORKER:
-
-            FPRINTF(stdout, "openclllllll\n");
-            /* How long was the task execution ? */
-            length_gpu_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
-            break;
-
-        case STARPU_CUDA_WORKER:
-
-            FPRINTF(stdout, "cudaaaaaa\n");
-            /* How long was the task execution ? */
-            length_gpu_sum += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
-            break;
-
-        }
-
-        /* We don't need the task structure anymore */
-        starpu_task_destroy(task);
-
-    }
-
-    unsigned worker;
-
-    /* Display the occupancy of all workers during the test */
-    unsigned ncpus =  starpu_cpu_worker_get_count();
-    unsigned ngpus =  starpu_opencl_worker_get_count()+starpu_cuda_worker_get_count();
-    //unsigned ncpu= starpu_worker_get_count_by_type(STARPU_CPU_WORKER);
-
-    FPRINTF(stderr, "ncpus %u \n", ncpus);
-    FPRINTF(stderr, "ngpus %u \n", ngpus);
-    for (worker= 0; worker< starpu_worker_get_count(); worker++)
-    {
-
-        struct starpu_profiling_worker_info worker_info;
-        ret = starpu_profiling_worker_get_info(worker, &worker_info);
-        STARPU_ASSERT(!ret);
-
-        char workername[128];
-        starpu_worker_get_name(worker, workername, sizeof(workername));
-        unsigned nimpl;
-
-        FPRINTF(stdout, "\n Worker :%s ::::::::::\n\n", workername);
-
-        for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-        {
-            switch (starpu_worker_get_type(worker))
-
-            {
-            case STARPU_CPU_WORKER:
-
-                FPRINTF(stdout, "Expected time for %d on %s (impl %u): %f, Measured time: %f \n",
-                        size, workername, nimpl,starpu_task_expected_length(task, starpu_worker_get_perf_archtype(worker, task->sched_ctx), nimpl), ((length_cpu_sum)/niter));
-
-                break;
-
-            case STARPU_OPENCL_WORKER:
-
-                FPRINTF(stdout, "Expectedd time for %d on %s (impl %u): %f, Measuredd time: %f \n",
-                        size, workername, nimpl,starpu_task_expected_length(task, starpu_worker_get_perf_archtype(worker, task->sched_ctx), nimpl), ((length_gpu_sum)/niter));
-
-                break;
-
-            case STARPU_CUDA_WORKER:
-
-                FPRINTF(stdout, "Expectedd time for %d on %s (impl %u): %f, Measuredd time: %f \n",
-                        size, workername, nimpl,starpu_task_expected_length(task, starpu_worker_get_perf_archtype(worker, task->sched_ctx), nimpl), ((length_gpu_sum)/niter));
-
-                break;
-
-            }
-        }
-
-    }
-
-
-}
-
-#ifdef STARPU_USE_OPENCL
-struct starpu_opencl_program opencl_program;
-#endif
-
-int main(int argc, char **argv)
-{
-
-    /* Enable profiling */
-    starpu_profiling_status_set(1);
-
-    struct starpu_conf conf;
-    starpu_data_handle_t handle;
-    int ret;
-
-    starpu_conf_init(&conf);
-
-    conf.sched_policy_name = "dmda";
-    conf.calibrate = 2;
-
-    ret = starpu_initialize(&conf, &argc, &argv);
-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-#ifdef STARPU_USE_OPENCL
-
-    ret = starpu_opencl_load_opencl_from_file("/home/makni/makni/starpu.git/tests/perfmodels/opencl_memset_kernel_01.cl",
-            &opencl_program, NULL);
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
-
-#endif
-
-
-    int size;
-    for (size = STARTlin; size < END; size *= 2)
-    {
-        /* Use a linear regression */
-        test_memset(size, &memset_cl);
-    }
-
-    for (size = START*1.5; size < END; size *= 2)
-    {
-        /* Use a non-linear regression */
-        test_memset(size, &nl_memset_cl);
-    }
-
-    ret = starpu_task_wait_for_all();
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
-
-    starpu_shutdown();
-
-
-    /* Test Phase */
-    starpu_conf_init(&conf);
-
-    conf.sched_policy_name = "dmda";
-    conf.calibrate = 0;
-
-    ret = starpu_initialize(&conf, &argc, &argv);
-    if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-    ret = starpu_opencl_load_opencl_from_file("/home/makni/makni/starpu.git/tests/perfmodels/opencl_memset_kernel_01.cl",
-            &opencl_program, NULL);
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
-
-    /* Now create a dummy task just to estimate its duration according to the regression */
-
-    size = 1234567;
-
-    void *dummy_buffer = malloc(size*sizeof(int));
-    STARPU_ASSERT(dummy_buffer != NULL);
-    starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)dummy_buffer, size, sizeof(int));
-
-    struct starpu_task *task = starpu_task_create();
-    task->cl = &memset_cl;
-    task->handles[0] = handle;
-    task->destroy = 0;
-
-    //FPRINTF(stdout, "\n ////linear regression results////\n");
-    //compare_performance(size, &memset_cl,task);
-
-    task->cl = &nl_memset_cl;
-
-    FPRINTF(stdout, "\n ////non linear regression results////\n");
-
-    compare_performance(size, &nl_memset_cl,task);
-
-    starpu_task_destroy(task);
-
-    starpu_data_unregister(handle);
-    free(dummy_buffer);
-
-#ifdef STARPU_USE_OPENCL
-
-    ret = starpu_opencl_unload_opencl(&opencl_program);
-    STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
-
-#endif
-
-    starpu_shutdown();
-
-    return EXIT_SUCCESS;
-}