ソースを参照

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into ft_checkpoint

Romain LION 5 年 前
コミット
0f946df112
共有 100 個のファイルを変更した。596 個の追加と 218 個の削除を含む。
  1. 1 0
      ChangeLog
  2. 46 24
      configure.ac
  3. 4 2
      doc/doxygen/chapters/101_building.doxy
  4. 57 65
      doc/doxygen/chapters/320_scheduling.doxy
  5. 16 0
      doc/doxygen/chapters/410_mpi_support.doxy
  6. 1 1
      examples/mult/xgemm.c
  7. 0 0
      julia/Makefile
  8. 0 0
      julia/Manifest.toml
  9. 0 0
      julia/Project.toml
  10. 53 0
      julia/README
  11. 0 0
      julia/REQUIRE
  12. 0 0
      julia/examples/black_scholes/black_scholes.c
  13. 0 2
      julia/black_scholes/black_scholes.jl
  14. 28 17
      julia/mandelbrot/Makefile
  15. 0 0
      julia/examples/mandelbrot/cpu_mandelbrot.c
  16. 0 0
      julia/examples/mandelbrot/mandelbrot.c
  17. 12 22
      julia/mandelbrot/mandelbrot.jl
  18. 10 20
      julia/mandelbrot/mandelbrot_native.jl
  19. 28 16
      julia/mult/Makefile
  20. 0 0
      julia/examples/mult/README
  21. 0 0
      julia/examples/mult/cpu_mult.c
  22. 0 0
      julia/examples/mult/gpu_mult.cu
  23. 0 0
      julia/examples/mult/mult.c
  24. 43 44
      julia/mult/mult.jl
  25. 0 0
      julia/examples/mult/mult.plot
  26. 0 5
      julia/mult/mult_native.jl
  27. 0 0
      julia/examples/mult/res/mult_cstarpu_gcc9_s72_2x2_b4x2.dat
  28. 0 0
      julia/examples/mult/res/mult_gen_gcc9_1x4.dat
  29. 0 0
      julia/examples/mult/res/mult_gen_gcc9_4x1.dat
  30. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s100_4x1.dat
  31. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s50_4x1.dat
  32. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s64_16x16_b4x2.dat
  33. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s64_4x4_b4x2.dat
  34. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s64_8x1_b4x2.dat
  35. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s64_8x8_b4x2.dat
  36. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_16x18_b4x2.dat
  37. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_16x8_b4x2.dat
  38. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_2x2.dat
  39. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_2x2_b4x2.dat
  40. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_2x2_b4x4.dat
  41. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_2x2_b8x2.dat
  42. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_4x1.dat
  43. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_4x4_b4x2.dat
  44. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_8x8_b4x2.dat
  45. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s80_4x1.dat
  46. 0 0
      julia/examples/mult/res/mult_gen_icc_s72_2x1_b4x2.dat
  47. 0 0
      julia/examples/mult/res/mult_gen_icc_s72_4x4_b4x2.dat
  48. 0 0
      julia/examples/mult/res/mult_native.dat
  49. 0 0
      julia/examples/mult/res/mult_nogen_gcc9_s72_2x2_b2x2.dat
  50. 0 0
      julia/examples/mult/res/mult_nogen_gcc9_s72_2x2_b4x2.dat
  51. 0 0
      julia/examples/mult/res/mult_nogen_icc_s72-36_2x2_b4x2.dat
  52. 0 0
      julia/examples/mult/res/mult_nogen_icc_s72_2x2_b4x2.dat
  53. 0 0
      julia/examples/mult/res/mult_nogen_icc_s72x2_2x2_b4x2.dat
  54. 0 0
      julia/examples/old_examples/Makefile.mk
  55. 0 0
      julia/examples/old_examples/README
  56. 0 0
      julia/examples/old_examples/cpu_mult.c
  57. 0 0
      julia/examples/old_examples/gpu_mult.cu
  58. 0 0
      julia/examples/old_examples/includes/display.c
  59. 0 0
      julia/examples/old_examples/includes/display.h
  60. 0 0
      julia/examples/old_examples/includes/sorting.c
  61. 0 0
      julia/examples/old_examples/includes/sorting.h
  62. 0 0
      julia/examples/old_examples/mandelbrot/cpu_cuda_mandelbrot.jl
  63. 0 0
      julia/examples/old_examples/mandelbrot/cpu_mandelbrot.c
  64. 0 0
      julia/examples/old_examples/mandelbrot/cpu_mandelbrot_between.c
  65. 0 0
      julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu
  66. 0 0
      julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu
  67. 0 0
      julia/examples/old_examples/mandelbrot/mandelbrot.c
  68. 0 0
      julia/examples/old_examples/mandelbrot/mandelbrot.jl
  69. 0 0
      julia/examples/old_examples/mandelbrot/mandelbrot_between.c
  70. 0 0
      julia/examples/old_examples/mandelbrot/mandelbrot_def.jl
  71. 0 0
      julia/examples/old_examples/mandelbrot/mandelbrot_generated.jl
  72. 0 0
      julia/examples/old_examples/mult.c
  73. 0 0
      julia/examples/old_examples/mult/cpu_cuda_mult.jl
  74. 0 0
      julia/examples/old_examples/mult/cpu_mult.c
  75. 0 0
      julia/examples/old_examples/mult/gpu_mult.cu
  76. 0 0
      julia/examples/old_examples/mult/mult.c
  77. 0 0
      julia/examples/old_examples/mult/mult_def.jl
  78. 0 0
      julia/examples/old_examples/mult/mult_extern.jl
  79. 0 0
      julia/examples/old_examples/mult/mult_extern_graph.jl
  80. 0 0
      julia/examples/old_examples/mult/mult_generated.jl
  81. 0 0
      julia/examples/old_examples/mult/mult_generated_graph.jl
  82. 0 0
      julia/examples/old_examples/mult/mult_naive.jl
  83. 0 0
      julia/examples/old_examples/nbody/cpu_cuda_nbody.jl
  84. 0 0
      julia/examples/old_examples/nbody/cpu_nbody.c
  85. 0 0
      julia/examples/old_examples/nbody/cpu_nbody_between.c
  86. 0 0
      julia/examples/old_examples/nbody/gpu_nbody.cu
  87. 0 0
      julia/examples/old_examples/nbody/gpu_nbody_between.cu
  88. 0 0
      julia/examples/old_examples/nbody/nbody.c
  89. 0 0
      julia/examples/old_examples/nbody/nbody.jl
  90. 0 0
      julia/examples/old_examples/nbody/nbody_between.c
  91. 0 0
      julia/examples/old_examples/nbody/nbody_def.jl
  92. 0 0
      julia/examples/old_examples/nbody/nbody_display.jl
  93. 0 0
      julia/examples/old_examples/nbody/nbody_generated.jl
  94. 58 0
      julia/examples/variable/Makefile
  95. 38 0
      julia/examples/variable/variable.jl
  96. 26 0
      julia/examples/variable/variable_native.jl
  97. 58 0
      julia/examples/vector_scal/Makefile
  98. 42 0
      julia/examples/vector_scal/cpu_vector_scal.c
  99. 75 0
      julia/examples/vector_scal/vector_scal.jl
  100. 0 0
      julia/setenv.sh

+ 1 - 0
ChangeLog

@@ -18,6 +18,7 @@ StarPU 1.4.0 (git revision xxxx)
 ==============================================
 New features:
   * Fault tolerance support with starpu_task_ft_failed().
+  * Julia programming interface.
   * Add get_max_size method to data interfaces for applications using data with
     variable size to express their maximal potential size.
   * New offline tool to draw graph showing elapsed time between sent

+ 46 - 24
configure.ac

@@ -324,25 +324,43 @@ if test x$enable_simgrid = xyes ; then
 else
     DEFAULT_MPICC=mpicc
 fi
-AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc=<path to mpicc>], [Path of the mpicc compiler])], [DEFAULT_MPICC=$withval])
-AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
+AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc=<mpicc name or path to mpicc>], [Name or path of the mpicc compiler])], [DEFAULT_MPICC=$withval])
+case $DEFAULT_MPICC in
+	/*) mpicc_path="$DEFAULT_MPICC" ;;
+	*)  AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH]) ;;
+esac
+# We test if the MPICC compiler exists
+if test ! -x $mpicc_path; then
+    AC_MSG_RESULT(The mpicc compiler '$mpicc_path' does not have the execute permission)
+    mpicc_path=no
+fi
+
 AC_MSG_CHECKING(whether mpicc is available)
 AC_MSG_RESULT($mpicc_path)
 AC_SUBST(MPICC, $mpicc_path)
 
+if test x$mpicc_path != xno ; then
+    MPIPATH=$(dirname $mpicc_path):$PATH
+else
+    MPIPATH=$PATH
+fi
+
 #Check MPICXX/MPIC++
 if test x$enable_simgrid = xyes ; then
     DEFAULT_MPICXX=smpicxx
 else
     DEFAULT_MPICXX=mpicxx
 fi
-AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx=<path to mpicxx>], [Path of the mpicxx/mpic++ compiler])], [DEFAULT_MPICXX=$withval])
-AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx=<mpicxx name or path to mpicxx>], [Name or path of the mpicxx/mpic++ compiler])], [DEFAULT_MPICXX=$withval])
+case $DEFAULT_MPICXX in
+	/*) mpicxx_path="$DEFAULT_MPICXX" ;;
+	*)  AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$MPIPATH]) ;;
+esac
 
 # try with mpic++ if mpicxx was not found
 if test x$mpicxx_path = xno ; then
     DEFAULT_MPICXX=mpic++
-    AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+    AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$MPIPATH])
 fi
 
 # We test if the MPICXX/MPIC++ compiler exists
@@ -491,17 +509,16 @@ if test x$enable_mpi = xyes -a x$running_mpi_check = xyes ; then
     # Check if mpiexec is available
     if test x$enable_simgrid = xyes ; then
 	DEFAULT_MPIEXEC=smpirun
-        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]], [Path of the smpirun helper])], [DEFAULT_MPIEXEC=$withval])
-	AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$simgrid_dir/bin:$PATH])
+        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<name of smpirun or path to smpirun>]], [Name or path of the smpirun helper])], [DEFAULT_MPIEXEC=$withval])
     else
 	DEFAULT_MPIEXEC=mpiexec
-	AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec=<path to mpiexec>], [Path of mpiexec])], [DEFAULT_MPIEXEC=$withval])
-	if test x$mpicc_path = x ; then
-	    AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$PATH])
-	else
-	    AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$(dirname $mpicc_path):$PATH])
-	fi
+	AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec=<name of mpiexec or path to mpiexec>], [Name or path of mpiexec])], [DEFAULT_MPIEXEC=$withval])
     fi
+
+    case $DEFAULT_MPIEXEC in
+	/*) mpiexec_path="$DEFAULT_MPIEXEC" ;;
+	*)  AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$MPIPATH])
+    esac
     AC_MSG_CHECKING(whether mpiexec is available)
     AC_MSG_RESULT($mpiexec_path)
 
@@ -2215,9 +2232,9 @@ AC_MSG_RESULT($nmaxbuffers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 		[how many buffers can be manipulated per task])
 
-AC_MSG_CHECKING(maximum number of nodes to use)
+AC_MSG_CHECKING(maximum number of memory nodes to use per MPI rank)
 AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
-			[maximum number of nodes])],
+			[maximum number of memory nodes per MPI rank])],
 			maxnodes=$enableval, maxnodes=0)
 
 if test x$maxnodes = x0 ; then
@@ -2489,16 +2506,19 @@ enable_build_fortran=no
 if test "x$enable_build_fortran_requested" = "xyes" ; then
    if test "x$FC" != "x"; then
    	if $FC --version|grep -q 'GNU Fortran'; then
-		 AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
-     	         #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)
-                 int dummy;
-                 #else
-                 #error GFortran too old, version >= 4.9.x needed, Fortran examples will not be built
-                 #endif
-                 ]],
+		 AC_LANG_PUSH([Fortran])
+		 OLD_FCFLAGS="$FCFLAGS"
+		 FCFLAGS="$FCFLAGS -cpp"
+		 AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [[
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 9)
+#error GFortran too old, version >= 4.9.x needed, Fortran examples will not be built
+#endif
+]]
                  )],
                  [enable_build_fortran="yes"],
                  [enable_build_fortran="no"])
+		 FCFLAGS="$OLD_FCFLAGS"
+		 AC_LANG_POP([Fortran])
                  if test "$enable_build_fortran" = "no" ; then
                    AC_MSG_WARN([GFortran too old, version >= 4.9.x needed, Fortran examples will not be built])
                  fi
@@ -2541,8 +2561,10 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
 					      else
 						      DEFAULT_MPIFORT=mpif90
 					      fi
-					      # nothing was specified: default value is used
-					      AC_PATH_PROG(mpifort_path, $DEFAULT_MPIFORT, [no], [$(dirname $mpicc_path):$simgrid_dir/bin:$PATH])
+					      case $DEFAULT_MPIFORT in
+					      	/*) mpifort_path="$DEFAULT_MPIFORT" ;;
+					        *)  AC_PATH_PROG(mpifort_path, $DEFAULT_MPIFORT, [no], [$MPIPATH])
+					      esac
 					      ])
 
 			# We test if the MPIFORT compiler exists

+ 4 - 2
doc/doxygen/chapters/101_building.doxy

@@ -43,8 +43,10 @@ can be used to install StarPU.
 The <c>hwloc</c> (http://www.open-mpi.org/software/hwloc) topology
 discovery library is not mandatory to use StarPU but strongly
 recommended.  It allows for topology aware scheduling, which improves
-performance.  <c>libhwloc</c> is available in major free operating system
-distributions, and for most operating systems.
+performance. <c>hwloc</c> is available in major free operating system
+distributions, and for most operating systems. Make sure to not only install
+a <c>hwloc</c> or <c>libhwloc</c> package, but also <c>hwloc-devel</c> or
+<c>libhwloc-dev</c> so as to have hwloc headers etc.
 
 If <c>libhwloc</c> is installed in a standard
 location, no option is required, it will be detected automatically,

+ 57 - 65
doc/doxygen/chapters/320_scheduling.doxy

@@ -39,33 +39,33 @@ STARPU_SCHED. For instance <c>export STARPU_SCHED=dmda</c> . Use <c>help</c> to
 get the list of available schedulers.
 
 
-<b>Non Performance Modelling Policies:</b>
+\subsection NonPerformanceModelingPolicies Non Performance Modelling Policies
 
-The <b>eager</b> scheduler uses a central task queue, from which all workers draw tasks
+- The <b>eager</b> scheduler uses a central task queue, from which all workers draw tasks
 to work on concurrently. This however does not permit to prefetch data since the scheduling
 decision is taken late. If a task has a non-0 priority, it is put at the front of the queue.
 
-The <b>random</b> scheduler uses a queue per worker, and distributes tasks randomly according to assumed worker
+- The <b>random</b> scheduler uses a queue per worker, and distributes tasks randomly according to assumed worker
 overall performance.
 
-The <b>ws</b> (work stealing) scheduler uses a queue per worker, and schedules
+- The <b>ws</b> (work stealing) scheduler uses a queue per worker, and schedules
 a task on the worker which released it by
 default. When a worker becomes idle, it steals a task from the most loaded
 worker.
 
-The <b>lws</b> (locality work stealing) scheduler uses a queue per worker, and schedules
+- The <b>lws</b> (locality work stealing) scheduler uses a queue per worker, and schedules
 a task on the worker which released it by
 default. When a worker becomes idle, it steals a task from neighbour workers. It
 also takes into account priorities.
 
-The <b>prio</b> scheduler also uses a central task queue, but sorts tasks by
+- The <b>prio</b> scheduler also uses a central task queue, but sorts tasks by
 priority specified by the programmer (between -5 and 5).
 
-The <b>heteroprio</b> scheduler uses different priorities for the different processing units.
+- The <b>heteroprio</b> scheduler uses different priorities for the different processing units.
 This scheduler must be configured to work correctly and to achieve high performance
 as described in the corresponding section.
 
-\section DMTaskSchedulingPolicy Performance Model-Based Task Scheduling Policies
+\subsection DMTaskSchedulingPolicy Performance Model-Based Task Scheduling Policies
 
 If (<b>and only if</b>) your application <b>codelets have performance models</b> (\ref
 PerformanceModelExample), you should change the scheduler thanks to the
@@ -87,47 +87,84 @@ family policy using performance model hints. A low or zero percentage may be
 the sign that performance models are not converging or that codelets do not
 have performance models enabled.
 
-<b>Performance Modelling Policies:</b>
-
-The <b>dm</b> (deque model) scheduler takes task execution performance models into account to
+- The <b>dm</b> (deque model) scheduler takes task execution performance models into account to
 perform a HEFT-similar scheduling strategy: it schedules tasks where their
 termination time will be minimal. The difference with HEFT is that <b>dm</b>
 schedules tasks as soon as they become available, and thus in the order they
 become available, without taking priorities into account.
 
-The <b>dmda</b> (deque model data aware) scheduler is similar to dm, but it also takes
+- The <b>dmda</b> (deque model data aware) scheduler is similar to dm, but it also takes
 into account data transfer time.
 
-The <b>dmdap</b> (deque model data aware prio) scheduler is similar to dmda,
+- The <b>dmdap</b> (deque model data aware prio) scheduler is similar to dmda,
 except that it sorts tasks by priority order, which allows to become even closer
 to HEFT by respecting priorities after having made the scheduling decision (but
 it still schedules tasks in the order they become available).
 
-The <b>dmdar</b> (deque model data aware ready) scheduler is similar to dmda,
+- The <b>dmdar</b> (deque model data aware ready) scheduler is similar to dmda,
 but it also privileges tasks whose data buffers are already available
 on the target device.
 
-The <b>dmdas</b> combines dmdap and dmdas: it sorts tasks by priority order,
+- The <b>dmdas</b> combines dmdap and dmdar: it sorts tasks by priority order,
 but for a given priority it will privilege tasks whose data buffers are already
 available on the target device.
 
-The <b>dmdasd</b> (deque model data aware sorted decision) scheduler is similar
+- The <b>dmdasd</b> (deque model data aware sorted decision) scheduler is similar
 to dmdas, except that when scheduling a task, it takes into account its priority
 when computing the minimum completion time, since this task may get executed
 before others, and thus the latter should be ignored.
 
-The <b>heft</b> (heterogeneous earliest finish time) scheduler is a deprecated
+- The <b>heft</b> (heterogeneous earliest finish time) scheduler is a deprecated
 alias for <b>dmda</b>.
 
-The <b>pheft</b> (parallel HEFT) scheduler is similar to dmda, it also supports
+- The <b>pheft</b> (parallel HEFT) scheduler is similar to dmda, it also supports
 parallel tasks (still experimental). Should not be used when several contexts using
 it are being executed simultaneously.
 
-The <b>peager</b> (parallel eager) scheduler is similar to eager, it also
+- The <b>peager</b> (parallel eager) scheduler is similar to eager, it also
 supports parallel tasks (still experimental). Should not be used when several 
 contexts using it are being executed simultaneously.
 
-TODO: describe modular schedulers
+\subsection ExistingModularizedSchedulers Modularized Schedulers
+
+StarPU provides a powerful way to implement schedulers, as documented in \ref
+DefiningANewModularSchedulingPolicy . It is currently shipped with the following
+pre-defined Modularized Schedulers :
+
+
+- <b>modular-eager</b> , <b>modular-eager-prefetching</b> are eager-based Schedulers (without and with prefetching), they are \n
+naive schedulers, which try to map a task on the first available resource
+they find. The prefetching variant queues several tasks in advance to be able to
+do data prefetching. This may however degrade load balancing a bit.
+
+- <b>modular-prio</b>, <b>modular-prio-prefetching</b>, <b>modular-eager-prio</b> are prio-based Schedulers (without / with prefetching):,
+similar to Eager-Based Schedulers. Can handle tasks which have a defined
+priority and schedule them accordingly.
+The <b>modular-eager-prio</b> variant integrates the eager and priority queue in a
+single component. This allows it to do a better job at pushing tasks.
+
+- <b>modular-random</b>, <b>modular-random-prio</b>, <b>modular-random-prefetching</b>, <b>modular-random-prio-prefetching</b> are random-based Schedulers (without/with prefetching) : \n
+Select randomly a resource to be mapped on for each task.
+
+- <b>modular-ws</b> implements Work Stealing:
+Maps tasks to workers in round robin, but allows workers to steal work from other workers.
+
+- <b>modular-heft</b>, <b>modular-heft2</b>, and <b>modular-heft-prio</b> are
+HEFT Schedulers : \n
+Maps tasks to workers using a heuristic very close to
+Heterogeneous Earliest Finish Time.
+It needs that every task submitted to StarPU have a
+defined performance model (\ref PerformanceModelCalibration)
+to work efficiently, but can handle tasks without a performance
+model. <b>modular-heft</b> just takes tasks by priority order. <b>modular-heft2</b> takes
+at most 5 tasks of the same priority and checks which one fits best.
+<b>modular-heft-prio</b> is similar to <b>modular-heft</b>, but only decides the memory
+node, not the exact worker, just pushing tasks to one central queue per memory
+node.
+
+- <b>modular-heteroprio</b> is a Heteroprio Scheduler: \n
+Maps tasks to worker similarly to HEFT, but first attribute accelerated tasks to
+GPUs, then not-so-accelerated tasks to CPUs.
 
 \section TaskDistributionVsDataTransfer Task Distribution Vs Data Transfer
 
@@ -198,51 +235,6 @@ use starpu_task_expected_length() on the task (in µs), multiplied by the
 typical power consumption of the device, e.g. in W, and divided by 1000000. to
 get Joules.
 
-\section ExistingModularizedSchedulers Modularized Schedulers
-
-StarPU provides a powerful way to implement schedulers, as documented in \ref
-DefiningANewModularSchedulingPolicy . It is currently shipped with the following
-pre-defined Modularized Schedulers :
-
-- Eager-based Schedulers (with/without prefetching : \c modular-eager ,
-\c modular-eager-prefetching) : \n
-Naive scheduler, which tries to map a task on the first available resource
-it finds. The prefecthing variant queues several tasks in advance to be able to
-do data prefetching. This may however degrade load balancing a bit.
-
-- Prio-based Schedulers (with/without prefetching :
-\c modular-prio, \c modular-prio-prefetching , \c modular-eager-prio) : \n
-Similar to Eager-Based Schedulers. Can handle tasks which have a defined
-priority and schedule them accordingly.
-The \c modular-eager-prio variant integrates the eager and priority queue in a
-single component. This allows it to do a better job at pushing tasks.
-
-- Random-based Schedulers (with/without prefetching: \c modular-random,
-\c modular-random-prio, \c modular-random-prefetching, \c
-modular-random-prio-prefetching) : \n
-Selects randomly a resource to be mapped on for each task.
-
-- Work Stealing (\c modular-ws) : \n
-Maps tasks to workers in round robin, but allows workers to steal work from other workers.
-
-- HEFT Scheduler : \n
-Maps tasks to workers using a heuristic very close to
-Heterogeneous Earliest Finish Time.
-It needs that every task submitted to StarPU have a
-defined performance model (\ref PerformanceModelCalibration)
-to work efficiently, but can handle tasks without a performance
-model. \c modular-heft just takes tasks by priority order. \c modular-heft takes
-at most 5 tasks of the same priority and checks which one fits best. \c
-modular-heft-prio is similar to \c modular-heft, but only decides the memory
-node, not the exact worker, just pushing tasks to one central queue per memory
-node.
-
-- Heteroprio Scheduler: \n
-Maps tasks to worker similarly to HEFT, but first attribute accelerated tasks to
-GPUs, then not-so-accelerated tasks to CPUs.
-
-To use one of these schedulers, one can set the environment variable \ref STARPU_SCHED.
-
 \section StaticScheduling Static Scheduling
 
 In some cases, one may want to force some scheduling, for instance force a given

+ 16 - 0
doc/doxygen/chapters/410_mpi_support.doxy

@@ -34,6 +34,22 @@ An MPI Insert Task function provides an even more seamless transition to a
 distributed application, by automatically issuing all required data transfers
 according to the task graph and an application-provided distribution.
 
+\section MPIBuild Building with MPI support
+
+If a <c>mpicc</c> compiler is already in your PATH, StarPU will automatically
+enable MPI support in the build. If <c>mpicc</c> is not in PATH, you
+can specify its location by passing <c>--with-mpicc=/where/there/is/mpicc</c> to
+<c>./configure</c>
+
+It can be useful to enable MPI tests during <c>make check</c> by passing
+<c>--enable-mpi-check</c> to <c>./configure</c>. And similarly to
+<c>mpicc</c>, if <c>mpiexec</c> is not in PATH, you can specify its location by passing
+<c>--with-mpiexec=/where/there/is/mpiexec</c> to <c>./configure</c>, but this is
+not needed if it is next to <c>mpicc</c>, configure will look there in addition to PATH.
+
+Similarly, Fortran examples use <c>mpif90</c>, which can be specified manually
+with <c>--with-mpifort</c> if it can't be found automatically.
+
 \section ExampleDocumentation Example Used In This Documentation
 
 The example below will be used as the base for this documentation. It

+ 1 - 1
examples/mult/xgemm.c

@@ -334,7 +334,7 @@ static void parse_args(int argc, char **argv)
 		}
 		else
 		{
-			fprintf(stderr,"Unrecognized option %s", argv[i]);
+			fprintf(stderr,"Unrecognized option %s\n", argv[i]);
 			exit(EXIT_FAILURE);
 		}
 	}

julia/StarPU.jl/Makefile → julia/Makefile


julia/StarPU.jl/Manifest.toml → julia/Manifest.toml


julia/StarPU.jl/Project.toml → julia/Project.toml


+ 53 - 0
julia/README

@@ -0,0 +1,53 @@
+Contents
+========
+
+* Installing Julia
+* Installing StarPU module for Julia
+* Running Examples
+
+Installing Julia
+----------------
+Julia version 1.3+ is required and can be downloaded from
+https://julialang.org/downloads/.
+
+
+Installing StarPU module for Julia
+----------------------------------
+First, build the jlstarpu_c_wrapper library:
+
+$ make
+
+Then, you need to add the lib/ directory to your library path and the julia/
+directory to your Julia load path:
+
+$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/lib
+$ export JULIA_LOAD_PATH=$JULIA_LOAD_PATH:$PWD
+
+This step can also be done by sourcing the setenv.sh script:
+
+$ . setenv.sh
+
+Running Examples
+----------------
+
+You can find several examples in the examples/ directory.
+
+For each example X, three versions are provided:
+
+- X.c: Original C+starpu code
+- X_native.jl: Native Julia version (without StarPU)
+- X.jl: Julia version using StarPU
+
+
+To run the original C+StarPU code:
+$ make cstarpu.dat
+
+To run the native Julia version:
+$ make julia_native.dat
+
+To run the Julia version using StarPU:
+$ make julia_generatedc.dat
+
+
+
+

julia/StarPU.jl/REQUIRE → julia/REQUIRE


julia/black_scholes/black_scholes.c → julia/examples/black_scholes/black_scholes.c


+ 0 - 2
julia/black_scholes/black_scholes.jl

@@ -115,8 +115,6 @@ using StarPU
     return 0
 end
 
-
-@debugprint "starpu_init"
 starpu_init()
 
 function black_scholes_starpu(data ::Matrix{Float64}, res ::Matrix{Float64}, nslices ::Int64)

+ 28 - 17
julia/mandelbrot/Makefile

@@ -1,38 +1,49 @@
 CC=gcc
-CFLAGS += -Wall -Wextra -O3 -mavx -mfma -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
+
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
 
-LDFLAGS +=$(shell pkg-config --libs starpu-1.3) -lm
 EXTERNLIB=extern_tasks.so
 GENERATEDLIB=generated_tasks.so
-#OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
-OBJECTS=$(wildcard gen*.c)
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
 LIBPATH=${PWD}/../StarPU.jl/lib
 
 all: ${EXTERNLIB}
 
 mandelbrot: mandelbrot.c cpu_mandelbrot.o #gpu_mandelbrot.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-gpu_mandelbrot.o: gpu_mandelbrot.cu
-	nvcc -c $(CFLAGS) $^ -o $@
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
 
 %.o: %.c
-	$(CC) -c $(CFLAGS) $^ -o $@
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
 
 ${EXTERNLIB}: cpu_mandelbrot.c
 	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
-gpu_mandelbrot.so: gpu_mandelbrot.o
-	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
 
-cpu_mandelbrot_sa: cpu_mandelbrot_sa.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-${GENERATEDLIB}: ${OBJECTS}
-	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+.PHONY: clean
 
 clean:
-	rm -f mandelbrot *.so *.o c_*.genc gencuda_*.cu *.dat
+	rm -f mandelbrot *.so *.o genc_*.c gencuda_*.cu *.dat
 
 # Performance Tests
 cstarpu.dat: mandelbrot

julia/mandelbrot/cpu_mandelbrot.c → julia/examples/mandelbrot/cpu_mandelbrot.c


julia/mandelbrot/mandelbrot.c → julia/examples/mandelbrot/mandelbrot.c


+ 12 - 22
julia/mandelbrot/mandelbrot.jl

@@ -3,7 +3,7 @@ using StarPU
 using LinearAlgebra
 
 @target STARPU_CPU+STARPU_CUDA
-@codelet function mandelbrot(pixels ::Matrix{Int64}, params ::Matrix{Float32} ) :: Float32
+@codelet function mandelbrot(pixels ::Matrix{Int64}, centerr ::Float64, centeri ::Float64, offset ::Int64, dim ::Int64 ) :: Nothing
     height :: Int64 = height(pixels)
     width :: Int64 = width(pixels)
     zoom :: Float64 = width * 0.25296875
@@ -11,10 +11,6 @@ using LinearAlgebra
     diverge :: Float32 = 4.0
     max_iterations :: Float32 = ((width/2) * 0.049715909 * log10(zoom));
     imi :: Float32 = 1. / max_iterations
-    centerr :: Float32 = params[1,1]
-    centeri :: Float32 = params[2,1]
-    offset :: Float32 = params[3,1]
-    dim :: Float32 = params[4,1]
     cr :: Float64 = 0.
     zr :: Float64 = 0.
     ci :: Float64 = 0.
@@ -27,7 +23,10 @@ using LinearAlgebra
             zr = cr
             ci = centeri + (y-1+offset - (dim / 2)) * iz
             zi = ci
-            for n = 0:max_iterations
+            max_it :: Float64 = max_iterations
+            n = 0
+            for i = 0:max_it
+                n = i
                 if (zr*zr + zi*zi > diverge)
                     break
                 end
@@ -43,21 +42,20 @@ using LinearAlgebra
             end
         end
     end
-    return 0. :: Float32
+
+    return
 end
 
-@debugprint "starpu_init"
 starpu_init()
 
-function mandelbrot_with_starpu(A ::Matrix{Int64}, params ::Matrix{Float32}, nslicesx ::Int64)
+function mandelbrot_with_starpu(A ::Matrix{Int64}, cr ::Float64, ci ::Float64, dim ::Int64, nslicesx ::Int64)
     horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesx)
     @starpu_block let
-	hA, hP = starpu_data_register(A,params)
+	hA = starpu_data_register(A)
 	starpu_data_partition(hA,horiz)
-        starpu_data_partition(hP,horiz)
-        
+
 	@starpu_sync_tasks for taskx in (1 : nslicesx)
-                @starpu_async_cl mandelbrot(hA[taskx], hP[taskx]) [STARPU_W, STARPU_R]
+                @starpu_async_cl mandelbrot(hA[taskx]) [STARPU_W] [cr, ci, (taskx-1)*dim/nslicesx, dim]
 	end
     end
 end
@@ -79,16 +77,9 @@ function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
     tmin=0;
     
     pixels ::Matrix{Int64} = zeros(dim, dim)
-    params :: Matrix{Float32} = zeros(4*nslices,1)
-    for i=0:(nslices-1)
-        params[4*i+1,1] = cr
-        params[4*i+2,1] = ci
-        params[4*i+3,1] = i*dim/nslices
-        params[4*i+4,1] = dim
-    end
     for i = 1:10
         t = time_ns();
-        mandelbrot_with_starpu(pixels, params, nslices)
+        mandelbrot_with_starpu(pixels, cr, ci, dim, nslices)
         t = time_ns()-t
         if (tmin==0 || tmin>t)
             tmin=t
@@ -109,6 +100,5 @@ end
 
 display_time(-0.800671,-0.158392,32,32,4096,4)
 
-@debugprint "starpu_shutdown"
 starpu_shutdown()
 

+ 10 - 20
julia/mandelbrot/mandelbrot_native.jl

@@ -1,16 +1,12 @@
 using LinearAlgebra
 
-function mandelbrot(pixels, params) :: Float32
+function mandelbrot(pixels, centerr ::Float64, centeri ::Float64, offset ::Int64, dim ::Int64) :: Nothing
     height :: Int64, width :: Int64 = size(pixels)
     zoom :: Float64 = width * 0.25296875
     iz :: Float64 = 1. / zoom
     diverge :: Float32 = 4.0
     max_iterations :: Float32 = ((width/2) * 0.049715909 * log10(zoom));
-    imi :: Float32 = 1. / max_iterations
-    centerr :: Float32 = params[1]
-    centeri :: Float32 = params[2]
-    offset :: Float32 = params[3]
-    dim :: Float32 = params[4]
+    imi :: Float64 = 1. / max_iterations
     cr :: Float64 = 0.
     zr :: Float64 = 0.
     ci :: Float64 = 0.
@@ -23,7 +19,9 @@ function mandelbrot(pixels, params) :: Float32
             zr = cr
             ci = centeri + (y-1+offset - (dim / 2)) * iz
             zi = ci
-            for n = 0:max_iterations
+            n = 0
+            for i = 0:max_iterations
+                n = i
                 if (zr*zr + zi*zi > diverge)
                     break
                 end
@@ -40,11 +38,10 @@ function mandelbrot(pixels, params) :: Float32
         end
     end
 
-    ret :: Float32 = 0.
-    return ret
+    return
 end
 
-function mandelbrot_without_starpu(A ::Matrix{Int64}, params ::Matrix{Float32}, nslicesx ::Int64)
+function mandelbrot_without_starpu(A ::Matrix{Int64}, cr ::Float64, ci ::Float64, dim ::Int64, nslicesx ::Int64)
     width,height = size(A)
     step = height / nslicesx
 
@@ -52,9 +49,9 @@ function mandelbrot_without_starpu(A ::Matrix{Int64}, params ::Matrix{Float32},
         start_id = floor(Int64, (taskx-1)*step+1)
         end_id = floor(Int64, (taskx-1)*step+step)
         a = view(A, start_id:end_id, :)
-        p = view(params, (taskx-1)*4+1:(taskx-1)*4+4)
 
-        mandelbrot(a, p)
+        offset ::Int64 = (taskx-1)*dim/nslicesx
+        mandelbrot(a, cr, ci, offset, dim)
     end
 end
 
@@ -75,16 +72,9 @@ function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
     tmin=0;
 
     pixels ::Matrix{Int64} = zeros(dim, dim)
-    params :: Matrix{Float32} = zeros(4*nslices,1)
-    for i=0:(nslices-1)
-        params[4*i+1,1] = cr
-        params[4*i+2,1] = ci
-        params[4*i+3,1] = i*dim/nslices
-        params[4*i+4,1] = dim
-    end
     for i = 1:10
         t = time_ns();
-        mandelbrot_without_starpu(pixels, params, nslices)
+        mandelbrot_without_starpu(pixels, cr, ci, dim, nslices)
         t = time_ns()-t
         if (tmin==0 || tmin>t)
             tmin=t

+ 28 - 16
julia/mult/Makefile

@@ -6,40 +6,52 @@ STRIDE=72
 #CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -DSTRIDE=${STRIDE} -march=native $(shell pkg-config --cflags starpu-1.3)
 # GCC compiler
 CC=gcc
-CFLAGS += -O3 -DSTRIDE=${STRIDE} -mavx -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
 
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g -DSTRIDE=${STRIDE} $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
 LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+
 EXTERNLIB=extern_tasks.so
 GENERATEDLIB=generated_tasks.so
-#OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
-OBJECTS=$(wildcard gen*.c)
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
+
 LIBPATH=${PWD}/../StarPU.jl/lib
 
 all: ${EXTERNLIB}
 
 mult: mult.c cpu_mult.o #gpu_mult.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-gpu_mult.o: gpu_mult.cu
-	nvcc -c $(CFLAGS) $^ -o $@
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
 
 %.o: %.c
-	$(CC) -c $(CFLAGS) $^ -o $@
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
 
 ${EXTERNLIB}: cpu_mult.c
 	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
-gpu_mult.so: gpu_mult.o
-	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
 
-cpu_mult_sa: cpu_mult_sa.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-${GENERATEDLIB}: ${OBJECTS}
-	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+.PHONY: clean
 
 clean:
-	rm -f mult *.so *.o c_*.genc gencuda_*.cu *.dat
+	rm -f mult *.so *.o genc_*.c gencuda_*.cu *.dat
 
 # Performance Tests
 cstarpu.dat: mult

julia/mult/README → julia/examples/mult/README


julia/mult/cpu_mult.c → julia/examples/mult/cpu_mult.c


julia/mult/gpu_mult.cu → julia/examples/mult/gpu_mult.cu


julia/mult/mult.c → julia/examples/mult/mult.c


+ 43 - 44
julia/mult/mult.jl

@@ -6,58 +6,57 @@ using LinearAlgebra
 const STRIDE = 72
 
 @target STARPU_CPU+STARPU_CUDA
-@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Float32
+@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Nothing
 
     width_m2 :: Int32 = width(m2)
     height_m1 :: Int32 = height(m1)
     width_m1 :: Int32 = width(m1)
     # Naive version
-    #@parallel for j in (1 : width_m2)
-    #    @parallel for i in (1 : height_m1)
-    #
-    #          sum :: Float32 = 0.
-
-    #          for k in (1 : width_m1)
-    #              sum = sum + m1[i, k] * m2[k, j]
-    #          end
+    @parallel for j in (1 : width_m2)
+       @parallel for i in (1 : height_m1)
     
-    #          m3[i, j] = sum
-    #      end
-    #  end
-    ##### Tiled and unrolled version 
-    for l in (1 : width_m2)
-        for m in (1 : height_m1)
-            m3[m,l] = 0
-        end
-    end
-    @parallel for i in (1 : STRIDE : height_m1)
-        for k in (1 : STRIDE : width_m1 )
-            for j in (1 : STRIDE : width_m2  )
-                for kk in (k : 4 : k+STRIDE-1)
-                    for jj in (j : 2 : j+STRIDE-1)
-                        alpha00 :: Float32 =m2[kk,jj]
-                        alpha01 :: Float32 =m2[kk,jj+1]
-                        alpha10 :: Float32 =m2[kk+1,jj]
-                        alpha11 :: Float32 =m2[kk+1,jj+1]
-                        alpha20 :: Float32 =m2[kk+2,jj]
-                        alpha21 :: Float32 =m2[kk+2,jj+1]
-                        alpha30 :: Float32 =m2[kk+3,jj]
-                        alpha31 :: Float32 =m2[kk+3,jj+1]
-                        for ii in (i : 1 : i+STRIDE-1) 
-                            m3[ii, jj] = m3[ii, jj] + m1[ii, kk] * alpha00 + m1[ii, kk+1] * alpha10 + m1[ii, kk+2] * alpha20 + m1[ii,kk+3]*alpha30
-                            m3[ii, jj+1] = m3[ii, jj+1] + m1[ii, kk] * alpha01 + m1[ii, kk+1] * alpha11 + m1[ii, kk+2]*alpha21 + m1[ii,kk+3]*alpha31 
-                        end
-                    end
-                end
-            end
-        end
-    end
+             sum :: Float32 = 0.
 
-    return 0. :: Float32
+             for k in (1 : width_m1)
+                 sum = sum + m1[i, k] * m2[k, j]
+             end
+    
+             m3[i, j] = sum
+         end
+     end
+    # ##### Tiled and unrolled version 
+    # for l in (1 : width_m2)
+    #     for m in (1 : height_m1)
+    #         m3[m,l] = 0
+    #     end
+    # end
+    # @parallel for i in (1 : STRIDE : height_m1)
+    #     for k in (1 : STRIDE : width_m1 )
+    #         for j in (1 : STRIDE : width_m2  )
+    #             for kk in (k : 4 : k+STRIDE-1)
+    #                 for jj in (j : 2 : j+STRIDE-1)
+    #                     alpha00 :: Float32 =m2[kk,jj]
+    #                     alpha01 :: Float32 =m2[kk,jj+1]
+    #                     alpha10 :: Float32 =m2[kk+1,jj]
+    #                     alpha11 :: Float32 =m2[kk+1,jj+1]
+    #                     alpha20 :: Float32 =m2[kk+2,jj]
+    #                     alpha21 :: Float32 =m2[kk+2,jj+1]
+    #                     alpha30 :: Float32 =m2[kk+3,jj]
+    #                     alpha31 :: Float32 =m2[kk+3,jj+1]
+    #                     for ii in (i : 1 : i+STRIDE-1) 
+    #                         m3[ii, jj] = m3[ii, jj] + m1[ii, kk] * alpha00 + m1[ii, kk+1] * alpha10 + m1[ii, kk+2] * alpha20 + m1[ii,kk+3]*alpha30
+    #                         m3[ii, jj+1] = m3[ii, jj+1] + m1[ii, kk] * alpha01 + m1[ii, kk+1] * alpha11 + m1[ii, kk+2]*alpha21 + m1[ii,kk+3]*alpha31 
+    #                     end
+    #                 end
+    #             end
+    #         end
+    #     end
+    # end
+
+    return
 end
 
 
-@debugprint "starpu_init"
 starpu_init()
 
 function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
@@ -77,7 +76,7 @@ function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: M
         )
         cl = StarpuCodelet(
             cpu_func = CPU_CODELETS["matrix_mult"],
-            #cuda_func = "matrix_mult",
+            # cuda_func = CUDA_CODELETS["matrix_mult"],
             #opencl_func="ocl_matrix_mult",
             modes = [STARPU_R, STARPU_R, STARPU_W],
             perfmodel = perfmodel
@@ -141,6 +140,6 @@ end
 io=open(ARGS[1],"w")
 compute_times(io,16*STRIDE,4*STRIDE,4096,2,2)
 close(io)
-@debugprint "starpu_shutdown"
+
 starpu_shutdown()
 

julia/mult/mult.plot → julia/examples/mult/mult.plot


+ 0 - 5
julia/mult/mult_native.jl

@@ -5,9 +5,6 @@ using LinearAlgebra
 #shoud be the same as in the makefile
 const STRIDE = 72
 
-@debugprint "starpu_init"
-starpu_init()
-
 function multiply_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
     tmin = 0
     for i in (1 : 10 )
@@ -39,6 +36,4 @@ end
 io=open(ARGS[1],"w")
 compute_times(io,16*STRIDE,4*STRIDE,4096,2,2)
 close(io)
-@debugprint "starpu_shutdown"
-starpu_shutdown()
 

julia/mult/res/mult_cstarpu_gcc9_s72_2x2_b4x2.dat → julia/examples/mult/res/mult_cstarpu_gcc9_s72_2x2_b4x2.dat


julia/mult/res/mult_gen_gcc9_1x4.dat → julia/examples/mult/res/mult_gen_gcc9_1x4.dat


julia/mult/res/mult_gen_gcc9_4x1.dat → julia/examples/mult/res/mult_gen_gcc9_4x1.dat


julia/mult/res/mult_gen_gcc9_s100_4x1.dat → julia/examples/mult/res/mult_gen_gcc9_s100_4x1.dat


julia/mult/res/mult_gen_gcc9_s50_4x1.dat → julia/examples/mult/res/mult_gen_gcc9_s50_4x1.dat


julia/mult/res/mult_gen_gcc9_s64_16x16_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s64_16x16_b4x2.dat


julia/mult/res/mult_gen_gcc9_s64_4x4_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s64_4x4_b4x2.dat


julia/mult/res/mult_gen_gcc9_s64_8x1_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s64_8x1_b4x2.dat


julia/mult/res/mult_gen_gcc9_s64_8x8_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s64_8x8_b4x2.dat


julia/mult/res/mult_gen_gcc9_s72_16x18_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s72_16x18_b4x2.dat


julia/mult/res/mult_gen_gcc9_s72_16x8_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s72_16x8_b4x2.dat


julia/mult/res/mult_gen_gcc9_s72_2x2.dat → julia/examples/mult/res/mult_gen_gcc9_s72_2x2.dat


julia/mult/res/mult_gen_gcc9_s72_2x2_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s72_2x2_b4x2.dat


julia/mult/res/mult_gen_gcc9_s72_2x2_b4x4.dat → julia/examples/mult/res/mult_gen_gcc9_s72_2x2_b4x4.dat


julia/mult/res/mult_gen_gcc9_s72_2x2_b8x2.dat → julia/examples/mult/res/mult_gen_gcc9_s72_2x2_b8x2.dat


julia/mult/res/mult_gen_gcc9_s72_4x1.dat → julia/examples/mult/res/mult_gen_gcc9_s72_4x1.dat


julia/mult/res/mult_gen_gcc9_s72_4x4_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s72_4x4_b4x2.dat


julia/mult/res/mult_gen_gcc9_s72_8x8_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s72_8x8_b4x2.dat


julia/mult/res/mult_gen_gcc9_s80_4x1.dat → julia/examples/mult/res/mult_gen_gcc9_s80_4x1.dat


julia/mult/res/mult_gen_icc_s72_2x1_b4x2.dat → julia/examples/mult/res/mult_gen_icc_s72_2x1_b4x2.dat


julia/mult/res/mult_gen_icc_s72_4x4_b4x2.dat → julia/examples/mult/res/mult_gen_icc_s72_4x4_b4x2.dat


julia/mult/res/mult_native.dat → julia/examples/mult/res/mult_native.dat


julia/mult/res/mult_nogen_gcc9_s72_2x2_b2x2.dat → julia/examples/mult/res/mult_nogen_gcc9_s72_2x2_b2x2.dat


julia/mult/res/mult_nogen_gcc9_s72_2x2_b4x2.dat → julia/examples/mult/res/mult_nogen_gcc9_s72_2x2_b4x2.dat


julia/mult/res/mult_nogen_icc_s72-36_2x2_b4x2.dat → julia/examples/mult/res/mult_nogen_icc_s72-36_2x2_b4x2.dat


julia/mult/res/mult_nogen_icc_s72_2x2_b4x2.dat → julia/examples/mult/res/mult_nogen_icc_s72_2x2_b4x2.dat


julia/mult/res/mult_nogen_icc_s72x2_2x2_b4x2.dat → julia/examples/mult/res/mult_nogen_icc_s72x2_2x2_b4x2.dat


julia/tst/Makefile.mk → julia/examples/old_examples/Makefile.mk


julia/tst/README → julia/examples/old_examples/README


julia/tst/cpu_mult.c → julia/examples/old_examples/cpu_mult.c


julia/tst/gpu_mult.cu → julia/examples/old_examples/gpu_mult.cu


julia/tst/includes/display.c → julia/examples/old_examples/includes/display.c


julia/tst/includes/display.h → julia/examples/old_examples/includes/display.h


julia/tst/includes/sorting.c → julia/examples/old_examples/includes/sorting.c


julia/tst/includes/sorting.h → julia/examples/old_examples/includes/sorting.h


julia/tst/mandelbrot/cpu_cuda_mandelbrot.jl → julia/examples/old_examples/mandelbrot/cpu_cuda_mandelbrot.jl


julia/tst/mandelbrot/cpu_mandelbrot.c → julia/examples/old_examples/mandelbrot/cpu_mandelbrot.c


julia/tst/mandelbrot/cpu_mandelbrot_between.c → julia/examples/old_examples/mandelbrot/cpu_mandelbrot_between.c


julia/tst/mandelbrot/gpu_mandelbrot.cu → julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu


julia/tst/mandelbrot/gpu_mandelbrot_between.cu → julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu


julia/tst/mandelbrot/mandelbrot.c → julia/examples/old_examples/mandelbrot/mandelbrot.c


julia/tst/mandelbrot/mandelbrot.jl → julia/examples/old_examples/mandelbrot/mandelbrot.jl


julia/tst/mandelbrot/mandelbrot_between.c → julia/examples/old_examples/mandelbrot/mandelbrot_between.c


julia/tst/mandelbrot/mandelbrot_def.jl → julia/examples/old_examples/mandelbrot/mandelbrot_def.jl


julia/tst/mandelbrot/mandelbrot_generated.jl → julia/examples/old_examples/mandelbrot/mandelbrot_generated.jl


julia/tst/mult.c → julia/examples/old_examples/mult.c


julia/tst/mult/cpu_cuda_mult.jl → julia/examples/old_examples/mult/cpu_cuda_mult.jl


julia/tst/mult/cpu_mult.c → julia/examples/old_examples/mult/cpu_mult.c


julia/tst/mult/gpu_mult.cu → julia/examples/old_examples/mult/gpu_mult.cu


julia/tst/mult/mult.c → julia/examples/old_examples/mult/mult.c


julia/tst/mult/mult_def.jl → julia/examples/old_examples/mult/mult_def.jl


julia/tst/mult/mult_extern.jl → julia/examples/old_examples/mult/mult_extern.jl


julia/tst/mult/mult_extern_graph.jl → julia/examples/old_examples/mult/mult_extern_graph.jl


julia/tst/mult/mult_generated.jl → julia/examples/old_examples/mult/mult_generated.jl


julia/tst/mult/mult_generated_graph.jl → julia/examples/old_examples/mult/mult_generated_graph.jl


julia/tst/mult/mult_naive.jl → julia/examples/old_examples/mult/mult_naive.jl


julia/tst/nbody/cpu_cuda_nbody.jl → julia/examples/old_examples/nbody/cpu_cuda_nbody.jl


julia/tst/nbody/cpu_nbody.c → julia/examples/old_examples/nbody/cpu_nbody.c


julia/tst/nbody/cpu_nbody_between.c → julia/examples/old_examples/nbody/cpu_nbody_between.c


julia/tst/nbody/gpu_nbody.cu → julia/examples/old_examples/nbody/gpu_nbody.cu


julia/tst/nbody/gpu_nbody_between.cu → julia/examples/old_examples/nbody/gpu_nbody_between.cu


julia/tst/nbody/nbody.c → julia/examples/old_examples/nbody/nbody.c


julia/tst/nbody/nbody.jl → julia/examples/old_examples/nbody/nbody.jl


julia/tst/nbody/nbody_between.c → julia/examples/old_examples/nbody/nbody_between.c


julia/tst/nbody/nbody_def.jl → julia/examples/old_examples/nbody/nbody_def.jl


julia/tst/nbody/nbody_display.jl → julia/examples/old_examples/nbody/nbody_display.jl


julia/tst/nbody/nbody_generated.jl → julia/examples/old_examples/nbody/nbody_generated.jl


+ 58 - 0
julia/examples/variable/Makefile

@@ -0,0 +1,58 @@
+CC=gcc
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
+
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+
+EXTERNLIB=extern_tasks.so
+GENERATEDLIB=generated_tasks.so
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
+LIBPATH=${PWD}/../StarPU.jl/lib
+
+all: ${EXTERNLIB}
+
+variable: variable.c cpu_variable.o #gpu_variable.o
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
+
+%.o: %.c
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+
+${EXTERNLIB}: cpu_variable.c
+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
+
+.PHONY: clean
+
+clean:
+	rm -f variable *.so *.o genc_*.c gencuda_*.cu *.dat
+
+# Performance Tests
+cstarpu.dat: variable
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./variable -0.800671 -0.158392 32 32 4096 4 > $@
+julia_generatedc.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia variable.jl $@
+julia_native.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia variable_native.jl $@
+julia_calllib.dat: ${EXTERNLIB}
+	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia variable.jl julia_calllib.dat
+
+test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat

+ 38 - 0
julia/examples/variable/variable.jl

@@ -0,0 +1,38 @@
+import Libdl
+using StarPU
+
+@target STARPU_CPU
+@codelet function variable(val ::Ref{Float32}) :: Nothing
+    val[] = val[] + 1
+
+    return
+end
+
+starpu_init()
+
+function variable_with_starpu(val ::Ref{Float32}, niter)
+    @starpu_block let
+	hVal = starpu_data_register(val)
+
+	@starpu_sync_tasks for task in (1 : niter)
+                @starpu_async_cl variable(hVal) [STARPU_RW]
+	end
+    end
+end
+
+function display(niter)
+    foo = Ref(0.0f0)
+
+    variable_with_starpu(foo, niter)
+
+    println("variable -> ", foo[])
+    if foo[] == niter
+        println("result is correct")
+    else
+        println("result is incorret")
+    end
+end
+
+display(10)
+
+starpu_shutdown()

+ 26 - 0
julia/examples/variable/variable_native.jl

@@ -0,0 +1,26 @@
+function variable(val ::Ref{Float32}) :: Nothing
+    val[] = val[] + 1
+
+    return
+end
+
+function variable_without_starpu(val ::Ref{Float32}, niter)
+    for i = 1:niter
+        variable(val)
+    end
+end
+
+function display(niter)
+    foo = Ref(0.0f0)
+
+    variable_without_starpu(foo, niter)
+
+    println("variable -> ", foo[])
+    if foo[] == niter
+        println("result is correct")
+    else
+        println("result is incorret")
+    end
+end
+
+display(10)

+ 58 - 0
julia/examples/vector_scal/Makefile

@@ -0,0 +1,58 @@
+CC=gcc
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
+
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+
+EXTERNLIB=extern_tasks.so
+GENERATEDLIB=generated_tasks.so
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
+LIBPATH=${PWD}/../StarPU.jl/lib
+
+all: ${EXTERNLIB}
+
+vector_scal: vector_scal.c cpu_vector_scal.o #gpu_vector_scal.o
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
+
+%.o: %.c
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+
+${EXTERNLIB}: cpu_vector_scal.c
+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
+
+PHONY: clean
+
+clean:
+	rm -f vector_scal *.so *.o genc_*.c gencuda_*.cu *.dat
+
+# Performance Tests
+cstarpu.dat: vector_scal
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./vector_scal > $@
+julia_generatedc.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal.jl $@
+julia_native.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal_native.jl $@
+julia_calllib.dat: ${EXTERNLIB}
+	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal.jl julia_calllib.dat
+
+test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat

+ 42 - 0
julia/examples/vector_scal/cpu_vector_scal.c

@@ -0,0 +1,42 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <starpu.h>
+#include <math.h>
+
+struct params {
+  int32_t m;
+  float k;
+  float l;
+};
+
+float cpu_vector_scal(void *buffers[], void *cl_arg)
+{
+  /* get scalar parameters from cl_arg */
+  struct params *scalars = (struct params *) cl_arg;
+  int m = scalars->m;
+  float k = scalars->k;
+  float l = scalars->l;
+
+  struct starpu_vector_interface *vector = (struct starpu_vector_interface *) buffers[0];
+
+  /* length of the vector */
+  unsigned n = STARPU_VECTOR_GET_NX(vector);
+
+  /* get a pointer to the local copy of the vector : note that we have to
+   * cast it in (float *) since a vector could contain any type of
+   * elements so that the .ptr field is actually a uintptr_t */
+  float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
+
+  /* scale the vector */
+  for (unsigned i = 0; i < n; i++)
+    val[i] = val[i] * k + l + m;
+
+  return 0.0;
+}
+
+char* CPU = "cpu_vector_scal";
+char* GPU = "gpu_vector_scal";
+extern char *starpu_find_function(char *name, char *device) {
+	if (!strcmp(device,"gpu")) return GPU;
+	return CPU;
+}

+ 75 - 0
julia/examples/vector_scal/vector_scal.jl

@@ -0,0 +1,75 @@
+import Libdl
+using StarPU
+using LinearAlgebra
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function vector_scal(m::Int32, v :: Vector{Float32}, k :: Float32, l :: Float32) :: Float32
+
+    N :: Int32 = length(v)
+    # Naive version
+    @parallel for i in (1 : N)
+        v[i] = v[i] * m + l + k
+    end
+end
+
+
+starpu_init()
+
+function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32, l :: Float32)
+    tmin=0
+
+    @starpu_block let
+        hV = starpu_data_register(v)
+        tmin=0
+        perfmodel = StarpuPerfmodel(
+            perf_type = STARPU_HISTORY_BASED,
+            symbol = "history_perf"
+        )
+        cl = StarpuCodelet(
+            cpu_func = CPU_CODELETS["vector_scal"],
+            # cuda_func = CUDA_CODELETS["vector_scal"],
+            #opencl_func="ocl_matrix_mult",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+
+        for i in (1 : 1)
+            t=time_ns()
+            @starpu_sync_tasks begin
+                handles = [hV]
+                task = StarpuTask(cl = cl, handles = handles, cl_arg=[m, k, l])
+                starpu_task_submit(task)
+            end
+            # @starpu_sync_tasks for task in (1:1)
+            #     @starpu_async_cl vector_scal(hV, STARPU_RW, [m, k, l])
+            # end
+            t=time_ns()-t
+            if (tmin==0 || tmin>t)
+                tmin=t
+            end
+        end
+    end
+    return tmin
+end
+
+function compute_times(io,start_dim, step_dim, stop_dim)
+    for size in (start_dim : step_dim : stop_dim)
+        V = Array(rand(Cfloat, size))
+        m :: Int32 = 10
+        k :: Float32 = 2.
+        l :: Float32 = 3.
+        println("INPUT ", V[1:10])
+        mt =  vector_scal_with_starpu(V, m, k, l)
+        println("OUTPUT ", V[1:10])
+        println(io,"$size $mt")
+        println("$size $mt")
+    end
+end
+
+
+io=open(ARGS[1],"w")
+compute_times(io,1024,1024,4096)
+close(io)
+
+starpu_shutdown()
+

+ 0 - 0
julia/setenv.sh


この差分においてかなりの量のファイルが変更されているため、一部のファイルを表示していません