浏览代码

merge trunk@6596:6650

Nathalie Furmento 12 年之前
父节点
当前提交
0df511aa30
共有 66 个文件被更改,包括 597 次插入155 次删除
  1. 15 0
      STARPU-VERSION
  2. 4 0
      configure.ac
  3. 6 3
      doc/chapters/basic-api.texi
  4. 2 2
      examples/Makefile.am
  5. 1 1
      examples/lu/clu.c
  6. 1 1
      examples/lu/clu_implicit.c
  7. 1 1
      examples/lu/clu_implicit_pivot.c
  8. 1 1
      examples/lu/clu_kernels.c
  9. 1 1
      examples/lu/clu_pivot.c
  10. 1 1
      examples/lu/dlu.c
  11. 1 1
      examples/lu/dlu_implicit.c
  12. 1 1
      examples/lu/dlu_implicit_pivot.c
  13. 1 1
      examples/lu/dlu_kernels.c
  14. 1 1
      examples/lu/dlu_pivot.c
  15. 0 0
      examples/lu/lu-double.h
  16. 0 0
      examples/lu/lu-float.h
  17. 1 1
      examples/lu/lu_example_complex_double.c
  18. 1 1
      examples/lu/lu_example_complex_float.c
  19. 1 1
      examples/lu/lu_example_double.c
  20. 1 1
      examples/lu/lu_example_float.c
  21. 1 1
      examples/lu/slu.c
  22. 1 1
      examples/lu/slu_implicit.c
  23. 1 1
      examples/lu/slu_implicit_pivot.c
  24. 1 1
      examples/lu/slu_kernels.c
  25. 1 1
      examples/lu/slu_pivot.c
  26. 1 1
      examples/lu/zlu.c
  27. 1 1
      examples/lu/zlu_implicit.c
  28. 1 1
      examples/lu/zlu_implicit_pivot.c
  29. 1 1
      examples/lu/zlu_kernels.c
  30. 1 1
      examples/lu/zlu_pivot.c
  31. 78 12
      gcc-plugin/src/starpu.c
  32. 51 1
      gcc-plugin/tests/mocks.h
  33. 4 1
      gcc-plugin/tests/opencl.c
  34. 2 2
      mpi/Makefile.am
  35. 0 0
      mpi/examples/mpi_lu/mpi_lu-double.h
  36. 0 0
      mpi/examples/mpi_lu/mpi_lu-float.h
  37. 1 1
      mpi/examples/mpi_lu/pdlu.c
  38. 1 1
      mpi/examples/mpi_lu/pdlu_kernels.c
  39. 1 1
      mpi/examples/mpi_lu/plu_example_double.c
  40. 1 1
      mpi/examples/mpi_lu/plu_example_float.c
  41. 1 1
      mpi/examples/mpi_lu/plu_solve_double.c
  42. 1 1
      mpi/examples/mpi_lu/plu_solve_float.c
  43. 1 1
      mpi/examples/mpi_lu/pslu.c
  44. 1 1
      mpi/examples/mpi_lu/pslu_kernels.c
  45. 1 1
      mpi/examples/mpi_lu/slu_kernels.c
  46. 23 23
      socl/examples/matmul/matmul.c
  47. 19 2
      src/common/fxt.c
  48. 38 36
      src/common/fxt.h
  49. 2 2
      src/core/task.c
  50. 43 4
      src/core/workers.c
  51. 1 0
      src/core/workers.h
  52. 14 14
      src/drivers/cpu/driver_cpu.c
  53. 2 2
      starpufft/Makefile.am
  54. 2 2
      starpufft/cuda_kernels.cu
  55. 2 2
      starpufft/cudaf_kernels.cu
  56. 2 2
      starpufft/examples/test.c
  57. 2 2
      starpufft/examples/test_threads.c
  58. 2 2
      starpufft/examples/testf.c
  59. 2 2
      starpufft/examples/testf_threads.c
  60. 0 0
      starpufft/starpufft-double.h
  61. 0 0
      starpufft/starpufft-float.h
  62. 2 2
      starpufft/starpufft.c
  63. 2 2
      starpufft/starpufftf.c
  64. 1 0
      tests/Makefile.am
  65. 239 0
      tests/main/driver_api/init_run_deinit.c
  66. 4 2
      tests/sched_policies/simple_cpu_gpu_sched.c

+ 15 - 0
STARPU-VERSION

@@ -2,6 +2,21 @@
 
 # Versioning (SONAMEs) for StarPU libraries.
 
+# http://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html#Updating-version-info
+# Here are a set of rules to help you update your library version information:
+# Start with version information of ‘0:0:0’ for each libtool library.
+# Update the version information only immediately before a public
+# release of your software. More frequent updates are unnecessary, and
+# only guarantee that the current interface number gets larger faster.
+# - If the library source code has changed at all since the last
+#   update, then increment revision (‘c:r:a’ becomes ‘c:r+1:a’).
+# - If any interfaces have been added, removed, or changed since the
+#   last update, increment current, and set revision to 0.
+# - If any interfaces have been added since the last public release,
+#   then increment age.
+# - If any interfaces have been removed or changed since the last
+#   public release, then set age to 0. change
+
 # Libtool interface versioning (info "(libtool) Versioning").
 LIBSTARPU_INTERFACE_CURRENT=1	# increment upon ABI change
 LIBSTARPU_INTERFACE_REVISION=0	# increment upon implementation change

+ 4 - 0
configure.ac

@@ -1559,6 +1559,10 @@ AS_IF([test "$have_valid_hwloc" = "yes"], [
 	])
 LDFLAGS="${SAVED_LDFLAGS}"
 CPPFLAGS="${SAVED_CPPFLAGS}"
+
+if test "$have_valid_hwloc" = "no" -a "$hwloc_dir" != "no" ; then
+   AC_MSG_ERROR([hwloc was not found on your system. If the target machine is hyperthreaded the performance may be impacted a lot.  It is strongly recommended to install hwloc. However, if you really want to use StarPU without enabling hwloc, please restart configure by specifying the option '--without-hwloc'.])
+fi
 AC_MSG_CHECKING(whether hwloc should be used)
 AC_MSG_RESULT($have_valid_hwloc)
 AC_SUBST(HWLOC_REQUIRES)

+ 6 - 3
doc/chapters/basic-api.texi

@@ -1343,11 +1343,13 @@ option when configuring StarPU.
 
 @item @code{struct starpu_perfmodel *model} (optional)
 This is a pointer to the task duration performance model associated to this
-codelet. This optional field is ignored when set to @code{NULL}.
+codelet. This optional field is ignored when set to @code{NULL} or
+when its @code{symbol} field is not set.
 
 @item @code{struct starpu_perfmodel *power_model} (optional)
 This is a pointer to the task power consumption performance model associated
-to this codelet. This optional field is ignored when set to @code{NULL}.
+to this codelet. This optional field is ignored when set to
+@code{NULL} or when its @code{symbol} field is not set.
 In the case of parallel codelets, this has to account for all processing units
 involved in the parallel execution.
 
@@ -1825,7 +1827,8 @@ archs will be determined by multiplying by an arch-specific factor.
 
 @item @code{const char *symbol}
 is the symbol name for the performance model, which will be used as
-file name to store the model.
+file name to store the model. It must be set otherwise the model will
+be ignored.
 
 @item @code{double (*cost_model)(struct starpu_buffer_descr *)}
 This field is deprecated. Use instead the @code{cost_function} field.

+ 2 - 2
examples/Makefile.am

@@ -115,8 +115,8 @@ noinst_HEADERS = 				\
 	heat/dw_factolu.h			\
 	lu/xlu.h				\
 	lu/xlu_kernels.h			\
-	lu/float.h				\
-	lu/double.h				\
+	lu/lu-float.h				\
+	lu/lu-double.h				\
 	lu/complex_float.h			\
 	lu/complex_double.h			\
 	lu/blas_complex.h			\

+ 1 - 1
examples/lu/clu.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "complex_float.h"
+#include "complex_-float.h"
 #include "xlu.c"

+ 1 - 1
examples/lu/clu_implicit.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "complex_float.h"
+#include "complex_-float.h"
 #include "xlu_implicit.c"

+ 1 - 1
examples/lu/clu_implicit_pivot.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "complex_float.h"
+#include "complex_-float.h"
 #include "xlu_implicit_pivot.c"

+ 1 - 1
examples/lu/clu_kernels.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "complex_float.h"
+#include "complex_-float.h"
 #include "xlu_kernels.c"

+ 1 - 1
examples/lu/clu_pivot.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "complex_float.h"
+#include "complex_-float.h"
 #include "xlu_pivot.c"

+ 1 - 1
examples/lu/dlu.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "lu-double.h"
 #include "xlu.c"

+ 1 - 1
examples/lu/dlu_implicit.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "lu-double.h"
 #include "xlu_implicit.c"

+ 1 - 1
examples/lu/dlu_implicit_pivot.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "lu-double.h"
 #include "xlu_implicit_pivot.c"

+ 1 - 1
examples/lu/dlu_kernels.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "lu-double.h"
 #include "xlu_kernels.c"

+ 1 - 1
examples/lu/dlu_pivot.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "lu-double.h"
 #include "xlu_pivot.c"

examples/lu/double.h → examples/lu/lu-double.h


examples/lu/float.h → examples/lu/lu-float.h


+ 1 - 1
examples/lu/lu_example_complex_double.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "complex_double.h"
+#include "complex_-double.h"
 #include "lu_example.c"

+ 1 - 1
examples/lu/lu_example_complex_float.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "complex_float.h"
+#include "complex_-float.h"
 #include "lu_example.c"

+ 1 - 1
examples/lu/lu_example_double.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "lu-double.h"
 #include "lu_example.c"

+ 1 - 1
examples/lu/lu_example_float.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "lu-float.h"
 #include "lu_example.c"

+ 1 - 1
examples/lu/slu.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "lu-float.h"
 #include "xlu.c"

+ 1 - 1
examples/lu/slu_implicit.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "lu-float.h"
 #include "xlu_implicit.c"

+ 1 - 1
examples/lu/slu_implicit_pivot.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "lu-float.h"
 #include "xlu_implicit_pivot.c"

+ 1 - 1
examples/lu/slu_kernels.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "lu-float.h"
 #include "xlu_kernels.c"

+ 1 - 1
examples/lu/slu_pivot.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "lu-float.h"
 #include "xlu_pivot.c"

+ 1 - 1
examples/lu/zlu.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "complex_double.h"
+#include "complex_-double.h"
 #include "xlu.c"

+ 1 - 1
examples/lu/zlu_implicit.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "complex_double.h"
+#include "complex_-double.h"
 #include "xlu_implicit.c"

+ 1 - 1
examples/lu/zlu_implicit_pivot.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "complex_double.h"
+#include "complex_-double.h"
 #include "xlu_implicit_pivot.c"

+ 1 - 1
examples/lu/zlu_kernels.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "complex_double.h"
+#include "complex_-double.h"
 #include "xlu_kernels.c"

+ 1 - 1
examples/lu/zlu_pivot.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "complex_double.h"
+#include "complex_-double.h"
 #include "xlu_pivot.c"

+ 78 - 12
gcc-plugin/src/starpu.c

@@ -1187,6 +1187,18 @@ opencl_event_type (void)
   return t;
 }
 
+/* Return an expression that, given the OpenCL error code in ERROR_VAR,
+   returns a string.  */
+
+static tree
+build_opencl_error_string (tree error_var)
+{
+  static tree clstrerror_fn;
+  LOOKUP_STARPU_FUNCTION (clstrerror_fn, "starpu_opencl_error_string");
+
+  return build_call_expr (clstrerror_fn, 1, error_var);
+}
+
 /* Return an error-checking `clSetKernelArg' call for argument ARG, at
    index IDX, of KERNEL.  */
 
@@ -1198,14 +1210,8 @@ build_opencl_set_kernel_arg_call (location_t loc, tree fn,
   gcc_assert (TREE_CODE (fn) == FUNCTION_DECL
 	      && TREE_TYPE (kernel) == opencl_kernel_type ());
 
-  static tree setkernarg_fn, clstrerror_fn;
+  static tree setkernarg_fn;
   LOOKUP_STARPU_FUNCTION (setkernarg_fn, "clSetKernelArg");
-  LOOKUP_STARPU_FUNCTION (clstrerror_fn, "starpu_opencl_error_string");
-
-  local_define (tree, build_errorstr, (tree error_var))
-  {
-    return build_call_expr (clstrerror_fn, 1, error_var);
-  };
 
   tree call = build_call_expr (setkernarg_fn, 4, kernel,
 			       build_int_cst (integer_type_node, idx),
@@ -1225,7 +1231,8 @@ build_opencl_set_kernel_arg_call (location_t loc, tree fn,
   cond = build3 (COND_EXPR, void_type_node,
 		 build2 (NE_EXPR, boolean_type_node,
 			 error_var, integer_zero_node),
-		 build_error_statements (loc, error_var, build_errorstr,
+		 build_error_statements (loc, error_var,
+					 build_opencl_error_string,
 					 "failed to set OpenCL kernel "
 					 "argument %d", idx),
 		 NULL_TREE);
@@ -1286,7 +1293,8 @@ define_opencl_task_implementation (location_t loc, tree task_impl,
     /* No further warnings for this node.  */
     TREE_NO_WARNING (task_impl) = true;
 
-  static tree load_fn, load_kern_fn, wid_fn, devid_fn;
+  static tree load_fn, load_kern_fn, enqueue_kern_fn, wid_fn, devid_fn, clfinish_fn,
+    collect_stats_fn, release_ev_fn;
 
   if (load_fn == NULL_TREE)
     {
@@ -1303,6 +1311,10 @@ define_opencl_task_implementation (location_t loc, tree task_impl,
   LOOKUP_STARPU_FUNCTION (load_kern_fn, "starpu_opencl_load_kernel");
   LOOKUP_STARPU_FUNCTION (wid_fn, "starpu_worker_get_id");
   LOOKUP_STARPU_FUNCTION (devid_fn, "starpu_worker_get_devid");
+  LOOKUP_STARPU_FUNCTION (enqueue_kern_fn, "clEnqueueNDRangeKernel");
+  LOOKUP_STARPU_FUNCTION (clfinish_fn, "clFinish");
+  LOOKUP_STARPU_FUNCTION (collect_stats_fn, "starpu_opencl_collect_stats");
+  LOOKUP_STARPU_FUNCTION (release_ev_fn, "clReleaseEvent");
 
   if (verbose_output_p)
     inform (loc, "defining %qE, with OpenCL kernel %qs from file %qs",
@@ -1363,13 +1375,15 @@ define_opencl_task_implementation (location_t loc, tree task_impl,
 			       load_stmts);
 
       /* Local variables.  */
-      tree kernel_var, queue_var, event_var, group_size_var, ngroups_var;
+      tree kernel_var, queue_var, event_var, group_size_var, ngroups_var,
+	error_var;
 
       kernel_var = local_var (opencl_kernel_type ());
       queue_var = local_var (opencl_command_queue_type ());
       event_var = local_var (opencl_event_type ());
       group_size_var = local_var (size_type_node);
       ngroups_var = local_var (size_type_node);
+      error_var = local_var (integer_type_node);
 
       /* Build `starpu_opencl_load_kernel (...)'.
          TODO: Check return value.  */
@@ -1385,8 +1399,43 @@ define_opencl_task_implementation (location_t loc, tree task_impl,
 					 TREE_STRING_POINTER (kernel)),
 					devid);
 
-      /* TODO: `clSetKernelArg', `clEnqueueNDRangeKernel', etc.  */
-
+      tree enqueue_kern =
+	build_call_expr (enqueue_kern_fn, 9,
+			 queue_var, kernel_var,
+			 build_int_cst (integer_type_node, 1),
+			 null_pointer_node,
+			 build_addr (group_size_var, task_impl),
+			 build_addr (ngroups_var, task_impl),
+			 integer_zero_node,
+			 null_pointer_node,
+			 build_addr (event_var, task_impl));
+      tree enqueue_err =
+	build2 (INIT_EXPR, TREE_TYPE (error_var), error_var, enqueue_kern);
+
+      tree enqueue_cond =
+	build3 (COND_EXPR, void_type_node,
+		build2 (NE_EXPR, boolean_type_node,
+			error_var, integer_zero_node),
+		build_error_statements (loc, error_var,
+					build_opencl_error_string,
+					"failed to enqueue kernel"),
+		NULL_TREE);
+
+      tree clfinish =
+	build_call_expr (clfinish_fn, 1, queue_var);
+
+      tree collect_stats =
+	build_call_expr (collect_stats_fn, 1, event_var);
+
+      tree release_ev =
+	build_call_expr (release_ev_fn, 1, event_var);
+
+      tree enqueue_stmts = NULL_TREE;
+      append_to_statement_list (enqueue_err, &enqueue_stmts);
+      append_to_statement_list (enqueue_cond, &enqueue_stmts);
+
+
+      /* TODO: Build `clFinish', `clReleaseEvent', & co.  */
       /* Put it all together.  */
       tree stmts = NULL_TREE;
       append_to_statement_list (load_cond, &stmts);
@@ -1396,6 +1445,23 @@ define_opencl_task_implementation (location_t loc, tree task_impl,
 								   kernel_var),
 				&stmts);
 
+      /* TODO: Support user-provided values.  */
+      append_to_statement_list (build2 (INIT_EXPR, TREE_TYPE (group_size_var),
+					group_size_var,
+					build_int_cst (integer_type_node, 1)),
+				&stmts);
+      append_to_statement_list (build2 (INIT_EXPR, TREE_TYPE (ngroups_var),
+					ngroups_var,
+					build_int_cst (integer_type_node, 1)),
+				&stmts);
+      append_to_statement_list (build4 (TARGET_EXPR, void_type_node,
+					error_var, enqueue_stmts,
+					NULL_TREE, NULL_TREE),
+				&stmts);
+      append_to_statement_list (clfinish, &stmts);
+      append_to_statement_list (collect_stats, &stmts);
+      append_to_statement_list (release_ev, &stmts);
+
       /* Bind the local vars.  */
       tree vars = chain_trees (kernel_var, queue_var, event_var,
 			       group_size_var, ngroups_var, NULL_TREE);

+ 51 - 1
gcc-plugin/tests/mocks.h

@@ -440,13 +440,25 @@ typedef int cl_command_queue;
 
 extern cl_int clSetKernelArg (cl_kernel, cl_uint, size_t, const void *);
 
+extern cl_int
+clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
+                       cl_kernel        /* kernel */,
+                       cl_uint          /* work_dim */,
+                       const size_t *   /* global_work_offset */,
+                       const size_t *   /* global_work_size */,
+                       const size_t *   /* local_work_size */,
+                       cl_uint          /* num_events_in_wait_list */,
+                       const cl_event * /* event_wait_list */,
+                       cl_event *       /* event */);
+
 #endif
 
 
 /* Number of `load_opencl_from_string', `load_kernel', and `clSetKernelArg'
    calls.  */
 static unsigned int load_opencl_calls, load_opencl_kernel_calls,
-  opencl_set_kernel_arg_calls;
+  opencl_set_kernel_arg_calls, opencl_enqueue_calls, opencl_finish_calls,
+  opencl_collect_stats_calls, opencl_release_event_calls;
 
 struct load_opencl_arguments
 {
@@ -528,6 +540,44 @@ clSetKernelArg (cl_kernel kernel, cl_uint index, size_t size,
   return 0;
 }
 
+cl_int
+clEnqueueNDRangeKernel(cl_command_queue command_queue,
+                       cl_kernel        kernel,
+                       cl_uint          work_dim,
+                       const size_t *   global_work_offset,
+                       const size_t *   global_work_size,
+                       const size_t *   local_work_size,
+                       cl_uint          num_events_in_wait_list,
+                       const cl_event * event_wait_list,
+                       cl_event *       event)
+{
+  assert (*global_work_size == 1 && *local_work_size == 1);
+  opencl_enqueue_calls++;
+  return 0;
+}
+
+cl_int
+clFinish (cl_command_queue command_queue)
+{
+  opencl_finish_calls++;
+  return 0;
+}
+
+cl_int
+starpu_opencl_collect_stats (cl_event event)
+{
+  opencl_collect_stats_calls++;
+  return 0;
+}
+
+cl_int
+clReleaseEvent (cl_event event)
+{
+  opencl_release_event_calls++;
+  return 0;
+}
+
+
 const char *
 starpu_opencl_error_string (cl_int s)
 {

+ 4 - 1
gcc-plugin/tests/opencl.c

@@ -64,6 +64,9 @@ main ()
   assert (load_opencl_calls == 1);
   assert (load_opencl_kernel_calls == 3);
   assert (opencl_set_kernel_arg_calls == 3 * 2);
-
+  assert (opencl_enqueue_calls == 3);
+  assert (opencl_finish_calls == 3);
+  assert (opencl_release_event_calls == 3);
+  assert (opencl_collect_stats_calls == 3);
   return EXIT_SUCCESS;
 }

+ 2 - 2
mpi/Makefile.am

@@ -29,8 +29,8 @@ BUILT_SOURCES =
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
 EXTRA_DIST = 					\
-	examples/mpi_lu/float.h			\
-	examples/mpi_lu/double.h		\
+	examples/mpi_lu/mpi_lu-float.h		\
+	examples/mpi_lu/mpi_lu-double.h		\
 	examples/mpi_lu/plu_example.c		\
 	examples/mpi_lu/plu_solve.c		\
 	examples/mpi_lu/pxlu.h			\

mpi/examples/mpi_lu/double.h → mpi/examples/mpi_lu/mpi_lu-double.h


mpi/examples/mpi_lu/float.h → mpi/examples/mpi_lu/mpi_lu-float.h


+ 1 - 1
mpi/examples/mpi_lu/pdlu.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "mpi_lu-double.h"
 #include "pxlu.c"

+ 1 - 1
mpi/examples/mpi_lu/pdlu_kernels.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "mpi_lu-double.h"
 #include "pxlu_kernels.c"

+ 1 - 1
mpi/examples/mpi_lu/plu_example_double.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "mpi_lu-double.h"
 #include "plu_example.c"

+ 1 - 1
mpi/examples/mpi_lu/plu_example_float.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "mpi_lu-float.h"
 #include "plu_example.c"

+ 1 - 1
mpi/examples/mpi_lu/plu_solve_double.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "mpi_lu-double.h"
 #include "plu_solve.c"

+ 1 - 1
mpi/examples/mpi_lu/plu_solve_float.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "mpi_lu-float.h"
 #include "plu_solve.c"

+ 1 - 1
mpi/examples/mpi_lu/pslu.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "mpi_lu-float.h"
 #include "pxlu.c"

+ 1 - 1
mpi/examples/mpi_lu/pslu_kernels.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "mpi_lu-float.h"
 #include "pxlu_kernels.c"

+ 1 - 1
mpi/examples/mpi_lu/slu_kernels.c

@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "mpi_lu-float.h"
 #include "xlu_kernels.c"

+ 23 - 23
socl/examples/matmul/matmul.c

@@ -34,9 +34,9 @@
 #define TYPE float
 
 // Basic Matrix dimensions
-#define WA (1024L * BLOCK_SIZE) // Matrix A width
+#define WA (128L * BLOCK_SIZE) // Matrix A width
 #define HA (512L * BLOCK_SIZE) // Matrix A height
-#define WB (1024L * BLOCK_SIZE) // Matrix B width
+#define WB (128L * BLOCK_SIZE) // Matrix B width
 #define HB WA  // Matrix B height
 #define WC WB  // Matrix C width 
 #define HC HA  // Matrix C height
@@ -115,10 +115,8 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 	}
 }
 
-#define shrLog(...) fprintf(stderr, __VA_ARGS__);
-
 // Round Up Division function
-size_t shrRoundUp(int group_size, int global_size) {
+size_t roundUp(int group_size, int global_size) {
 	int r = global_size % group_size;
 	if(r == 0) {
 		return global_size;
@@ -127,18 +125,18 @@ size_t shrRoundUp(int group_size, int global_size) {
 	}
 }
 
-void fillArray(TYPE* pfData, int iSize) {
+void fillArray(TYPE* data, int size) {
 	int i;
 	const TYPE fScale = (TYPE)(1.0f / (float)RAND_MAX);
-	for (i = 0; i < iSize; ++i) {
-		pfData[i] = fScale * rand();
+	for (i = 0; i < size; ++i) {
+		data[i] = fScale * rand();
 	}
 }
 
-void shrPrintArray(float* pfData, int iSize) {
+void printArray(float* data, int size) {
 	int i;
-	for (i = 0; i < iSize; ++i) {
-		shrLog("%d: %.3f\n", i, pfData[i]);
+	for (i = 0; i < size; ++i) {
+		printf("%d: %.3f\n", i, data[i]);
 	}
 }
 
@@ -222,8 +220,10 @@ int main(int argc, const char** argv) {
 	parse_args(argc, argv);
 
 	check(clGetPlatformIDs(5, platforms, &platform_count));
-	if (platform_count == 0)
-		error("No platform found\n");
+	if (platform_count == 0) {
+		printf("No platform found\n");
+		exit(77);
+	}
 
 	cl_uint device_count;
 	cl_uint devs[platform_count];
@@ -350,7 +350,7 @@ int main(int argc, const char** argv) {
 		check(clSetKernelArg(multiplicationKernel[p], 4, sizeof(cl_mem), (void *) &d_B[d]));
 		check(clSetKernelArg(multiplicationKernel[p], 5, sizeof(cl_mem), (void *) &d_C[i]));
 
-		size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE,WC), shrRoundUp(BLOCK_SIZE,workSize[i])};
+		size_t globalWorkSize[] = {roundUp(BLOCK_SIZE,WC), roundUp(BLOCK_SIZE,workSize[i])};
 
 		check(clEnqueueNDRangeKernel(commandQueue[p][dev], multiplicationKernel[p], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]));
 
@@ -430,26 +430,26 @@ int main(int argc, const char** argv) {
 	return 0;
 }
 
-void printDiff(TYPE *data1, TYPE *data2, int width, int height, int iListLength, TYPE fListTol) {
-	shrLog("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
+void printDiff(TYPE *data1, TYPE *data2, int width, int height, int listLength, TYPE listTol) {
+	printf("Listing first %d Differences > %.6f...\n", listLength, listTol);
 	int i,j,k;
 	int error_count=0;
 	for (j = 0; j < height; j++) {
-		if (error_count < iListLength) {
-			shrLog("\n  Row %d:\n", j);
+		if (error_count < listLength) {
+			printf("\n  Row %d:\n", j);
 		}
 		for (i = 0; i < width; i++) {
 			k = j * width + i;
-			float fDiff = fabs(data1[k] - data2[k]);
-			if (fDiff > fListTol) {                
-				if (error_count < iListLength) {
-					shrLog("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
+			float diff = fabs(data1[k] - data2[k]);
+			if (diff > listTol) {                
+				if (error_count < listLength) {
+					printf("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], diff);
 				}
 				error_count++;
 			}
 		}
 	}
-	shrLog(" \n  Total Errors = %d\n\n", error_count);
+	printf(" \n  Total Errors = %d\n\n", error_count);
 }
 
 /**

+ 19 - 2
src/common/fxt.c

@@ -29,6 +29,10 @@
 #include <windows.h>
 #endif
 
+#ifdef __FreeBSD__
+#include <sys/thr.h> /* for thr_self() */
+#endif
+
 #define _STARPU_PROF_BUFFER_SIZE  (8*1024*1024)
 
 static char _STARPU_PROF_FILE_USER[128];
@@ -38,6 +42,19 @@ static int _starpu_written = 0;
 
 static int _starpu_id;
 
+long _starpu_gettid()
+{
+#if defined(__linux__)
+	return syscall(SYS_gettid);
+#elif defined(__FreeBSD__)
+	long tid;
+	thr_self(&tid);
+	return tid;
+#else
+	return (long) pthread_self();
+#endif
+}
+
 static void _starpu_profile_set_tracefile(void *last, ...)
 {
 	va_list vl;
@@ -78,7 +95,7 @@ void _starpu_start_fxt_profiling(void)
 		_starpu_profile_set_tracefile(NULL);
 	}
 
-	threadid = syscall(SYS_gettid);
+	threadid = _starpu_gettid();
 
 	atexit(_starpu_stop_fxt_profiling);
 
@@ -140,7 +157,7 @@ void _starpu_stop_fxt_profiling(void)
 
 void _starpu_fxt_register_thread(unsigned cpuid)
 {
-	FUT_DO_PROBE2(FUT_NEW_LWP_CODE, cpuid, syscall(SYS_gettid));
+	FUT_DO_PROBE2(FUT_NEW_LWP_CODE, cpuid, _starpu_gettid());
 }
 
 #endif // STARPU_USE_FXT

+ 38 - 36
src/common/fxt.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011 Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -107,6 +107,8 @@
 #include <fxt/fxt.h>
 #include <fxt/fut.h>
 
+long _starpu_gettid(void);
+
 /* Initialize the FxT library. */
 void _starpu_start_fxt_profiling(void);
 
@@ -178,13 +180,13 @@ do {									\
 
 /* workerkind = _STARPU_FUT_CPU_KEY for instance */
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)			\
-	FUT_DO_PROBE2(_STARPU_FUT_NEW_MEM_NODE, nodeid, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_NEW_MEM_NODE, nodeid, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_INIT_START(workerkind, devid, memnode)	\
-	FUT_DO_PROBE4(_STARPU_FUT_WORKER_INIT_START, workerkind, devid, memnode, syscall(SYS_gettid));
+	FUT_DO_PROBE4(_STARPU_FUT_WORKER_INIT_START, workerkind, devid, memnode, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_INIT_END				\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_INIT_END, syscall(SYS_gettid));
+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid());
 
 #define _STARPU_TRACE_START_CODELET_BODY(job)				\
 do {									\
@@ -192,10 +194,10 @@ do {									\
 	if (model_name)                                                 \
 	{								\
 		/* we include the symbol name */			\
-		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, syscall(SYS_gettid), 1, model_name); \
+		_STARPU_FUT_DO_PROBE3STR(_STARPU_FUT_START_CODELET_BODY, (job), _starpu_gettid(), 1, model_name); \
 	}								\
 	else {                                                          \
-		FUT_DO_PROBE4(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, syscall(SYS_gettid), 0); \
+		FUT_DO_PROBE3(_STARPU_FUT_START_CODELET_BODY, (job), _starpu_gettid(), 0); \
 	}								\
 } while(0);
 
@@ -203,35 +205,35 @@ do {									\
 do {									\
 	const size_t job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
 	const uint32_t job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
-	FUT_DO_PROBE5(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype), syscall(SYS_gettid));	\
+	FUT_DO_PROBE5(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype), _starpu_gettid());	\
 } while(0);
 
 #define _STARPU_TRACE_START_CALLBACK(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_START_CALLBACK, job, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_START_CALLBACK, job, _starpu_gettid());
 
 #define _STARPU_TRACE_END_CALLBACK(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_END_CALLBACK, job, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_END_CALLBACK, job, _starpu_gettid());
 
 #define _STARPU_TRACE_JOB_PUSH(task, prio)	\
-	FUT_DO_PROBE3(_STARPU_FUT_JOB_PUSH, task, prio, syscall(SYS_gettid));
+	FUT_DO_PROBE3(_STARPU_FUT_JOB_PUSH, task, prio, _starpu_gettid());
 
 #define _STARPU_TRACE_JOB_POP(task, prio)	\
-	FUT_DO_PROBE3(_STARPU_FUT_JOB_POP, task, prio, syscall(SYS_gettid));
+	FUT_DO_PROBE3(_STARPU_FUT_JOB_POP, task, prio, _starpu_gettid());
 
 #define _STARPU_TRACE_UPDATE_TASK_CNT(counter)	\
-	FUT_DO_PROBE2(_STARPU_FUT_UPDATE_TASK_CNT, counter, syscall(SYS_gettid))
+	FUT_DO_PROBE2(_STARPU_FUT_UPDATE_TASK_CNT, counter, _starpu_gettid())
 
 #define _STARPU_TRACE_START_FETCH_INPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_START_FETCH_INPUT, job, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_START_FETCH_INPUT, job, _starpu_gettid());
 
 #define _STARPU_TRACE_END_FETCH_INPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_END_FETCH_INPUT, job, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_END_FETCH_INPUT, job, _starpu_gettid());
 
 #define _STARPU_TRACE_START_PUSH_OUTPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_START_PUSH_OUTPUT, job, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_START_PUSH_OUTPUT, job, _starpu_gettid());
 
 #define _STARPU_TRACE_END_PUSH_OUTPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_END_PUSH_OUTPUT, job, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_END_PUSH_OUTPUT, job, _starpu_gettid());
 
 #define _STARPU_TRACE_TAG(tag, job)	\
 	FUT_DO_PROBE2(_STARPU_FUT_TAG, tag, (job)->job_id)
@@ -251,10 +253,10 @@ do {										\
         const char *model_name = _starpu_get_model_name((job));                       \
 	if (model_name)					                        \
 	{									\
-		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_TASK_DONE, (job)->job_id, syscall(SYS_gettid), (long unsigned)exclude_from_dag, 1, model_name);\
+		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_TASK_DONE, (job)->job_id, _starpu_gettid(), (long unsigned)exclude_from_dag, 1, model_name);\
 	}									\
 	else {									\
-		FUT_DO_PROBE4(_STARPU_FUT_TASK_DONE, (job)->job_id, syscall(SYS_gettid), (long unsigned)exclude_from_dag, 0);\
+		FUT_DO_PROBE4(_STARPU_FUT_TASK_DONE, (job)->job_id, _starpu_gettid(), (long unsigned)exclude_from_dag, 0);\
 	}									\
 } while(0);
 
@@ -264,10 +266,10 @@ do {										\
         const char *model_name = _starpu_get_model_name((job));                       \
 	if (model_name)                                                         \
 	{									\
-          _STARPU_FUT_DO_PROBE3STR(_STARPU_FUT_TAG_DONE, (tag)->id, syscall(SYS_gettid), 1, model_name); \
+          _STARPU_FUT_DO_PROBE3STR(_STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 1, model_name); \
 	}									\
 	else {									\
-		FUT_DO_PROBE3(_STARPU_FUT_TAG_DONE, (tag)->id, syscall(SYS_gettid), 0);\
+		FUT_DO_PROBE3(_STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 0);\
 	}									\
 } while(0);
 
@@ -290,56 +292,56 @@ do {										\
 	FUT_DO_PROBE2(_STARPU_FUT_WORK_STEALING, empty_q, victim_q)
 
 #define _STARPU_TRACE_WORKER_DEINIT_START			\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_DEINIT_START, syscall(SYS_gettid));
+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_DEINIT_START, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_DEINIT_END(workerkind)		\
-	FUT_DO_PROBE2(_STARPU_FUT_WORKER_DEINIT_END, workerkind, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_WORKER_DEINIT_END, workerkind, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_SLEEP_START	\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SLEEP_START, syscall(SYS_gettid));
+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SLEEP_START, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_SLEEP_END	\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SLEEP_END, syscall(SYS_gettid));
+	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SLEEP_END, _starpu_gettid());
 
 #define _STARPU_TRACE_USER_DEFINED_START	\
-	FUT_DO_PROBE1(_STARPU_FUT_USER_DEFINED_START, syscall(SYS_gettid));
+	FUT_DO_PROBE1(_STARPU_FUT_USER_DEFINED_START, _starpu_gettid());
 
 #define _STARPU_TRACE_USER_DEFINED_END		\
-	FUT_DO_PROBE1(_STARPU_FUT_USER_DEFINED_END, syscall(SYS_gettid));
+	FUT_DO_PROBE1(_STARPU_FUT_USER_DEFINED_END, _starpu_gettid());
 
 #define _STARPU_TRACE_START_ALLOC(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_START_ALLOC, memnode, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_START_ALLOC, memnode, _starpu_gettid());
 	
 #define _STARPU_TRACE_END_ALLOC(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_END_ALLOC, memnode, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_END_ALLOC, memnode, _starpu_gettid());
 
 #define _STARPU_TRACE_START_ALLOC_REUSE(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_START_ALLOC_REUSE, memnode, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_START_ALLOC_REUSE, memnode, _starpu_gettid());
 	
 #define _STARPU_TRACE_END_ALLOC_REUSE(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_END_ALLOC_REUSE, memnode, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_END_ALLOC_REUSE, memnode, _starpu_gettid());
 	
 #define _STARPU_TRACE_START_MEMRECLAIM(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_START_MEMRECLAIM, memnode, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_START_MEMRECLAIM, memnode, _starpu_gettid());
 	
 #define _STARPU_TRACE_END_MEMRECLAIM(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_END_MEMRECLAIM, memnode, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_END_MEMRECLAIM, memnode, _starpu_gettid());
 	
 /* We skip these events becasue they are called so often that they cause FxT to
  * fail and make the overall trace unreadable anyway. */
 #define _STARPU_TRACE_START_PROGRESS(memnode)		\
 	do {} while (0);
-//	FUT_DO_PROBE2(_STARPU_FUT_START_PROGRESS, memnode, syscall(SYS_gettid));
+//	FUT_DO_PROBE2(_STARPU_FUT_START_PROGRESS, memnode, _starpu_gettid());
 
 #define _STARPU_TRACE_END_PROGRESS(memnode)		\
 	do {} while (0);
-	//FUT_DO_PROBE2(_STARPU_FUT_END_PROGRESS, memnode, syscall(SYS_gettid));
+	//FUT_DO_PROBE2(_STARPU_FUT_END_PROGRESS, memnode, _starpu_gettid());
 	
 #define _STARPU_TRACE_USER_EVENT(code)			\
-	FUT_DO_PROBE2(_STARPU_FUT_USER_EVENT, code, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_USER_EVENT, code, _starpu_gettid());
 
 #define _STARPU_TRACE_SET_PROFILING(status)		\
-	FUT_DO_PROBE2(_STARPU_FUT_SET_PROFILING, status, syscall(SYS_gettid));
+	FUT_DO_PROBE2(_STARPU_FUT_SET_PROFILING, status, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL			\
 	FUT_DO_PROBE0(_STARPU_FUT_TASK_WAIT_FOR_ALL)

+ 2 - 2
src/core/task.c

@@ -417,10 +417,10 @@ int starpu_task_submit(struct starpu_task *task)
 
 		_starpu_detect_implicit_data_deps(task);
 
-		if (task->cl->model)
+		if (task->cl->model && task->cl->model->symbol)
 			_starpu_load_perfmodel(task->cl->model);
 
-		if (task->cl->power_model)
+		if (task->cl->power_model && task->cl->power_model->symbol)
 			_starpu_load_perfmodel(task->cl->power_model);
 	}
 

+ 43 - 4
src/core/workers.c

@@ -229,6 +229,9 @@ static unsigned _starpu_may_launch_driver(struct starpu_conf *conf,
 
 		switch (d->type)
 		{
+		case STARPU_CPU_WORKER:
+			if (d->id.cpu_id == conf->not_launched_drivers[i].id.cpu_id)
+				return 0;
 		case STARPU_CUDA_WORKER:
 			if (d->id.cuda_id == conf->not_launched_drivers[i].id.cuda_id)
 				return 0;
@@ -257,7 +260,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 	unsigned nworkers = config->topology.nworkers;
 
 	/* Launch workers asynchronously (except for SPUs) */
-	unsigned cuda = 0;
+	unsigned cpu = 0, cuda = 0;
 	unsigned worker;
 	for (worker = 0; worker < nworkers; worker++)
 	{
@@ -273,6 +276,8 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 		workerarg->worker_size = 1;
 		workerarg->combined_workerid = workerarg->workerid;
 		workerarg->current_rank = 0;
+		workerarg->run_by_starpu = 1;
+
 		workerarg->has_prev_init = 0;
 		/* mutex + cond only for the local list */
 		/* we have a single local list */
@@ -304,8 +309,17 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 			case STARPU_CPU_WORKER:
 				workerarg->set = NULL;
 				workerarg->worker_is_initialized = 0;
-				pthread_create(&workerarg->worker_thread,
-						NULL, _starpu_cpu_worker, workerarg);
+				driver.id.cpu_id = cpu;
+				if (_starpu_may_launch_driver(config->conf, &driver))
+				{
+					pthread_create(&workerarg->worker_thread,
+							NULL, _starpu_cpu_worker, workerarg);
+				}
+				else
+				{
+					workerarg->run_by_starpu = 0;
+				}
+				cpu++;
 				break;
 #endif
 #ifdef STARPU_USE_CUDA
@@ -318,6 +332,10 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 					pthread_create(&workerarg->worker_thread,
 						       NULL, _starpu_cuda_worker, workerarg);
 				}
+				else
+				{
+					workerarg->run_by_starpu = 0;
+				}
 				cuda++;
 				break;
 #endif
@@ -325,7 +343,10 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 			case STARPU_OPENCL_WORKER:
 				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
 				if (!_starpu_may_launch_driver(config->conf, &driver))
+				{
+					workerarg->run_by_starpu = 0;
 					break;
+				}
 				workerarg->set = NULL;
 				workerarg->worker_is_initialized = 0;
 				pthread_create(&workerarg->worker_thread,
@@ -367,6 +388,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 		}
 	}
 
+	cpu  = 0;
 	cuda = 0;
 	for (worker = 0; worker < nworkers; worker++)
 	{
@@ -377,10 +399,17 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 		switch (workerarg->arch)
 		{
 			case STARPU_CPU_WORKER:
+				driver.id.cpu_id = cpu;
+				if (!_starpu_may_launch_driver(config->conf, &driver))
+				{
+					cpu++;
+					break;
+				}
 				_STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
 				while (!workerarg->worker_is_initialized)
 					_STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
 				_STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+				cpu++;
 				break;
 			case STARPU_CUDA_WORKER:
 				driver.id.cuda_id = cuda;
@@ -667,6 +696,9 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *config)
 		}
 		else
 		{
+			if (!worker->run_by_starpu)
+				goto out;
+
 			if (!pthread_equal(pthread_self(), worker->worker_thread))
 			{
 				status = pthread_join(worker->worker_thread, NULL);
@@ -679,6 +711,7 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *config)
 			}
 		}
 
+out:
 		STARPU_ASSERT(starpu_task_list_empty(&worker->local_tasks));
 		_starpu_job_list_delete(worker->terminated_jobs);
 	}
@@ -1082,6 +1115,9 @@ struct _starpu_sched_ctx* _starpu_get_initial_sched_ctx(void)
 	return &config.sched_ctxs[0];
 }
 
+#ifdef STARPU_USE_CPU
+extern int _starpu_run_cpu(struct starpu_driver *);
+#endif
 #ifdef STARPU_USE_CUDA
 extern int _starpu_run_cuda(struct starpu_driver *);
 #endif
@@ -1097,6 +1133,10 @@ starpu_driver_run(struct starpu_driver *d)
 
 	switch (d->type)
 	{
+#ifdef STARPU_USE_CPU
+	case STARPU_CPU_WORKER:
+		return _starpu_run_cpu(d);
+#endif
 #ifdef STARPU_USE_CUDA
 	case STARPU_CUDA_WORKER:
 		return _starpu_run_cuda(d);
@@ -1105,7 +1145,6 @@ starpu_driver_run(struct starpu_driver *d)
 	case STARPU_OPENCL_WORKER:
 		return _starpu_run_opencl(d);
 #endif
-	case STARPU_CPU_WORKER:    /* Not supported yet */
 	case STARPU_GORDON_WORKER: /* Not supported yet */
 	default:
 		return -EINVAL;

+ 1 - 0
src/core/workers.h

@@ -81,6 +81,7 @@ struct _starpu_worker
 	enum _starpu_worker_status status; /* what is the worker doing now ? (eg. CALLBACK) */
 	char name[48];
 	char short_name[10];
+	unsigned run_by_starpu; /* Is this run by StarPU or directly by the application ? */
 
 	struct _starpu_sched_ctx **sched_ctx;
 	unsigned nsched_ctxs; /* the no of contexts a worker belongs to*/

+ 14 - 14
src/drivers/cpu/driver_cpu.c

@@ -101,7 +101,6 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 static struct _starpu_worker*
 _starpu_get_worker_from_driver(struct starpu_driver *d)
 {
-#if 1
 	int workers[d->id.cpu_id + 1];
 	int nworkers;
 	nworkers = starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, d->id.cpu_id+1);
@@ -109,19 +108,6 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 		return NULL; // No device was found.
 	
 	return _starpu_get_worker_struct(workers[d->id.cpu_id]);
-#else
-	int workers[STARPU_NMAXWORKERS];
-	int nworkers;
-	nworkers = starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, STARPU_NMAXWORKERS);
-	STARPU_ASSERT(nworkers > 0);
-	int i;
-	for (i = 0; i < nworkers; i++)
-	{
-		fprintf(stderr, "\tCPU %d\n", i);
-	}
-
-	return _starpu_get_worker_struct(workers[d->id.cpu_id]);
-#endif
 }
 
 int _starpu_cpu_driver_init(struct starpu_driver *d)
@@ -345,3 +331,17 @@ _starpu_cpu_worker(void *arg)
 
 	return NULL;
 }
+
+int _starpu_run_cpu(struct starpu_driver *d)
+{
+	STARPU_ASSERT(d && d->type == STARPU_CPU_WORKER);
+
+	struct _starpu_worker *worker = _starpu_get_worker_from_driver(d);
+	STARPU_ASSERT(worker);
+
+	worker->set = NULL;
+	worker->worker_is_initialized = 0;
+	_starpu_cpu_worker(worker);
+
+	return 0;
+}

+ 2 - 2
starpufft/Makefile.am

@@ -19,8 +19,8 @@ AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include $(STARPU_CUDA_C
 lib_LTLIBRARIES = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la
 
 EXTRA_DIST =			\
-	float.h			\
-	double.h		\
+	starpufft-float.h	\
+	starpufft-double.h	\
 	cudax_kernels.h		\
 	starpufftx.c		\
 	starpufftx1d.c		\

+ 2 - 2
starpufft/cuda_kernels.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "starpufft-double.h"
 #include "cudax_kernels.cu"

+ 2 - 2
starpufft/cudaf_kernels.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "starpufft-float.h"
 #include "cudax_kernels.cu"

+ 2 - 2
starpufft/examples/test.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "starpufft-double.h"
 #include "testx.c"

+ 2 - 2
starpufft/examples/test_threads.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "starpufft-double.h"
 #include "testx_threads.c"

+ 2 - 2
starpufft/examples/testf.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "starpufft-float.h"
 #include "testx.c"

+ 2 - 2
starpufft/examples/testf_threads.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "starpufft-float.h"
 #include "testx_threads.c"

starpufft/double.h → starpufft/starpufft-double.h


starpufft/float.h → starpufft/starpufft-float.h


+ 2 - 2
starpufft/starpufft.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
+#include "starpufft-double.h"
 #include "starpufftx.c"

+ 2 - 2
starpufft/starpufftf.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -15,5 +15,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
+#include "starpufft-float.h"
 #include "starpufftx.c"

+ 1 - 0
tests/Makefile.am

@@ -115,6 +115,7 @@ noinst_PROGRAMS =				\
 	starpu_machine_display			\
 	main/deprecated_func			\
 	main/deprecated_buffer			\
+	main/driver_api/init_run_deinit         \
 	main/restart				\
 	main/execute_on_a_specific_worker	\
 	main/insert_task			\

+ 239 - 0
tests/main/driver_api/init_run_deinit.c

@@ -0,0 +1,239 @@
+#include <starpu.h>
+#include <starpu_opencl.h>
+
+#include "../../helper.h"
+
+#define NTASKS 8
+
+#if defined(STARPU_USE_CPU) || defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+static void
+dummy(void *buffers[], void *args)
+{
+	(void) buffers;
+	(*(int *)args)++;
+}
+
+static struct starpu_codelet cl =
+{
+	.cpu_funcs    = { dummy, NULL },
+	.cuda_funcs   = { dummy, NULL },
+	.opencl_funcs = { dummy, NULL },
+	.nbuffers     = 0
+};
+
+static void
+init_driver(struct starpu_driver *d)
+{
+	int ret;
+	ret = starpu_driver_init(d);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_driver_init");
+}
+
+static void
+run(struct starpu_task *task, struct starpu_driver *d)
+{
+	int ret;
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	ret = starpu_driver_run_once(d);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_driver_run_once");
+}
+
+static void
+deinit_driver(struct starpu_driver *d)
+{
+	int ret; 
+	ret = starpu_driver_deinit(d);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_driver_deinit");
+}
+#endif /* STARPU_USE_CPU || STARPU_USE_CUDA || STARPU_USE_OPENCL */
+
+#ifdef STARPU_USE_CPU
+static int
+test_cpu(void)
+{
+	int var = 0, ret;
+	struct starpu_conf conf;
+
+	ret = starpu_conf_init(&conf);
+	if (ret == -EINVAL)
+		return 1;
+	
+
+	struct starpu_driver d =
+	{
+		.type = STARPU_CPU_WORKER,
+		.id.cpu_id = 0
+	};
+
+	conf.not_launched_drivers = &d;
+	conf.n_not_launched_drivers = 1;
+
+	ret = starpu_init(&conf);
+	if (ret == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+
+	init_driver(&d);
+	int i;	
+	for (i = 0; i < NTASKS; i++)
+	{
+		struct starpu_task *task;
+		task = starpu_task_create();
+		cl.where = STARPU_CPU;
+		task->cl = &cl;
+		task->cl_arg = &var;
+
+		run(task, &d);
+	}
+	deinit_driver(&d);
+
+	starpu_task_wait_for_all();
+	starpu_shutdown();
+
+	fprintf(stderr, "[CPU] Var is %d\n", var);
+	return !!(var != NTASKS);
+}
+#endif /* STARPU_USE_CPU */
+
+#ifdef STARPU_USE_CUDA
+static int
+test_cuda(void)
+{
+	int var = 0, ret;
+	struct starpu_conf conf;
+
+	ret = starpu_conf_init(&conf);
+	if (ret == -EINVAL)
+		return 1;
+	
+
+	struct starpu_driver d =
+	{
+		.type = STARPU_CUDA_WORKER,
+		.id.cuda_id = 0
+	};
+
+	conf.ncuda = 1;
+	conf.nopencl = 0;
+	conf.not_launched_drivers = &d;
+	conf.n_not_launched_drivers = 1;
+
+	ret = starpu_init(&conf);
+	if (ret == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+
+
+	init_driver(&d);
+	int i;	
+	for (i = 0; i < NTASKS; i++)
+	{
+		struct starpu_task *task;
+		task = starpu_task_create();
+		cl.where = STARPU_CUDA;
+		task->cl = &cl;
+		task->cl_arg = &var;
+
+		run(task, &d);
+	}
+	deinit_driver(&d);
+
+	starpu_task_wait_for_all();
+	starpu_shutdown();
+
+	fprintf(stderr, "[CUDA] Var is %d\n", var);
+	return !!(var != NTASKS);
+}
+#endif /* STARPU_USE_CUDA */
+
+#ifdef STARPU_USE_OPENCL
+static int
+test_opencl(void)
+{
+        cl_int err;
+        cl_platform_id platform;
+        cl_uint dummy;
+
+        err = clGetPlatformIDs(1, &platform, &dummy);
+        if (err != CL_SUCCESS)
+        {   
+                fprintf(stderr, "%s:%d\n", __func__, __LINE__);
+		return 1;
+	}
+
+	cl_device_id device_id;
+        err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
+        if (err != CL_SUCCESS)
+        {   
+                fprintf(stderr, "%s:%d\n", __func__, __LINE__);
+		return 1;
+	}
+
+	int var = 0, ret;
+	struct starpu_conf conf;
+
+	ret = starpu_conf_init(&conf);
+	if (ret == -EINVAL)
+		return 1;
+	
+	struct starpu_driver d =
+	{
+		.type = STARPU_OPENCL_WORKER,
+		.id.opencl_id = device_id
+	};
+
+	conf.ncuda = 0;
+	conf.nopencl = 1;
+	conf.not_launched_drivers = &d;
+	conf.n_not_launched_drivers = 1;
+
+	ret = starpu_init(&conf);
+	if (ret == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+
+
+	init_driver(&d);
+	int i;	
+	for (i = 0; i < NTASKS; i++)
+	{
+		struct starpu_task *task;
+		task = starpu_task_create();
+		cl.where = STARPU_OPENCL;
+		task->cl = &cl;
+		task->cl_arg = &var;
+
+		run(task, &d);
+	}
+	deinit_driver(&d);
+
+
+	starpu_task_wait_for_all();
+	starpu_shutdown();
+
+	FPRINTF(stderr, "[OpenCL] Var is %d\n", var);
+	return !!(var != NTASKS);
+}
+#endif /* STARPU_USE_OPENCL */
+
+int
+main(void)
+{
+	int ret = STARPU_TEST_SKIPPED;
+
+#ifdef STARPU_USE_CPU
+	ret = test_cpu();
+	if (ret == 1)
+		return ret;
+#endif
+#ifdef STARPU_USE_CUDA
+	ret = test_cuda();
+	if (ret == 1)
+		return ret;
+#endif
+#ifdef STARPU_USE_OPENCL
+	ret = test_opencl();
+	if (ret == 1)
+		return ret;
+#endif
+
+	return ret;
+}

+ 4 - 2
tests/sched_policies/simple_cpu_gpu_sched.c

@@ -88,11 +88,13 @@ gpu_task_gpu(struct starpu_task *task,
 
 static struct starpu_perfmodel model_cpu_task = 
 {
-	.type = STARPU_PER_ARCH
+	.type = STARPU_PER_ARCH,
+	.symbol = "model_cpu_task"
 };
 static struct starpu_perfmodel model_gpu_task = 
 {
-	.type = STARPU_PER_ARCH
+	.type = STARPU_PER_ARCH,
+	.symbol = "model_gpu_task"
 };
 
 static void