Browse Source

merge again

Andra Hugo 13 years ago
parent
commit
c91789a855

+ 82 - 1
doc/chapters/advanced-examples.texi

@@ -10,10 +10,11 @@
 * Using multiple implementations of a codelet::
 * Enabling implementation according to capabilities::
 * Task and Worker Profiling::   
-* Partitioning Data::           Partitioning Data
+* Partitioning Data::
 * Performance model example::   
 * Theoretical lower bound on execution time::  
 * Insert Task Utility::          
+* Data reduction::  
 * Parallel Tasks::
 * Debugging::
 * The multiformat interface::
@@ -587,6 +588,86 @@ be executed, and is allowed to read from @code{i} to use it e.g. as an
 index. Note that this macro is only avaible when compiling StarPU with
 the compiler @code{gcc}.
 
+@node Data reduction
+@section Data reduction
+
+In various cases, some piece of data is used to accumulate intermediate
+results. For instances, the dot product of a vector, maximum/minimum finding,
+the histogram of a photograph, etc. When these results are produced along the
+whole machine, it would not be efficient to accumulate them in only one place,
+incurring data transmission each and access concurrency.
+
+StarPU provides a @code{STARPU_REDUX} mode, which permits to optimize
+that case: it will allocate a buffer on each memory node, and accumulate
+intermediate results there. When the data is eventually accessed in the normal
+@code{STARPU_R} mode, StarPU will collect the intermediate results in just one
+buffer.
+
+For this to work, the user has to use the
+@code{starpu_data_set_reduction_methods} to declare how to initialize these
+buffers, and how to assemble partial results.
+
+For instance, @code{cg} uses that to optimize its dot product: it first defines
+the codelets for initialization and reduction:
+
+@smallexample
+struct starpu_codelet bzero_variable_cl =
+@{
+        .cpu_funcs = @{ bzero_variable_cpu, NULL @},
+        .cuda_funcs = @{ bzero_variable_cuda, NULL @},
+        .nbuffers = 1,
+@}
+
+static void accumulate_variable_cpu(void *descr[], void *cl_arg)
+@{
+        double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
+        *v_dst = *v_dst + *v_src;
+@}
+
+static void accumulate_variable_cuda(void *descr[], void *cl_arg)
+@{
+        double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
+        cublasaxpy(1, (double)1.0, v_src, 1, v_dst, 1);
+        cudaStreamSynchronize(starpu_cuda_get_local_stream());
+@}
+
+struct starpu_codelet accumulate_variable_cl =
+@{
+        .cpu_funcs = @{ accumulate_variable_cpu, NULL @},
+        .cuda_funcs = @{ accumulate_variable_cuda, NULL @},
+        .nbuffers = 1,
+@}
+@end smallexample
+
+and attaches them as reduction methods for its dtq handle:
+
+@smallexample
+starpu_data_set_reduction_methods(dtq_handle,
+        &accumulate_variable_cl, &bzero_variable_cl);
+@end smallexample
+
+and dtq_handle can now be used in @code{STARPU_REDUX} mode for the dot products
+with partitioned vectors:
+
+@smallexample
+int dots(starpu_data_handle v1, starpu_data_handle v2,
+         starpu_data_handle s, unsigned nblocks)
+@{
+    starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
+    for (b = 0; b < nblocks; b++)
+        starpu_insert_task(&dot_kernel_cl,
+            STARPU_RW, s,
+            STARPU_R, starpu_data_get_sub_data(v1, 1, b),
+            STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+            0);
+@}
+@end smallexample
+
+The @code{cg} example also uses reduction for the blocked gemv kernel, leading
+to yet more relaxed dependencies and more parallelism.
+
 @node Parallel Tasks
 @section Parallel Tasks
 

File diff suppressed because it is too large
+ 1 - 1
doc/chapters/basic-api.texi


+ 2 - 1
doc/chapters/c-extensions.texi

@@ -10,7 +10,8 @@
 When GCC plug-in support is available, StarPU builds a plug-in for the
 GNU Compiler Collection (GCC), which defines extensions to languages of
 the C family (C, C++, Objective-C) that make it easier to write StarPU
-code@footnote{This feature is only available for GCC 4.5 and later.  You
+code@footnote{This feature is only available for GCC 4.5 and later; it
+is known to work with GCC 4.5, 4.6, and 4.7.  You
 may need to install a specific @code{-dev} package of your distro, such
 as @code{gcc-4.6-plugin-dev} on Debian and derivatives.  In addition,
 the plug-in's test suite is only run when

+ 4 - 0
doc/chapters/perf-optimization.texi

@@ -75,6 +75,10 @@ mode, or because write accesses are actually commutative), use the
 @code{starpu_data_set_sequential_consistency_flag} function to disable implicit
 dependencies on that data.
 
+In the same vein, accumulation of results in the same data can become a
+bottleneck. The use of the @code{STARPU_REDUX} mode permits to optimize such
+accumulation (@pxref{Data reduction}).
+
 @node Task granularity
 @section Task granularity
 

+ 1 - 1
examples/interface/complex_interface.c

@@ -261,7 +261,7 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node,
 	return 0;
 }
 #endif
-static const struct starpu_data_copy_methods complex_copy_methods =
+static struct starpu_data_copy_methods complex_copy_methods =
 {
 #ifdef STARPU_USE_CUDA
 	.ram_to_cuda = copy_ram_to_cuda,

+ 41 - 16
gcc-plugin/src/c-expr.y

@@ -1,5 +1,5 @@
 /* GCC-StarPU
-   Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+   Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 
    GCC-StarPU is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -95,28 +95,53 @@
   /* Mapping of libcpp token names to Bison-generated token names.  This is
      not ideal but Bison cannot be told to use the `enum cpp_ttype'
      values.  */
+
+#define STARPU_CPP_TOKENS			\
+  TK (CPP_NAME)					\
+  TK (CPP_NUMBER)				\
+  TK (CPP_AND)					\
+  TK (CPP_OPEN_SQUARE)				\
+  TK (CPP_CLOSE_SQUARE)				\
+  TK (CPP_OPEN_PAREN)				\
+  TK (CPP_CLOSE_PAREN)				\
+  TK (CPP_PLUS)					\
+  TK (CPP_MINUS)				\
+  TK (CPP_MULT)					\
+  TK (CPP_DIV)					\
+  TK (CPP_DOT)					\
+  TK (CPP_DEREF)
+
+#ifndef __cplusplus
+
   static const int cpplib_bison_token_map[] =
     {
-      [CPP_NAME] = YCPP_NAME,
-      [CPP_NUMBER] = YCPP_NUM,
-      [CPP_AND] = YCPP_AND,
-      [CPP_OPEN_SQUARE] = YCPP_OPEN_SQUARE,
-      [CPP_CLOSE_SQUARE] = YCPP_CLOSE_SQUARE,
-      [CPP_OPEN_PAREN] = YCPP_OPEN_PAREN,
-      [CPP_CLOSE_PAREN] = YCPP_CLOSE_PAREN,
-      [CPP_PLUS] = YCPP_PLUS,
-      [CPP_MINUS] = YCPP_MINUS,
-      [CPP_MULT] = YCPP_MULT,
-      [CPP_DIV] = YCPP_DIV,
-      [CPP_DOT] = YCPP_DOT,
-      [CPP_DEREF] = YCPP_DEREF
+# define TK(x) [x] = Y ## x,
+      STARPU_CPP_TOKENS
+# undef TK
     };
 
+#else /* __cplusplus */
+
+  /* No designated initializers in C++.  */
+  static int cpplib_bison_token_map[CPP_PADDING];
+
+#endif	/* __cplusplus */
+
   static int
   yylex (YYSTYPE *lvalp)
   {
     int ret;
 
+#ifdef __cplusplus
+    if (cpplib_bison_token_map[CPP_NAME] != YCPP_NAME)
+      {
+	/* Initialize the table.  */
+# define TK(x) cpplib_bison_token_map[x] = Y ## x;
+	STARPU_CPP_TOKENS
+# undef TK
+      }
+#endif
+
     ret = pragma_lex (lvalp);
     if (ret < sizeof cpplib_bison_token_map / sizeof cpplib_bison_token_map[0])
       ret = cpplib_bison_token_map[ret];
@@ -128,7 +153,7 @@
 }
 
 %token YCPP_NAME "identifier"
-%token YCPP_NUM "integer"
+%token YCPP_NUMBER "integer"
 %token YCPP_AND "&"
 %token YCPP_OPEN_SQUARE "["
 %token YCPP_CLOSE_SQUARE "]"
@@ -228,7 +253,7 @@ primary_expression: identifier
      | YCPP_OPEN_PAREN expression YCPP_CLOSE_PAREN { $$ = $2; }
 ;
 
-constant: YCPP_NUM { $$ = $1; }
+constant: YCPP_NUMBER { $$ = $1; }
 ;
 
 %%

+ 2 - 0
gcc-plugin/src/starpu-gcc-config.h.in

@@ -28,6 +28,8 @@
 
 #undef HAVE_DECL_BUILTIN_DECL_EXPLICIT
 
+#undef HAVE_ATTRIBUTE_SPEC_AFFECTS_TYPE_IDENTITY
+
 #undef HAVE_C_FAMILY_C_COMMON_H
 #undef HAVE_C_COMMON_H
 

+ 99 - 58
gcc-plugin/src/starpu.c

@@ -60,6 +60,41 @@
 #include <starpu.h>  /* for `STARPU_CPU' & co.  */
 
 
+/* GCC 4.7 requires compilation with `g++', and C++ lacks a number of GNU C
+   features, so work around that.  */
+
+#ifdef __cplusplus
+
+/* G++ doesn't implement nested functions, so use C++11 lambdas instead.  */
+
+# include <functional>
+
+# define local_define(ret, name, parms)     auto name = [=]parms
+# define function_parm(ret, name, parms)    std::function<ret parms> name
+
+/* G++ lacks designated initializers.  */
+# define designated_field_init(name, value) value /* XXX: cross fingers */
+
+#else  /* !__cplusplus */
+
+/* GNU C nested functions.  */
+
+# define local_define(ret, name, parms)	    ret name parms
+# define function_parm(ret, name, parms)    ret (*name) parms
+
+/* Designated field initializer.  */
+
+# define designated_field_init(name, value) .name = value
+
+#endif	/* !__cplusplus */
+
+
+/* C expression parser, possibly with C++ linkage.  */
+
+extern int yyparse (location_t, const char *, tree *);
+extern int yydebug;
+
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -264,7 +299,7 @@ array_type_element_count (location_t loc, const_tree array_type)
 static tree
 build_constructor_from_unsorted_list (tree type, tree vals)
 {
-  int compare_elmt_bitpos (const void *rt1, const void *rt2)
+  local_define (int, compare_elmt_bitpos, (const void *rt1, const void *rt2))
   {
     const constructor_elt *elmt1 = (constructor_elt *) rt1;
     const constructor_elt *elmt2 = (constructor_elt *) rt2;
@@ -274,7 +309,7 @@ build_constructor_from_unsorted_list (tree type, tree vals)
       = tree_int_cst_compare (bit_position (field1), bit_position (field2));
 
     return ret ? ret : (int) (DECL_UID (field1) - DECL_UID (field2));
-  }
+  };
 
   tree t;
   VEC(constructor_elt,gc) *v = NULL;
@@ -433,7 +468,7 @@ chain_trees (tree t, ...)
 }
 
 static tree
-filter (bool (*pred) (const_tree), tree t)
+filter (function_parm (bool, pred, (const_tree)), tree t)
 {
   tree result, lst;
 
@@ -451,12 +486,12 @@ filter (bool (*pred) (const_tree), tree t)
 }
 
 static tree
-list_remove (bool (*pred) (const_tree), tree t)
+list_remove (function_parm (bool, pred, (const_tree)), tree t)
 {
-  bool opposite (const_tree t)
+  local_define (bool, opposite, (const_tree t))
   {
     return !pred (t);
-  }
+  };
 
   return filter (opposite, t);
 }
@@ -465,7 +500,7 @@ list_remove (bool (*pred) (const_tree), tree t)
    chain of arbitrary tree objects.  */
 
 static tree
-map (tree (*func) (const_tree), tree t)
+map (function_parm (tree, func, (const_tree)), tree t)
 {
   tree result, tail, lst;
 
@@ -485,7 +520,7 @@ map (tree (*func) (const_tree), tree t)
 }
 
 static void
-for_each (void (*func) (tree), tree t)
+for_each (function_parm (void, func, (tree)), tree t)
 {
   tree lst;
 
@@ -496,7 +531,7 @@ for_each (void (*func) (tree), tree t)
 }
 
 static size_t
-count (bool (*pred) (const_tree), const_tree t)
+count (function_parm (bool, pred, (const_tree)), const_tree t)
 {
   size_t result;
   const_tree lst;
@@ -604,9 +639,6 @@ handle_pragma_wait (struct cpp_reader *reader)
 
 /* The minimal C expression parser.  */
 
-extern int yyparse (location_t, const char *, tree *);
-extern int yydebug;
-
 /* Parse expressions from the CPP reader for PRAGMA, which is located at LOC.
    Return a TREE_LIST of C expressions.  */
 
@@ -1169,10 +1201,10 @@ handle_task_implementation_attribute (tree *node, tree name, tree args,
 		    where);
       else if (task_implementation_target_to_int (where) == STARPU_OPENCL)
 	{
-	  void validate (tree t)
+	  local_define (void, validate, (tree t))
 	  {
 	    validate_opencl_argument_type (loc, t);
-	  }
+	  };
 
 	  for_each (validate, TYPE_ARG_TYPES (TREE_TYPE (fn)));
 	}
@@ -1494,27 +1526,35 @@ register_task_attributes (void *gcc_data, void *user_data)
     {
       task_attribute_name, 0, 0, true, false, false,
       handle_task_attribute
+#ifdef HAVE_ATTRIBUTE_SPEC_AFFECTS_TYPE_IDENTITY
+      , false
+#endif
     };
 
   static const struct attribute_spec task_implementation_attr =
     {
       task_implementation_attribute_name, 2, 2, true, false, false,
       handle_task_implementation_attribute
+#ifdef HAVE_ATTRIBUTE_SPEC_AFFECTS_TYPE_IDENTITY
+      , false
+#endif
     };
 
   static const struct attribute_spec heap_allocated_attr =
     {
       heap_allocated_attribute_name, 0, 0, true, false, false,
       handle_heap_allocated_attribute
+#ifdef HAVE_ATTRIBUTE_SPEC_AFFECTS_TYPE_IDENTITY
+      , false
+#endif
     };
 
   static const struct attribute_spec output_attr =
     {
       output_attribute_name, 0, 0, true, true, false,
-      handle_output_attribute,
-#if 0 /* FIXME: Check whether the `affects_type_identity' field is
-	 present.  */
-      true /* affects type identity */
+      handle_output_attribute
+#ifdef HAVE_ATTRIBUTE_SPEC_AFFECTS_TYPE_IDENTITY
+      , true /* affects type identity */
 #endif
     };
 
@@ -1556,7 +1596,7 @@ build_codelet_wrapper_identifier (tree task_impl)
   id = DECL_NAME (task_impl);
   task_name = IDENTIFIER_POINTER (id);
 
-  cl_name = alloca (IDENTIFIER_LENGTH (id) + strlen (suffix) + 1);
+  cl_name = (char *) alloca (IDENTIFIER_LENGTH (id) + strlen (suffix) + 1);
   memcpy (cl_name, task_name, IDENTIFIER_LENGTH (id));
   strcpy (&cl_name[IDENTIFIER_LENGTH (id)], suffix);
 
@@ -1571,12 +1611,16 @@ static tree
 build_codelet_wrapper_definition (tree task_impl)
 {
   location_t loc;
-  tree task_decl, decl;
+  tree task_decl, wrapper_name, decl;
 
   loc = DECL_SOURCE_LOCATION (task_impl);
   task_decl = task_implementation_task (task_impl);
 
-  tree build_local_var (const_tree type)
+  wrapper_name = build_codelet_wrapper_identifier (task_impl);
+  decl = build_decl (loc, FUNCTION_DECL, wrapper_name,
+		     build_codelet_wrapper_type ());
+
+  local_define (tree, build_local_var, (const_tree type))
   {
     tree var, t;
     const char *seed;
@@ -1589,12 +1633,12 @@ build_codelet_wrapper_definition (tree task_impl)
     DECL_ARTIFICIAL (var) = true;
 
     return var;
-  }
+  };
 
   /* Return the body of the wrapper, which unpacks `cl_args' and calls the
      user-defined task implementation.  */
 
-  tree build_body (tree wrapper_decl, tree vars)
+  local_define (tree, build_body, (tree wrapper_decl, tree vars))
   {
     bool opencl_p;
     tree stmts = NULL, call, v;
@@ -1669,12 +1713,12 @@ build_codelet_wrapper_definition (tree task_impl)
     TREE_TYPE (bind) = TREE_TYPE (TREE_TYPE (wrapper_decl));
 
     return bind;
-  }
+  };
 
   /* Return the parameter list of the wrapper:
      `(void **BUFFERS, void *CL_ARGS)'.  */
 
-  tree build_parameters (tree wrapper_decl)
+  local_define (tree, build_parameters, (tree wrapper_decl))
   {
     tree param1, param2;
 
@@ -1693,13 +1737,9 @@ build_codelet_wrapper_definition (tree task_impl)
     TREE_USED (param2) = true;
 
     return chainon (param1, param2);
-  }
-
-  tree wrapper_name, vars, result;
+  };
 
-  wrapper_name = build_codelet_wrapper_identifier (task_impl);
-  decl = build_decl (loc, FUNCTION_DECL, wrapper_name,
-		     build_codelet_wrapper_type ());
+  tree vars, result;
 
   vars = map (build_local_var,
 	      list_remove (void_type_p,
@@ -1749,7 +1789,7 @@ build_codelet_wrapper_definition (tree task_impl)
 static void
 define_codelet_wrappers (tree task)
 {
-  void define (tree task_impl)
+  local_define (void, define, (tree task_impl))
   {
     tree wrapper_def;
 
@@ -1759,7 +1799,7 @@ define_codelet_wrappers (tree task)
       tree_cons (get_identifier (task_implementation_wrapper_attribute_name),
 		 wrapper_def,
 		 DECL_ATTRIBUTES (task_impl));
-  }
+  };
 
   for_each (define, task_implementation_list (task));
 }
@@ -1779,7 +1819,7 @@ build_codelet_identifier (tree task_decl)
   id = DECL_NAME (task_decl);
   task_name = IDENTIFIER_POINTER (id);
 
-  cl_name = alloca (IDENTIFIER_LENGTH (id) + strlen (suffix) + 1);
+  cl_name = (char *) alloca (IDENTIFIER_LENGTH (id) + strlen (suffix) + 1);
   memcpy (cl_name, task_name, IDENTIFIER_LENGTH (id));
   strcpy (&cl_name[IDENTIFIER_LENGTH (id)], suffix);
 
@@ -1840,7 +1880,7 @@ build_codelet_initializer (tree task_decl)
   fields = TYPE_FIELDS (codelet_type ());
   gcc_assert (TREE_CODE (fields) == FIELD_DECL);
 
-  tree lookup_field (const char *name)
+  local_define (tree, lookup_field, (const char *name))
   {
     tree fdecl, fname;
 
@@ -1855,9 +1895,9 @@ build_codelet_initializer (tree task_decl)
 
     /* Field NAME wasn't found.  */
     gcc_assert (false);
-  }
+  };
 
-  tree field_initializer (const char *name, tree value)
+  local_define (tree, field_initializer, (const char *name, tree value))
   {
     tree field, init;
 
@@ -1872,15 +1912,15 @@ build_codelet_initializer (tree task_decl)
       TREE_VALUE (init) = value;
 
     return init;
-  }
+  };
 
-  tree codelet_name ()
+  local_define (tree, codelet_name, ())
   {
     const char *name = IDENTIFIER_POINTER (DECL_NAME (task_decl));
     return build_string_literal (strlen (name) + 1, name);
-  }
+  };
 
-  tree where_init (tree impls)
+  local_define (tree, where_init, (tree impls))
   {
     tree impl;
     int where_int = 0;
@@ -1903,9 +1943,9 @@ build_codelet_initializer (tree task_decl)
       }
 
     return build_int_cstu (integer_type_node, where_int);
-  }
+  };
 
-  tree implementation_pointers (tree impls, int where)
+  local_define (tree, implementation_pointers, (tree impls, int where))
   {
     size_t len;
     tree impl, pointers;
@@ -1943,17 +1983,17 @@ build_codelet_initializer (tree task_decl)
     return build_constructor_from_list (build_array_type (ptr_type_node,
 							  index_type),
 					nreverse (pointers));
-  }
+  };
 
-  tree pointer_arg_count (void)
+  local_define (tree, pointer_arg_count, (void))
   {
     size_t len;
 
     len = list_length (task_pointer_parameter_types (task_decl));
     return build_int_cstu (integer_type_node, len);
-  }
+  };
 
-  tree access_mode_array (void)
+  local_define (tree, access_mode_array, (void))
   {
     const_tree type;
     tree modes;
@@ -1975,7 +2015,7 @@ build_codelet_initializer (tree task_decl)
     return build_constructor_from_list (build_array_type (integer_type_node,
 							  index_type),
 					nreverse (modes));
-  }
+  };
 
   if (verbose_output_p)
     inform (DECL_SOURCE_LOCATION (task_decl),
@@ -2044,7 +2084,7 @@ handle_pre_genericize (void *gcc_data, void *user_data)
 	  /* TASK lacks a body.  Declare its codelet, intantiate its codelet
 	     wrappers, and its body in this compilation unit.  */
 
-	  tree build_parameter (const_tree lst)
+	  local_define (tree, build_parameter, (const_tree lst))
 	  {
 	    tree param, type;
 
@@ -2056,7 +2096,7 @@ handle_pre_genericize (void *gcc_data, void *user_data)
 	    DECL_CONTEXT (param) = task;
 
 	    return param;
-	  }
+	  };
 
 	  /* Declare TASK's codelet.  It cannot be defined yet because the
 	     complete list of tasks isn't available at this point.  */
@@ -2380,9 +2420,10 @@ lower_starpu (void)
 
 static struct opt_pass pass_lower_starpu =
   {
-    .type = GIMPLE_PASS,
-    .name = "pass_lower_starpu",
-    .execute = lower_starpu,
+    designated_field_init (type, GIMPLE_PASS),
+    designated_field_init (name, "pass_lower_starpu"),
+    designated_field_init (gate, NULL),
+    designated_field_init (execute, lower_starpu)
 
     /* The rest is zeroed.  */
   };
@@ -2432,10 +2473,10 @@ plugin_init (struct plugin_name_args *plugin_info,
 
   struct register_pass_info pass_info =
     {
-      .pass = &pass_lower_starpu,
-      .reference_pass_name = "*build_cgraph_edges",
-      .ref_pass_instance_number = 1,
-      .pos_op = PASS_POS_INSERT_AFTER
+      designated_field_init (pass, &pass_lower_starpu),
+      designated_field_init (reference_pass_name, "*build_cgraph_edges"),
+      designated_field_init (ref_pass_instance_number, 1),
+      designated_field_init (pos_op, PASS_POS_INSERT_AFTER)
     };
 
   register_callback (plugin_name, PLUGIN_PASS_MANAGER_SETUP,
@@ -2443,7 +2484,7 @@ plugin_init (struct plugin_name_args *plugin_info,
 
   include_dir = getenv ("STARPU_GCC_INCLUDE_DIR");
 
-  size_t arg;
+  int arg;
   for (arg = 0; arg < plugin_info->argc; arg++)
     {
       if (strcmp (plugin_info->argv[arg].key, "include-dir") == 0)

+ 1 - 1
include/starpu_config.h.in

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify

+ 1 - 1
include/starpu_data_interfaces.h

@@ -110,7 +110,7 @@ struct starpu_data_interface_ops
 	/* Free data of the interface on a given node. */
 	void (*free_data_on_node)(void *data_interface, uint32_t node);
 	/* ram/cuda/spu/opencl synchronous and asynchronous transfer methods */
-	const struct starpu_data_copy_methods *copy_methods;
+	struct starpu_data_copy_methods *copy_methods;
 	/* Return the current pointer (if any) for the handle on the given node. */
 	void * (*handle_to_pointer)(starpu_data_handle_t handle, uint32_t node);
 	/* Return an estimation of the size of data, for performance models */

+ 1 - 1
include/starpu_deprecated_api.h

@@ -23,7 +23,7 @@ extern "C"
 {
 #endif
 
-#warning Your application is still using deprecated types. Please update to use the latest API, e.g. using tools/dev/rename.sh
+#warning "Your application is using former types. You may want to update to use the latest API, by using tools/dev/rename.sh"
 
 typedef starpu_data_handle_t starpu_data_handle;
 typedef struct starpu_block_interface starpu_block_interface_t;

+ 4 - 1
include/starpu_util.h

@@ -49,7 +49,10 @@ extern "C"
 #  endif
 #endif
 
-#define STARPU_ABORT()		assert(0)
+#define STARPU_ABORT() do {                                          \
+	fprintf(stderr, "%s:%d %s\n", __FILE__, __LINE__, __func__); \
+	abort();                                                     \
+} while(0)
 
 #if defined(STARPU_HAVE_STRERROR_R)
 #  define STARPU_CHECK_RETURN_VALUE(err, message) {if (err < 0) { \

+ 12 - 0
m4/gcc.m4

@@ -149,6 +149,7 @@ AC_DEFUN([STARPU_GCC_PLUGIN_SUPPORT], [
     dnl   build_array_ref           -- present but undeclared in 4.6.1
     dnl   build_zero_cst            -- not in GCC 4.5.x; appears in 4.6
     dnl   builtin_decl_explicit     -- new in 4.7, replaces `built_in_decls'
+    dnl   .affects_type_identity    -- new field in 4.7
     _STARPU_WITH_GCC_PLUGIN_API([
       AC_CHECK_DECLS([build_call_expr_loc_array, build_call_expr_loc_vec,
                       build_array_ref, build_zero_cst,
@@ -161,6 +162,14 @@ AC_DEFUN([STARPU_GCC_PLUGIN_SUPPORT], [
       AC_CHECK_HEADERS([c-common.h c-pragma.h c-family/c-common.h c-family/c-pragma.h],
         [], [], [#include <gcc-plugin.h>
 	         #include <tree.h>])
+
+      AC_CHECK_MEMBER([struct attribute_spec.affects_type_identity],
+        [AC_DEFINE([HAVE_ATTRIBUTE_SPEC_AFFECTS_TYPE_IDENTITY], [1],
+	  [Define to 1 when `struct attribute_spec' has the `affects_type_identity' field.])],
+	[],
+	[#include <gcc-plugin.h>
+	 #include <tree.h>])
+
     ])
 
 
@@ -175,6 +184,9 @@ AC_DEFUN([STARPU_GCC_PLUGIN_SUPPORT], [
     dnl Determine the corresponding Libtool tag.
     if test "$GCC_FOR_PLUGIN" = "$CXX"; then
       GCC_FOR_PLUGIN_LIBTOOL_TAG="CXX"
+
+      # Require C++11, for lambdas and `auto'.
+      GCC_FOR_PLUGIN="$GCC_FOR_PLUGIN -std=c++11"
     else
       GCC_FOR_PLUGIN_LIBTOOL_TAG="CC"
     fi

+ 2 - 0
src/core/workers.c

@@ -384,6 +384,8 @@ int starpu_conf_init(struct starpu_conf *conf)
 	     conf->single_combined_worker = 0;
 
 	conf->disable_asynchronous_copy = starpu_get_env_number("STARPU_DISABLE_ASYNCHRONOUS_COPY");
+	if (conf->disable_asynchronous_copy == -1)
+		conf->disable_asynchronous_copy = 0;
 
 	return 0;
 }

+ 2 - 2
src/datawizard/interfaces/bcsr_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -42,7 +42,7 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 #endif
 
-static const struct starpu_data_copy_methods bcsr_copy_data_methods_s =
+static struct starpu_data_copy_methods bcsr_copy_data_methods_s =
 {
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,

+ 1 - 1
src/datawizard/interfaces/block_interface.c

@@ -42,7 +42,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods block_copy_data_methods_s =
+static struct starpu_data_copy_methods block_copy_data_methods_s =
 {
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,

+ 2 - 2
src/datawizard/interfaces/csr_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -42,7 +42,7 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_
 static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 #endif
 
-static const struct starpu_data_copy_methods csr_copy_data_methods_s =
+static struct starpu_data_copy_methods csr_copy_data_methods_s =
 {
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,

+ 6 - 12
src/datawizard/interfaces/data_interface.c

@@ -292,20 +292,14 @@ void starpu_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
 	if (STARPU_UNLIKELY(asynchronous_copy_disabled))
 	{
 #ifdef STARPU_USE_CUDA
-	     if (ops->copy_methods->ram_to_cuda_async)
-		  ((struct starpu_data_copy_methods *)ops->copy_methods)->ram_to_cuda_async = NULL;
-	     if (ops->copy_methods->cuda_to_ram_async)
-		  ((struct starpu_data_copy_methods *)ops->copy_methods)->cuda_to_ram_async = NULL;
-	     if (ops->copy_methods->cuda_to_cuda_async)
-		  ((struct starpu_data_copy_methods *)ops->copy_methods)->cuda_to_cuda_async = NULL;
+		ops->copy_methods->ram_to_cuda_async = NULL;
+		ops->copy_methods->cuda_to_ram_async = NULL;
+		ops->copy_methods->cuda_to_cuda_async = NULL;
 #endif
 #ifdef STARPU_USE_OPENCL
-	     if (ops->copy_methods->ram_to_opencl_async)
-		  ((struct starpu_data_copy_methods *)ops->copy_methods)->ram_to_opencl_async = NULL;
-	     if (ops->copy_methods->opencl_to_ram_async)
-		  ((struct starpu_data_copy_methods *)ops->copy_methods)->opencl_to_ram_async = NULL;
-	     if (ops->copy_methods->opencl_to_opencl_async)
-		  ((struct starpu_data_copy_methods *)ops->copy_methods)->opencl_to_opencl_async = NULL;
+		ops->copy_methods->ram_to_opencl_async = NULL;
+		ops->copy_methods->opencl_to_ram_async = NULL;
+		ops->copy_methods->opencl_to_opencl_async = NULL;
 #endif
 	}
 

+ 1 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -46,7 +46,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
+static struct starpu_data_copy_methods matrix_copy_data_methods_s =
 {
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,

+ 1 - 1
src/datawizard/interfaces/multiformat_interface.c

@@ -41,7 +41,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods multiformat_copy_data_methods_s =
+static struct starpu_data_copy_methods multiformat_copy_data_methods_s =
 {
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,

+ 1 - 1
src/datawizard/interfaces/variable_interface.c

@@ -42,7 +42,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods variable_copy_data_methods_s =
+static struct starpu_data_copy_methods variable_copy_data_methods_s =
 {
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,

+ 1 - 1
src/datawizard/interfaces/vector_interface.c

@@ -42,7 +42,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods vector_copy_data_methods_s =
+static struct starpu_data_copy_methods vector_copy_data_methods_s =
 {
 	.ram_to_ram = copy_ram_to_ram,
 	.ram_to_spu = NULL,

+ 2 - 2
src/datawizard/interfaces/void_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -33,7 +33,7 @@ static int dummy_cuda_copy_async(void *src_interface, unsigned src_node, void *d
 static int dummy_opencl_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event);
 #endif
 
-static const struct starpu_data_copy_methods void_copy_data_methods_s =
+static struct starpu_data_copy_methods void_copy_data_methods_s =
 {
 	.ram_to_ram = dummy_copy,
 	.ram_to_spu = dummy_copy,

+ 28 - 15
src/sched_policies/eager_central_priority_policy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  INRIA
  *
@@ -161,7 +161,7 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 
 	unsigned priolevel = task->priority - STARPU_MIN_PRIO;
 
-	starpu_task_list_push_front(&taskq->taskq[priolevel], task);
+	starpu_task_list_push_back(&taskq->taskq[priolevel], task);
 	taskq->ntasks[priolevel]++;
 	taskq->total_ntasks++;
 
@@ -174,20 +174,19 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 
 static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 {
-	/* XXX FIXME: should call starpu_worker_can_execute_task!! */
-	struct starpu_task *task = NULL;
+		struct starpu_task *chosen_task = NULL, *task;
+	unsigned workerid = starpu_worker_get_id();
+	int skipped = 0;
 
 	eager_central_prio_data *data = (eager_central_prio_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 	
 	struct _starpu_priority_taskq *taskq = data->taskq;
 
 	/* block until some event happens */
-	_STARPU_PTHREAD_MUTEX_LOCK(&data->sched_mutex);
 
 	if ((taskq->total_ntasks == 0) && _starpu_machine_is_running())
 	{
 #ifdef STARPU_NON_BLOCKING_DRIVERS
-		_STARPU_PTHREAD_MUTEX_UNLOCK(&data->sched_mutex);
 		return NULL;
 #else
 		_STARPU_PTHREAD_COND_WAIT(&data->sched_cond, &data->sched_mutex);
@@ -201,20 +200,34 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 		{
 			if (taskq->ntasks[priolevel] > 0)
 			{
-				/* there is some task that we can grab */
-				task = starpu_task_list_pop_back(&taskq->taskq[priolevel]);
-				taskq->ntasks[priolevel]--;
-				taskq->total_ntasks--;
-				_STARPU_TRACE_JOB_POP(task, 0);
+				for (task  = starpu_task_list_begin(&taskq->taskq[priolevel]);
+				     task != starpu_task_list_end(&taskq->taskq[priolevel]);
+				     task  = starpu_task_list_next(task)) {
+					unsigned nimpl;
+					for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+					{
+						if (starpu_worker_can_execute_task(workerid, task, nimpl))
+						{
+							/* there is some task that we can grab */
+							_starpu_get_job_associated_to_task(task)->nimpl = nimpl;
+							starpu_task_list_erase(&taskq->taskq[priolevel], task);
+							chosen_task = task;
+							taskq->ntasks[priolevel]--;
+							taskq->total_ntasks--;
+							_STARPU_TRACE_JOB_POP(task, 0);
+						} else skipped = 1;
+					}
+				}
 			}
 		}
-		while (!task && priolevel-- > 0);
+		while (!chosen_task && priolevel-- > 0);
 	}
-	STARPU_ASSERT_MSG(starpu_worker_can_execute_task(starpu_worker_get_id(), task, 0), "prio does not support \"can_execute\"");
 
-	_STARPU_PTHREAD_MUTEX_UNLOCK(&data->sched_mutex);
+	if (!chosen_task && skipped)
+		/* Notify another worker to do that task */
+		_STARPU_PTHREAD_COND_SIGNAL(&data->sched_mutex);
 
-	return task;
+	return chosen_task;
 }
 
 struct starpu_sched_policy _starpu_sched_prio_policy =

+ 21 - 1
src/sched_policies/parallel_heft.c

@@ -43,6 +43,9 @@ typedef struct {
 	double beta;
 	double _gamma;
 	double idle_power;
+* When we push a task on a combined worker we need all the cpu workers it contains
+ * to be locked at once */
+	pthread_mutex_t global_push_mutex;
 } pheft_data;
 
 static double worker_exp_start[STARPU_NMAXWORKERS];
@@ -86,6 +89,8 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 {
 	/* make sure someone coule execute that task ! */
 	STARPU_ASSERT(best_workerid != -1);
+	
+	pheft_data *hd = (pheft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 
 	/* Is this a basic worker or a combined worker ? */
 	int nbasic_workers = starpu_get_nworkers_of_sched_ctx(sched_ctx_id);
@@ -116,7 +121,13 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 		ntasks[best_workerid]++;
 		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 
+		/* We don't want it to interlace its task with a combined
+		 * worker's one */
+		_STARPU_PTHREAD_MUTEX_LOCK(&hd->global_push_mutex);
+
 		ret = starpu_push_local_task(best_workerid, task, prio);
+
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&hd->global_push_mutex);
 	}
 	else
 	{
@@ -140,6 +151,9 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 		_STARPU_PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
 		_STARPU_PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
 
+		/* All cpu workers must be locked at once */
+		_STARPU_PTHREAD_MUTEX_LOCK(&hd->global_push_mutex);
+
 		int i;
 		for (i = 0; i < worker_size; i++)
 		{
@@ -163,6 +177,10 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 			ret |= starpu_push_local_task(local_worker, alias, prio);
 		}
 
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&hd->global_push_mutex);
+
+		//TODO : free task
+
 	}
 
 	return ret;
@@ -555,7 +573,8 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 	const char *strval_idle_power = getenv("STARPU_IDLE_POWER");
 	if (strval_idle_power)
 		hd->idle_power = atof(strval_idle_power);
-
+	
+	_STARPU_PTHREAD_MUTEX_INIT(&hd->global_push_mutex, NULL);
 
 }
 
@@ -563,6 +582,7 @@ static void parallel_heft_deinit(unsigned sched_ctx_id)
 {
 	pheft_data *hd = (pheft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 	starpu_delete_worker_collection_for_sched_ctx(sched_ctx_id);
+	_STARPU_PTHREAD_MUTEX_DESTROY(&hd->global_push_mutex);
 	free(hd);
 }