Explorar o código

merge from trunk

Olivier Aumage %!s(int64=8) %!d(string=hai) anos
pai
achega
0a24097305

+ 1 - 0
AUTHORS

@@ -10,6 +10,7 @@ Nicolas Collin <nicolas.collin@inria.fr>
 Ludovic Courtès <ludovic.courtes@inria.fr>
 Yann Courtois <yann.courtois33@gmail.com>
 Jean-Marie Couteyen <jm.couteyen@gmail.com>
+Lionel Eyraud-Dubois <lionel.eyraud-dubois@inria.fr>
 Nathalie Furmento <nathalie.furmento@labri.fr>
 David Gómez <david_gomez1380@yahoo.com.mx>
 Sylvain Henry <sylvain.henry@inria.fr>

+ 3 - 0
ChangeLog

@@ -57,6 +57,9 @@ New features:
   * Add starpu_data_acquire_try and starpu_data_acquire_on_node_try.
   * Add NVCC_CC environment variable.
   * Add -no-foo options to starpu_fxt_tool to make traces lighter
+  * Add starpu_cusparse_init/shutdown/get_local_handle for proper CUDA
+    overlapping with cusparse.
+
 
 Small changes:
   * Output generated through STARPU_MPI_COMM has been modified to

+ 2 - 1
Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2016  Université de Bordeaux
+# Copyright (C) 2009-2017  Université de Bordeaux
 # Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
 # Copyright (C) 2014  INRIA
 # Copyright (C) 2016  Inria
@@ -99,6 +99,7 @@ versinclude_HEADERS = 				\
 	include/starpu_disk.h			\
 	include/starpu_cublas.h			\
 	include/starpu_cublas_v2.h		\
+	include/starpu_cusparse.h		\
 	include/starpu_driver.h			\
 	include/starpu_stdlib.h			\
 	include/starpu_thread.h			\

+ 3 - 0
configure.ac

@@ -1116,6 +1116,9 @@ if test x$enable_cuda = xyes; then
 	fi
 
 	AC_CHECK_HEADERS([cuda_gl_interop.h])
+
+	AC_CHECK_LIB([cusparse], [cusparseCreate])
+	AC_CHECK_DECLS([cusparseSetStream], [], [], [[#include <cusparse.h>]])
 fi
 
 dnl Hey dude, are you around?

+ 20 - 1
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -65,9 +65,13 @@ Calling starpu_cublas_init() makes StarPU already do appropriate calls for the
 CUBLAS library. Some libraries like Magma may however change the current stream of CUBLAS v1,
 one then has to call <c>cublasSetKernelStream(starpu_cuda_get_local_stream())</c> at
 the beginning of the codelet to make sure that CUBLAS is really using the proper
-stream. When using CUBLAS v2, starpu_cublas_local_handle() can be called to queue CUBLAS
+stream. When using CUBLAS v2, starpu_cublas_get_local_handle() can be called to queue CUBLAS
 kernels with the proper configuration.
 
+Similarly, calling starpu_cusparse_init() makes StarPU create CUSPARSE handles
+on each CUDA device, starpu_cusparse_get_local_handle() can then be used to
+queue CUSPARSE kernels with the proper configuration.
+
 If the kernel can be made to only use this local stream or other self-allocated
 streams, i.e. the whole kernel submission can be made asynchronous, then
 one should enable asynchronous execution of the kernel.  That means setting
@@ -78,6 +82,21 @@ able to submit and complete data transfers while kernels are executing, instead
 kernel submission. The kernel just has to make sure that StarPU can use the
 local stream to synchronize with the kernel startup and completion.
 
+If the kernel uses its own non-default stream, one can synchronize that stream
+with the StarPU-provided stream this way:
+
+\code{.c}
+cudaEvent_t event;
+call_kernel_with_its_own_stream()
+cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
+cudaEventRecord(event, get_kernel_stream());
+cudaStreamWaitEvent(starpu_cuda_get_local_stream(), event, 0);
+cudaEventDestroy(event);
+\endcode
+
+That code makes the StarPU-provided stream wait for a new event, which will be
+triggered by the completion of the kernel.
+
 Using the flag ::STARPU_CUDA_ASYNC also permits to enable concurrent kernel
 execution, on cards which support it (Kepler and later, notably). This is
 enabled by setting the environment variable \ref STARPU_NWORKER_PER_CUDA to the

+ 17 - 0
doc/doxygen/chapters/api/cuda_extensions.doxy

@@ -95,4 +95,21 @@ Report a cublas error.
 Calls starpu_cublas_report_error(), passing the current
 function, file and line position.
 
+\fn void starpu_cusparse_init(void)
+\ingroup API_CUDA_Extensions
+Calling starpu_cusparse_init() will initialize CUSPARSE on every CUDA device
+controlled by StarPU. This call blocks until CUSPARSE has been properly
+initialized on every device.
+
+\fn cusparseHandle_t starpu_cusparse_get_local_handle(void)
+\ingroup API_CUDA_Extensions
+This function returns the CUSPARSE handle to be used to queue CUSPARSE
+kernels. It is properly initialized and configured for multistream by
+starpu_cusparse_init().
+
+\fn void starpu_cusparse_shutdown(void)
+\ingroup API_CUDA_Extensions
+This function synchronously deinitializes the CUSPARSE library on
+every CUDA device.
+
 */

+ 8 - 2
examples/sched_ctx/dummy_sched_with_ctx.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010-2013, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -118,8 +118,14 @@ static struct starpu_task *pop_task_dummy(unsigned sched_ctx_id)
 	 * the calling worker. So we just take the head of the list and give it
 	 * to the worker. */
 	struct dummy_sched_data *data = (struct dummy_sched_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+#ifdef STARPU_NON_BLOCKING_DRIVERS
+	if (starpu_task_list_empty(&data->sched_list))
+		return NULL;
+#endif
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
-	struct starpu_task *task = starpu_task_list_pop_back(&data->sched_list);
+	struct starpu_task *task = NULL;
+	if (!starpu_task_list_empty(&data->sched_list))
+		task = starpu_task_list_pop_back(&data->sched_list);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 	return task;
 }

+ 4 - 2
examples/scheduler/dummy_sched.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010-2013, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -122,7 +122,9 @@ static struct starpu_task *pop_task_dummy(unsigned sched_ctx_id)
 		return NULL;
 #endif
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
-	struct starpu_task *task = starpu_task_list_pop_back(&data->sched_list);
+	struct starpu_task *task = NULL;
+	if (!starpu_task_list_empty(&data->sched_list))
+		task = starpu_task_list_pop_back(&data->sched_list);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 	return task;
 }

+ 1 - 0
include/starpu.h

@@ -62,6 +62,7 @@ typedef UINT_PTR uintptr_t;
 #include <starpu_rand.h>
 #include <starpu_cuda.h>
 #include <starpu_cublas.h>
+#include <starpu_cusparse.h>
 #include <starpu_bound.h>
 #include <starpu_hash.h>
 #include <starpu_profiling.h>

+ 5 - 1
include/starpu_cublas_v2.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -18,6 +18,8 @@
 #ifndef __STARPU_CUBLAS_V2_H__
 #define __STARPU_CUBLAS_V2_H__
 
+#if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
+
 #include <cublas_v2.h>
 
 #ifdef __cplusplus
@@ -31,4 +33,6 @@ cublasHandle_t starpu_cublas_get_local_handle(void);
 }
 #endif
 
+#endif
+
 #endif /* __STARPU_CUBLAS_V2_H__ */

+ 38 - 0
include/starpu_cusparse.h

@@ -0,0 +1,38 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2012, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_CUSPARSE_H__
+#define __STARPU_CUSPARSE_H__
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+void starpu_cusparse_init(void);
+void starpu_cusparse_shutdown(void);
+
+#if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
+#include <cusparse.h>
+cusparseHandle_t starpu_cusparse_get_local_handle(void);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_CUSPARSE_H__ */

+ 3 - 0
include/starpu_sched_component.h

@@ -92,6 +92,9 @@ int starpu_sched_tree_push_task(struct starpu_task *task);
 int starpu_sched_component_push_task(struct starpu_sched_component *from, struct starpu_sched_component *to, struct starpu_task *task);
 struct starpu_task *starpu_sched_tree_pop_task(unsigned sched_ctx);
 struct starpu_task *starpu_sched_component_pull_task(struct starpu_sched_component *from, struct starpu_sched_component *to);
+struct starpu_task* starpu_sched_component_pump_downstream(struct starpu_sched_component *component, int* success);
+void starpu_sched_component_send_can_push_to_parents(struct starpu_sched_component * component);
+
 void starpu_sched_tree_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 void starpu_sched_tree_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 

+ 33 - 125
include/starpu_task_list.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012, 2016  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2016-2017  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,148 +25,56 @@ extern "C"
 {
 #endif
 
+	/* NOTE: this needs to have at least the same size as lists in src/common/list.h */
+#ifdef BUILDING_STARPU
+#define STARPU_TASK_LIST_INLINE extern inline
+#else
 struct starpu_task_list
 {
 	struct starpu_task *head;
 	struct starpu_task *tail;
 };
+#define STARPU_TASK_LIST_INLINE extern
+#endif
 
-static STARPU_INLINE
-void starpu_task_list_init(struct starpu_task_list *list)
-{
-	list->head = NULL;
-	list->tail = NULL;
-}
-
-static STARPU_INLINE
-void starpu_task_list_push_front(struct starpu_task_list *list, struct starpu_task *task)
-{
-	if (list->tail == NULL)
-	{
-		list->tail = task;
-	}
-	else
-	{
-		list->head->prev = task;
-	}
-
-	task->prev = NULL;
-	task->next = list->head;
-	list->head = task;
-}
-
-static STARPU_INLINE
-void starpu_task_list_push_back(struct starpu_task_list *list, struct starpu_task *task)
-{
-	if (list->head == NULL)
-	{
-		list->head = task;
-	}
-	else
-	{
-		list->tail->next = task;
-	}
-
-	task->next = NULL;
-	task->prev = list->tail;
-	list->tail = task;
-}
-
-static STARPU_INLINE
-struct starpu_task *starpu_task_list_front(struct starpu_task_list *list)
-{
-	return list->head;
-}
-
-static STARPU_INLINE
-struct starpu_task *starpu_task_list_back(struct starpu_task_list *list)
-{
-	return list->tail;
-}
-
-static STARPU_INLINE
-int starpu_task_list_empty(struct starpu_task_list *list)
-{
-	return (list->head == NULL);
-}
+STARPU_TASK_LIST_INLINE
+void starpu_task_list_init(struct starpu_task_list *list);
 
-static STARPU_INLINE
-void starpu_task_list_erase(struct starpu_task_list *list, struct starpu_task *task)
-{
-	struct starpu_task *p = task->prev;
-
-	if (p)
-	{
-		p->next = task->next;
-	}
-	else
-	{
-		list->head = task->next;
-	}
-
-	if (task->next)
-	{
-		task->next->prev = p;
-	}
-	else
-	{
-		list->tail = p;
-	}
-
-	task->prev = NULL;
-	task->next = NULL;
-}
+STARPU_TASK_LIST_INLINE
+void starpu_task_list_push_front(struct starpu_task_list *list, struct starpu_task *task);
 
-static STARPU_INLINE
-struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list)
-{
-	struct starpu_task *task = list->head;
+STARPU_TASK_LIST_INLINE
+void starpu_task_list_push_back(struct starpu_task_list *list, struct starpu_task *task);
 
-	if (task)
-		starpu_task_list_erase(list, task);
+STARPU_TASK_LIST_INLINE
+struct starpu_task *starpu_task_list_front(const struct starpu_task_list *list);
 
-	return task;
-}
+STARPU_TASK_LIST_INLINE
+struct starpu_task *starpu_task_list_back(const struct starpu_task_list *list);
 
-static STARPU_INLINE
-struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list)
-{
-	struct starpu_task *task = list->tail;
+STARPU_TASK_LIST_INLINE
+int starpu_task_list_empty(const struct starpu_task_list *list);
 
-	if (task)
-		starpu_task_list_erase(list, task);
+STARPU_TASK_LIST_INLINE
+void starpu_task_list_erase(struct starpu_task_list *list, struct starpu_task *task);
 
-	return task;
-}
+STARPU_TASK_LIST_INLINE
+struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list);
 
-static STARPU_INLINE
-struct starpu_task *starpu_task_list_begin(struct starpu_task_list *list)
-{
-	return list->head;
-}
+STARPU_TASK_LIST_INLINE
+struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list);
 
-static STARPU_INLINE
-struct starpu_task *starpu_task_list_end(struct starpu_task_list *list STARPU_ATTRIBUTE_UNUSED)
-{
-	return NULL;
-}
+STARPU_TASK_LIST_INLINE
+struct starpu_task *starpu_task_list_begin(const struct starpu_task_list *list);
 
-static STARPU_INLINE
-struct starpu_task *starpu_task_list_next(struct starpu_task *task)
-{
-	return task->next;
-}
+STARPU_TASK_LIST_INLINE
+struct starpu_task *starpu_task_list_end(struct starpu_task_list *list STARPU_ATTRIBUTE_UNUSED);
 
-static STARPU_INLINE
-int starpu_task_list_ismember(struct starpu_task_list *list, struct starpu_task *look)
-{
-	struct starpu_task *task;
+STARPU_TASK_LIST_INLINE
+struct starpu_task *starpu_task_list_next(const struct starpu_task *task);
 
-	for (task  = list->head; task != NULL; task  = task->next)
-		if (task == look)
-			return 1;
-	return 0;
-}
+STARPU_TASK_LIST_INLINE
+int starpu_task_list_ismember(const struct starpu_task_list *list, const struct starpu_task *look);
 
 #ifdef __cplusplus
 }

+ 2 - 0
src/Makefile.am

@@ -168,6 +168,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	common/thread.c						\
 	common/rbtree.c						\
 	common/graph.c						\
+	common/inlines.c					\
 	core/jobs.c						\
 	core/task.c						\
 	core/task_bundle.c					\
@@ -317,6 +318,7 @@ endif
 endif
 
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cusparse.c
 
 if STARPU_USE_OPENCL
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl.c

+ 22 - 0
src/common/inlines.c

@@ -0,0 +1,22 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2017  Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* This includes the inline definitions in a .c file so that they can also be
+ * referenced from outside */
+
+#define LIST_INLINE
+#define PRIO_LIST_INLINE
+#include <core/task.h>

+ 42 - 38
src/common/list.h

@@ -126,6 +126,9 @@
  */
 
 
+#ifndef LIST_INLINE
+#define LIST_INLINE static inline
+#endif
 
 /**@hideinitializer
  * Generates a new type for list of elements */
@@ -146,75 +149,76 @@
  * The effective type declaration for lists */
 #define LIST_CREATE_TYPE_NOSTRUCT(ENAME, _prev, _next) \
   /** @internal */ \
+ /* NOTE: this must not be greater than the struct defined in include/starpu_task_list.h */ \
   struct ENAME##_list \
   { \
     struct ENAME *_head; /**< @internal head of the list */ \
     struct ENAME *_tail; /**< @internal tail of the list */ \
   }; \
-  /** @internal */static inline struct ENAME *ENAME##_new(void) \
+  /** @internal */LIST_INLINE struct ENAME *ENAME##_new(void) \
     { struct ENAME *e; _STARPU_MALLOC(e, sizeof(struct ENAME)); \
       e->_next = NULL; e->_prev = NULL; return e; } \
-  /** @internal */static inline void ENAME##_delete(struct ENAME *e) \
+  /** @internal */LIST_INLINE void ENAME##_delete(struct ENAME *e) \
     { free(e); } \
-  /** @internal */static inline void ENAME##_list_push_front(struct ENAME##_list *l, struct ENAME *e) \
+  /** @internal */LIST_INLINE void ENAME##_list_push_front(struct ENAME##_list *l, struct ENAME *e) \
     { if(l->_tail == NULL) l->_tail = e; else l->_head->_prev = e; \
       e->_prev = NULL; e->_next = l->_head; l->_head = e; } \
-  /** @internal */static inline void ENAME##_list_push_back(struct ENAME##_list *l, struct ENAME *e) \
+  /** @internal */LIST_INLINE void ENAME##_list_push_back(struct ENAME##_list *l, struct ENAME *e) \
     { if(l->_head == NULL) l->_head = e; else l->_tail->_next = e; \
       e->_next = NULL; e->_prev = l->_tail; l->_tail = e; } \
-  /** @internal */static inline void ENAME##_list_insert_before(struct ENAME##_list *l, struct ENAME *e, struct ENAME *o) \
+  /** @internal */LIST_INLINE void ENAME##_list_insert_before(struct ENAME##_list *l, struct ENAME *e, struct ENAME *o) \
     { struct ENAME *p = o->_prev; if (p) { p->_next = e; e->_prev = p; } else { l->_head = e; e->_prev = NULL; } \
       e->_next = o; o->_prev = e; } \
-  /** @internal */static inline void ENAME##_list_insert_after(struct ENAME##_list *l, struct ENAME *e, struct ENAME *o) \
+  /** @internal */LIST_INLINE void ENAME##_list_insert_after(struct ENAME##_list *l, struct ENAME *e, struct ENAME *o) \
     { struct ENAME *n = o->_next; if (n) { n->_prev = e; e->_next = n; } else { l->_tail = e; e->_next = NULL; } \
       e->_prev = o; o->_next = e; } \
-  /** @internal */static inline void ENAME##_list_push_list_front(struct ENAME##_list *l1, struct ENAME##_list *l2) \
+  /** @internal */LIST_INLINE void ENAME##_list_push_list_front(struct ENAME##_list *l1, struct ENAME##_list *l2) \
     { if (l2->_head == NULL) { l2->_head = l1->_head; l2->_tail = l1->_tail; } \
       else if (l1->_head != NULL) { l1->_tail->_next = l2->_head; l2->_head->_prev = l1->_tail; l2->_head = l1->_head; } } \
-  /** @internal */static inline void ENAME##_list_push_list_back(struct ENAME##_list *l1, struct ENAME##_list *l2) \
+  /** @internal */LIST_INLINE void ENAME##_list_push_list_back(struct ENAME##_list *l1, struct ENAME##_list *l2) \
     { if(l1->_head == NULL) { l1->_head = l2->_head; l1->_tail = l2->_tail; } \
       else if (l2->_head != NULL) { l1->_tail->_next = l2->_head; l2->_head->_prev = l1->_tail; l1->_tail = l2->_tail; } } \
-  /** @internal */static inline struct ENAME *ENAME##_list_front(const struct ENAME##_list *l) \
+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_front(const struct ENAME##_list *l) \
     { return l->_head; } \
-  /** @internal */static inline struct ENAME *ENAME##_list_back(const struct ENAME##_list *l) \
+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_back(const struct ENAME##_list *l) \
     { return l->_tail; } \
-  /** @internal */static inline void ENAME##_list_init(struct ENAME##_list *l) \
+  /** @internal */LIST_INLINE void ENAME##_list_init(struct ENAME##_list *l) \
     { l->_head=NULL; l->_tail=l->_head; } \
-  /** @internal */static inline struct ENAME##_list *ENAME##_list_new(void) \
+  /** @internal */LIST_INLINE struct ENAME##_list *ENAME##_list_new(void) \
     { struct ENAME##_list *l; _STARPU_MALLOC(l, sizeof(struct ENAME##_list)); \
       ENAME##_list_init(l); return l; } \
-  /** @internal */static inline int ENAME##_list_empty(const struct ENAME##_list *l) \
+  /** @internal */LIST_INLINE int ENAME##_list_empty(const struct ENAME##_list *l) \
     { return (l->_head == NULL); } \
-  /** @internal */static inline void ENAME##_list_delete(struct ENAME##_list *l) \
+  /** @internal */LIST_INLINE void ENAME##_list_delete(struct ENAME##_list *l) \
     { free(l); } \
-  /** @internal */static inline void ENAME##_list_erase(struct ENAME##_list *l, struct ENAME *c) \
+  /** @internal */LIST_INLINE void ENAME##_list_erase(struct ENAME##_list *l, struct ENAME *c) \
     { struct ENAME *p = c->_prev; if(p) p->_next = c->_next; else l->_head = c->_next; \
       if(c->_next) c->_next->_prev = p; else l->_tail = p; } \
-  /** @internal */static inline struct ENAME *ENAME##_list_pop_front(struct ENAME##_list *l) \
+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_pop_front(struct ENAME##_list *l) \
     { struct ENAME *e = ENAME##_list_front(l); \
       ENAME##_list_erase(l, e); return e; } \
-  /** @internal */static inline struct ENAME *ENAME##_list_pop_back(struct ENAME##_list *l) \
+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_pop_back(struct ENAME##_list *l) \
     { struct ENAME *e = ENAME##_list_back(l); \
       ENAME##_list_erase(l, e); return e; } \
-  /** @internal */static inline struct ENAME *ENAME##_list_begin(const struct ENAME##_list *l) \
+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_begin(const struct ENAME##_list *l) \
     { return l->_head; } \
-  /** @internal */static inline struct ENAME *ENAME##_list_end(const struct ENAME##_list *l STARPU_ATTRIBUTE_UNUSED) \
+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_end(const struct ENAME##_list *l STARPU_ATTRIBUTE_UNUSED) \
     { return NULL; } \
-  /** @internal */static inline struct ENAME *ENAME##_list_next(const struct ENAME *i) \
+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_next(const struct ENAME *i) \
     { return i->_next; } \
-  /** @internal */static inline struct ENAME *ENAME##_list_last(const struct ENAME##_list *l) \
+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_last(const struct ENAME##_list *l) \
     { return l->_tail; } \
-  /** @internal */static inline struct ENAME *ENAME##_list_alpha(const struct ENAME##_list *l STARPU_ATTRIBUTE_UNUSED) \
+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_alpha(const struct ENAME##_list *l STARPU_ATTRIBUTE_UNUSED) \
     { return NULL; } \
-  /** @internal */static inline struct ENAME *ENAME##_list_prev(const struct ENAME *i) \
+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_prev(const struct ENAME *i) \
     { return i->_prev; } \
-  /** @internal */static inline int ENAME##_list_ismember(const struct ENAME##_list *l, const struct ENAME *e) \
+  /** @internal */LIST_INLINE int ENAME##_list_ismember(const struct ENAME##_list *l, const struct ENAME *e) \
     { struct ENAME *i=l->_head; while(i!=NULL){ if (i == e) return 1; i=i->_next; } return 0; } \
-  /** @internal */static inline int ENAME##_list_member(const struct ENAME##_list *l, const struct ENAME *e) \
+  /** @internal */LIST_INLINE int ENAME##_list_member(const struct ENAME##_list *l, const struct ENAME *e) \
     { struct ENAME *i=l->_head; int k=0; while(i!=NULL){if (i == e) return k; k++; i=i->_next; } return -1; } \
-  /** @internal */static inline int ENAME##_list_size(const struct ENAME##_list *l) \
+  /** @internal */LIST_INLINE int ENAME##_list_size(const struct ENAME##_list *l) \
     { struct ENAME *i=l->_head; int k=0; while(i!=NULL){k++;i=i->_next;} return k; } \
-  /** @internal */static inline int ENAME##_list_check(const struct ENAME##_list *l) \
+  /** @internal */LIST_INLINE int ENAME##_list_check(const struct ENAME##_list *l) \
     { struct ENAME *i=l->_head; while(i) \
     { if ((i->_next == NULL) && i != l->_tail) return 0; \
       if (i->_next == i) return 0; \
@@ -247,18 +251,18 @@ struct ENAME##_multilist_##MEMBER { \
 /* Create the inlines */
 #define MULTILIST_CREATE_INLINES(TYPE, ENAME, MEMBER) \
 /* Cast from list element to real type.  */ \
-static inline TYPE *ENAME##_of_multilist_##MEMBER(struct ENAME##_multilist_##MEMBER *elt) { \
+LIST_INLINE TYPE *ENAME##_of_multilist_##MEMBER(struct ENAME##_multilist_##MEMBER *elt) { \
 	return ((TYPE *) ((uintptr_t) (elt) - ((uintptr_t) (&((TYPE *) 0)->MEMBER)))); \
 } \
 \
 /* Initialize a list head.  */ \
-static inline void ENAME##_multilist_init_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
+LIST_INLINE void ENAME##_multilist_init_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
 	head->next = head; \
 	head->prev = head; \
 } \
 \
 /* Push element to head of a list.  */ \
-static inline void ENAME##_multilist_push_front_##MEMBER(struct ENAME##_multilist_##MEMBER *head, TYPE *e) { \
+LIST_INLINE void ENAME##_multilist_push_front_##MEMBER(struct ENAME##_multilist_##MEMBER *head, TYPE *e) { \
 	STARPU_ASSERT_MULTILIST(e->MEMBER.prev == NULL); \
 	STARPU_ASSERT_MULTILIST(e->MEMBER.next == NULL); \
 	e->MEMBER.next = head->next; \
@@ -268,7 +272,7 @@ static inline void ENAME##_multilist_push_front_##MEMBER(struct ENAME##_multilis
 } \
 \
 /* Push element to tail of a list.  */ \
-static inline void ENAME##_multilist_push_back_##MEMBER(struct ENAME##_multilist_##MEMBER *head, TYPE *e) { \
+LIST_INLINE void ENAME##_multilist_push_back_##MEMBER(struct ENAME##_multilist_##MEMBER *head, TYPE *e) { \
 	STARPU_ASSERT_MULTILIST(e->MEMBER.prev == NULL); \
 	STARPU_ASSERT_MULTILIST(e->MEMBER.next == NULL); \
 	e->MEMBER.prev = head->prev; \
@@ -278,7 +282,7 @@ static inline void ENAME##_multilist_push_back_##MEMBER(struct ENAME##_multilist
 } \
 \
 /* Erase element from a list.  */ \
-static inline void ENAME##_multilist_erase_##MEMBER(struct ENAME##_multilist_##MEMBER *head STARPU_ATTRIBUTE_UNUSED, TYPE *e) { \
+LIST_INLINE void ENAME##_multilist_erase_##MEMBER(struct ENAME##_multilist_##MEMBER *head STARPU_ATTRIBUTE_UNUSED, TYPE *e) { \
 	STARPU_ASSERT_MULTILIST(e->MEMBER.next->prev == &e->MEMBER); \
 	e->MEMBER.next->prev = e->MEMBER.prev; \
 	STARPU_ASSERT_MULTILIST(e->MEMBER.prev->next == &e->MEMBER); \
@@ -288,30 +292,30 @@ static inline void ENAME##_multilist_erase_##MEMBER(struct ENAME##_multilist_##M
 } \
 \
 /* Test whether the element was queued on the list.  */ \
-static inline int ENAME##_multilist_queued_##MEMBER(TYPE *e) { \
+LIST_INLINE int ENAME##_multilist_queued_##MEMBER(TYPE *e) { \
 	return ((e)->MEMBER.next != NULL); \
 } \
 \
 /* Test whether the list is empty.  */ \
-static inline int ENAME##_multilist_empty_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
+LIST_INLINE int ENAME##_multilist_empty_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
 	return head->next == head; \
 } \
 \
 /* Return the first element of the list.  */ \
-static inline TYPE *ENAME##_multilist_begin_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
+LIST_INLINE TYPE *ENAME##_multilist_begin_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
 	return ENAME##_of_multilist_##MEMBER(head->next); \
 } \
 /* Return the value to be tested at the end of the list.  */ \
-static inline TYPE *ENAME##_multilist_end_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
+LIST_INLINE TYPE *ENAME##_multilist_end_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
 	return ENAME##_of_multilist_##MEMBER(head); \
 } \
 /* Return the next element of the list.  */ \
-static inline TYPE *ENAME##_multilist_next_##MEMBER(TYPE *e) { \
+LIST_INLINE TYPE *ENAME##_multilist_next_##MEMBER(TYPE *e) { \
 	return ENAME##_of_multilist_##MEMBER(e->MEMBER.next); \
 } \
 \
  /* Move a list from its head to another head.  */ \
-static inline void ENAME##_multilist_move_##MEMBER(struct ENAME##_multilist_##MEMBER *head, struct ENAME##_multilist_##MEMBER *newhead) { \
+LIST_INLINE void ENAME##_multilist_move_##MEMBER(struct ENAME##_multilist_##MEMBER *head, struct ENAME##_multilist_##MEMBER *newhead) { \
 	if (ENAME##_multilist_empty_##MEMBER(head)) \
 		ENAME##_multilist_init_##MEMBER(newhead); \
 	else { \

+ 45 - 41
src/common/prio_list.h

@@ -66,6 +66,10 @@
 
 #include <common/rbtree.h>
 
+#ifndef PRIO_LIST_INLINE
+#define PRIO_LIST_INLINE static inline
+#endif
+
 #define PRIO_LIST_TYPE(ENAME, PRIOFIELD) \
 	PRIO_LIST_CREATE_TYPE(ENAME, PRIOFIELD)
 
@@ -83,22 +87,22 @@
 		int prio; \
 		struct ENAME##_list list; \
 	}; \
-	static inline struct ENAME##_prio_list_stage *ENAME##_node_to_list_stage(struct starpu_rbtree_node *node) \
+	PRIO_LIST_INLINE struct ENAME##_prio_list_stage *ENAME##_node_to_list_stage(struct starpu_rbtree_node *node) \
 	{ \
 		/* This assumes node is first member of stage */ \
 		return (struct ENAME##_prio_list_stage *) node; \
 	} \
-	static inline const struct ENAME##_prio_list_stage *ENAME##_node_to_list_stage_const(const struct starpu_rbtree_node *node) \
+	PRIO_LIST_INLINE const struct ENAME##_prio_list_stage *ENAME##_node_to_list_stage_const(const struct starpu_rbtree_node *node) \
 	{ \
 		/* This assumes node is first member of stage */ \
 		return (struct ENAME##_prio_list_stage *) node; \
 	} \
-	static inline void ENAME##_prio_list_init(struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE void ENAME##_prio_list_init(struct ENAME##_prio_list *priolist) \
 	{ \
 		starpu_rbtree_init(&priolist->tree); \
 		priolist->empty = 1; \
 	} \
-	static inline void ENAME##_prio_list_deinit(struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE void ENAME##_prio_list_deinit(struct ENAME##_prio_list *priolist) \
 	{ \
 		if (starpu_rbtree_empty(&priolist->tree)) \
 			return; \
@@ -109,13 +113,13 @@
 		starpu_rbtree_remove(&priolist->tree, root); \
 		free(stage); \
 	} \
-	static inline int ENAME##_prio_list_cmp_fn(int prio, const struct starpu_rbtree_node *node) \
+	PRIO_LIST_INLINE int ENAME##_prio_list_cmp_fn(int prio, const struct starpu_rbtree_node *node) \
 	{ \
 		/* Sort by decreasing order */ \
 		const struct ENAME##_prio_list_stage *e2 = ENAME##_node_to_list_stage_const(node); \
 		return (e2->prio - prio); \
 	} \
-	static inline struct ENAME##_prio_list_stage *ENAME##_prio_list_add(struct ENAME##_prio_list *priolist, int prio) \
+	PRIO_LIST_INLINE struct ENAME##_prio_list_stage *ENAME##_prio_list_add(struct ENAME##_prio_list *priolist, int prio) \
 	{ \
 		unsigned long slot; \
 		struct starpu_rbtree_node *node; \
@@ -132,25 +136,25 @@
 		} \
 		return stage; \
 	} \
-	static inline void ENAME##_prio_list_push_back(struct ENAME##_prio_list *priolist, struct ENAME *e) \
+	PRIO_LIST_INLINE void ENAME##_prio_list_push_back(struct ENAME##_prio_list *priolist, struct ENAME *e) \
 	{ \
 		struct ENAME##_prio_list_stage *stage = ENAME##_prio_list_add(priolist, e->PRIOFIELD); \
 		ENAME##_list_push_back(&stage->list, e); \
 		priolist->empty = 0; \
 	} \
-	static inline void ENAME##_prio_list_push_front(struct ENAME##_prio_list *priolist, struct ENAME *e) \
+	PRIO_LIST_INLINE void ENAME##_prio_list_push_front(struct ENAME##_prio_list *priolist, struct ENAME *e) \
 	{ \
 		struct ENAME##_prio_list_stage *stage = ENAME##_prio_list_add(priolist, e->PRIOFIELD); \
 		ENAME##_list_push_front(&stage->list, e); \
 		priolist->empty = 0; \
 	} \
-	static inline int ENAME##_prio_list_empty(const struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE int ENAME##_prio_list_empty(const struct ENAME##_prio_list *priolist) \
 	{ \
 		return priolist->empty; \
 	} \
 	/* Version of list_empty which does not use the cached empty flag,
 	 * typically used to compute the value of the flag */ \
-	static inline int ENAME##_prio_list_empty_slow(const struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE int ENAME##_prio_list_empty_slow(const struct ENAME##_prio_list *priolist) \
 	{ \
 		if (starpu_rbtree_empty(&priolist->tree)) \
 			return 1; \
@@ -161,7 +165,7 @@
 			return 1; \
 		return 0; \
 	} \
-	static inline void ENAME##_prio_list_erase(struct ENAME##_prio_list *priolist, struct ENAME *e) \
+	PRIO_LIST_INLINE void ENAME##_prio_list_erase(struct ENAME##_prio_list *priolist, struct ENAME *e) \
 	{ \
 		struct starpu_rbtree_node *node = starpu_rbtree_lookup(&priolist->tree, e->PRIOFIELD, ENAME##_prio_list_cmp_fn); \
 		struct ENAME##_prio_list_stage *stage = ENAME##_node_to_list_stage(node); \
@@ -176,7 +180,7 @@
 			priolist->empty = ENAME##_prio_list_empty_slow(priolist); \
 		} \
 	} \
-	static inline int ENAME##_prio_list_get_next_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node *node, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
+	PRIO_LIST_INLINE int ENAME##_prio_list_get_next_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node *node, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
 	{ \
 		struct ENAME##_prio_list_stage *stage; \
 		while(1) { \
@@ -201,12 +205,12 @@
 		*pstage = stage; \
 		return 1; \
 	} \
-	static inline int ENAME##_prio_list_get_first_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
+	PRIO_LIST_INLINE int ENAME##_prio_list_get_first_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
 	{ \
 		struct starpu_rbtree_node *node = starpu_rbtree_first(&priolist->tree); \
 		return ENAME##_prio_list_get_next_nonempty_stage(priolist, node, pnode, pstage); \
 	} \
-	static inline struct ENAME *ENAME##_prio_list_pop_front(struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_pop_front(struct ENAME##_prio_list *priolist) \
 	{ \
 		struct starpu_rbtree_node *node; \
 		struct ENAME##_prio_list_stage *stage; \
@@ -225,7 +229,7 @@
 		} \
 		return ret; \
 	} \
-	static inline int ENAME##_prio_list_get_prev_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node *node, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
+	PRIO_LIST_INLINE int ENAME##_prio_list_get_prev_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node *node, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
 	{ \
 		struct ENAME##_prio_list_stage *stage; \
 		while(1) { \
@@ -250,12 +254,12 @@
 		*pstage = stage; \
 		return 1; \
 	} \
-	static inline int ENAME##_prio_list_get_last_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
+	PRIO_LIST_INLINE int ENAME##_prio_list_get_last_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
 	{ \
 		struct starpu_rbtree_node *node = starpu_rbtree_last(&priolist->tree); \
 		return ENAME##_prio_list_get_prev_nonempty_stage(priolist, node, pnode, pstage); \
 	} \
-	static inline struct ENAME *ENAME##_prio_list_pop_back(struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_pop_back(struct ENAME##_prio_list *priolist) \
 	{ \
 		struct starpu_rbtree_node *node; \
 		struct ENAME##_prio_list_stage *stage; \
@@ -274,7 +278,7 @@
 		} \
 		return ret; \
 	} \
-	static inline void ENAME##_prio_list_push_prio_list_back(struct ENAME##_prio_list *priolist, struct ENAME##_prio_list *priolist_toadd) \
+	PRIO_LIST_INLINE void ENAME##_prio_list_push_prio_list_back(struct ENAME##_prio_list *priolist, struct ENAME##_prio_list *priolist_toadd) \
 	{ \
 		struct starpu_rbtree_node *node_toadd, *tmp; \
 		starpu_rbtree_for_each_remove(&priolist_toadd->tree, node_toadd, tmp) { \
@@ -306,7 +310,7 @@
 			} \
 		} \
 	} \
-	static inline int ENAME##_prio_list_ismember(const struct ENAME##_prio_list *priolist, const struct ENAME *e) \
+	PRIO_LIST_INLINE int ENAME##_prio_list_ismember(const struct ENAME##_prio_list *priolist, const struct ENAME *e) \
 	{ \
 		struct starpu_rbtree_node *node = starpu_rbtree_lookup(&priolist->tree, e->PRIOFIELD, ENAME##_prio_list_cmp_fn); \
 		if (node) { \
@@ -315,7 +319,7 @@
 		} \
 		return 0; \
 	} \
-	static inline struct ENAME *ENAME##_prio_list_begin(struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_begin(struct ENAME##_prio_list *priolist) \
 	{ \
 		struct starpu_rbtree_node *node; \
 		struct ENAME##_prio_list_stage *stage; \
@@ -323,9 +327,9 @@
 			return NULL; \
 		return ENAME##_list_begin(&stage->list); \
 	} \
-	static inline struct ENAME *ENAME##_prio_list_end(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_end(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED) \
 	{ return NULL; } \
-	static inline struct ENAME *ENAME##_prio_list_next(struct ENAME##_prio_list *priolist, const struct ENAME *i) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_next(struct ENAME##_prio_list *priolist, const struct ENAME *i) \
 	{ \
 		struct ENAME *next = ENAME##_list_next(i); \
 		if (next != ENAME##_list_end(NULL)) \
@@ -337,7 +341,7 @@
 			return NULL; \
 		return ENAME##_list_begin(&stage->list); \
 	} \
-	static inline struct ENAME *ENAME##_prio_list_last(struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_last(struct ENAME##_prio_list *priolist) \
 	{ \
 		struct starpu_rbtree_node *node; \
 		struct ENAME##_prio_list_stage *stage; \
@@ -345,9 +349,9 @@
 			return NULL; \
 		return ENAME##_list_last(&stage->list); \
 	} \
-	static inline struct ENAME *ENAME##_prio_list_alpha(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_alpha(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED) \
 	{ return NULL; } \
-	static inline struct ENAME *ENAME##_prio_list_prev(struct ENAME##_prio_list *priolist, const struct ENAME *i) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_prev(struct ENAME##_prio_list *priolist, const struct ENAME *i) \
 	{ \
 		struct ENAME *next = ENAME##_list_prev(i); \
 		if (next != ENAME##_list_alpha(NULL)) \
@@ -365,11 +369,11 @@
 /* gdbinit can't recurse in a tree. Use a mere list in debugging mode.  */
 #define PRIO_LIST_CREATE_TYPE(ENAME, PRIOFIELD) \
 	struct ENAME##_prio_list { struct ENAME##_list list; }; \
-	static inline void ENAME##_prio_list_init(struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE void ENAME##_prio_list_init(struct ENAME##_prio_list *priolist) \
 	{ ENAME##_list_init(&(priolist)->list); } \
-	static inline void ENAME##_prio_list_deinit(struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE void ENAME##_prio_list_deinit(struct ENAME##_prio_list *priolist) \
 	{ (void) (priolist); /* ENAME##_list_deinit(&(priolist)->list); */ } \
-	static inline void ENAME##_prio_list_push_back(struct ENAME##_prio_list *priolist, struct ENAME *e) \
+	PRIO_LIST_INLINE void ENAME##_prio_list_push_back(struct ENAME##_prio_list *priolist, struct ENAME *e) \
 	{ \
 		struct ENAME *cur; \
 		for (cur  = ENAME##_list_begin(&(priolist)->list); \
@@ -382,7 +386,7 @@
 		else \
 			ENAME##_list_insert_before(&(priolist)->list, (e), cur); \
 	} \
-	static inline void ENAME##_prio_list_push_front(struct ENAME##_prio_list *priolist, struct ENAME *e) \
+	PRIO_LIST_INLINE void ENAME##_prio_list_push_front(struct ENAME##_prio_list *priolist, struct ENAME *e) \
 	{ \
 		struct ENAME *cur; \
 		for (cur  = ENAME##_list_begin(&(priolist)->list); \
@@ -395,29 +399,29 @@
 		else \
 			ENAME##_list_insert_before(&(priolist)->list, (e), cur); \
 	} \
-	static inline int ENAME##_prio_list_empty(const struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE int ENAME##_prio_list_empty(const struct ENAME##_prio_list *priolist) \
 	{ return ENAME##_list_empty(&(priolist)->list); } \
-	static inline void ENAME##_prio_list_erase(struct ENAME##_prio_list *priolist, struct ENAME *e) \
+	PRIO_LIST_INLINE void ENAME##_prio_list_erase(struct ENAME##_prio_list *priolist, struct ENAME *e) \
 	{ ENAME##_list_erase(&(priolist)->list, (e)); } \
-	static inline struct ENAME *ENAME##_prio_list_pop_front(struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_pop_front(struct ENAME##_prio_list *priolist) \
 	{ return ENAME##_list_pop_front(&(priolist)->list); } \
-	static inline struct ENAME *ENAME##_prio_list_pop_back(struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_pop_back(struct ENAME##_prio_list *priolist) \
 	{ return ENAME##_list_pop_back(&(priolist)->list); } \
-	static inline void ENAME##_prio_list_push_prio_list_back(struct ENAME##_prio_list *priolist, struct ENAME##_prio_list *priolist_toadd) \
+	PRIO_LIST_INLINE void ENAME##_prio_list_push_prio_list_back(struct ENAME##_prio_list *priolist, struct ENAME##_prio_list *priolist_toadd) \
 	{ ENAME##_list_push_list_back(&(priolist)->list, &(priolist_toadd)->list); } \
-	static inline int ENAME##_prio_list_ismember(const struct ENAME##_prio_list *priolist, const struct ENAME *e) \
+	PRIO_LIST_INLINE int ENAME##_prio_list_ismember(const struct ENAME##_prio_list *priolist, const struct ENAME *e) \
 	{ return ENAME##_list_ismember(&(priolist)->list, (e)); } \
-	static inline struct ENAME *ENAME##_prio_list_begin(struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_begin(struct ENAME##_prio_list *priolist) \
 	{ return ENAME##_list_begin(&(priolist)->list); } \
-	static inline struct ENAME *ENAME##_prio_list_end(struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_end(struct ENAME##_prio_list *priolist) \
 	{ return ENAME##_list_end(&(priolist)->list); } \
-	static inline struct ENAME *ENAME##_prio_list_next(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED, const struct ENAME *i) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_next(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED, const struct ENAME *i) \
 	{ return ENAME##_list_next(i); } \
-	static inline struct ENAME *ENAME##_prio_list_last(struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_last(struct ENAME##_prio_list *priolist) \
 	{ return ENAME##_list_last(&(priolist)->list); } \
-	static inline struct ENAME *ENAME##_prio_list_alpha(struct ENAME##_prio_list *priolist) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_alpha(struct ENAME##_prio_list *priolist) \
 	{ return ENAME##_list_alpha(&(priolist)->list); } \
-	static inline struct ENAME *ENAME##_prio_list_prev(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED, const struct ENAME *i) \
+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_prev(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED, const struct ENAME *i) \
 	{ return ENAME##_list_prev(i); } \
 
 #endif

+ 0 - 1
src/core/task.h

@@ -139,7 +139,6 @@ int _starpu_task_wait_for_all_in_ctx_and_return_nb_waited_tasks(unsigned sched_c
 #ifdef BUILDING_STARPU
 LIST_CREATE_TYPE_NOSTRUCT(starpu_task, prev, next);
 PRIO_LIST_CREATE_TYPE(starpu_task, priority);
-#define __STARPU_TASK_LIST_H__
 #endif
 
 #endif // __CORE_TASK_H__

+ 2 - 2
src/datawizard/copy_driver.c

@@ -533,7 +533,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			ret = _starpu_disk_full_write(src_node, dst_node, obj, ptr, size, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
 			if (ret == 0)
 				/* write is already finished, ptr was allocated in pack_data */
-				free(ptr);
+				starpu_free_flags(ptr, size, 0);
 
 			/* For now, asynchronous is not supported */
 			STARPU_ASSERT(ret == 0);
@@ -554,7 +554,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 				/* read is already finished, we can already unpack */
 				handle->ops->unpack_data(handle, dst_node, ptr, size);
 				/* ptr is allocated in full_read */
-				free(ptr);
+				starpu_free_flags(ptr, size, 0);
 			}
 
 			/* For now, asynchronous is not supported */

+ 75 - 0
src/drivers/cuda/starpu_cusparse.c

@@ -0,0 +1,75 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012, 2014, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/config.h>
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include <core/workers.h>
+
+#ifdef HAVE_LIBCUSPARSE
+#include <cusparse.h>
+
+static cusparseHandle_t cusparse_handles[STARPU_NMAXWORKERS];
+static cusparseHandle_t main_handle;
+
+static void init_cusparse_func(void *args STARPU_ATTRIBUTE_UNUSED)
+{
+	cusparseCreate(&cusparse_handles[starpu_worker_get_id_check()]);
+#if HAVE_DECL_CUSPARSESETSTREAM
+	cusparseSetStream(cusparse_handles[starpu_worker_get_id_check()], starpu_cuda_get_local_stream());
+#else
+	cusparseSetKernelStream(cusparse_handles[starpu_worker_get_id_check()], starpu_cuda_get_local_stream());
+#endif
+}
+
+static void shutdown_cusparse_func(void *args STARPU_ATTRIBUTE_UNUSED)
+{
+	cusparseDestroy(cusparse_handles[starpu_worker_get_id_check()]);
+}
+#endif
+
+void starpu_cusparse_init(void)
+{
+#ifdef HAVE_LIBCUSPARSE
+	starpu_execute_on_each_worker(init_cusparse_func, NULL, STARPU_CUDA);
+
+	if (cusparseCreate(&main_handle) != CUSPARSE_STATUS_SUCCESS)
+		main_handle = NULL;
+#endif
+}
+
+void starpu_cusparse_shutdown(void)
+{
+#ifdef HAVE_LIBCUSPARSE
+	starpu_execute_on_each_worker(shutdown_cusparse_func, NULL, STARPU_CUDA);
+
+	if (main_handle)
+		cusparseDestroy(main_handle);
+#endif
+}
+
+#ifdef HAVE_LIBCUSPARSE
+cusparseHandle_t starpu_cusparse_get_local_handle(void)
+{
+	int workerid = starpu_worker_get_id();
+	if (workerid >= 0)
+		return cusparse_handles[workerid];
+	else
+		return main_handle;
+}
+#endif

+ 5 - 27
src/sched_policies/component_fifo.c

@@ -172,18 +172,8 @@ static struct starpu_task * fifo_pull_task(struct starpu_sched_component * compo
 
 	// When a pop is called, a can_push is called for pushing tasks onto
 	// the empty place of the queue left by the popped task.
-	int i,ret;
-	for(i=0; i < component->nparents; i++)
-	{
-		if(component->parents[i] == NULL)
-			continue;
-		else
-		{
-			ret = component->parents[i]->can_push(component->parents[i]);
-			if(ret)
-				break;
-		}
-	}
+
+	starpu_sched_component_send_can_push_to_parents(component); 
 
 	if(task)
 		return task;
@@ -199,24 +189,12 @@ static struct starpu_task * fifo_pull_task(struct starpu_sched_component * compo
 static int fifo_can_push(struct starpu_sched_component * component)
 {
 	STARPU_ASSERT(component && starpu_sched_component_is_fifo(component));
-	int ret = 0;
 	int res = 0;
+	struct starpu_task * task;
 
-	STARPU_ASSERT(component->nchildren == 1);
-	struct starpu_sched_component * child = component->children[0];
-	struct starpu_task * task = NULL;
+	task = starpu_sched_component_pump_downstream(component, &res); 
 
-	while (1)
-	{
-		task = starpu_sched_component_pull_task(component,component);
-		if (!task)
-			break;
-		ret = starpu_sched_component_push_task(component,child,task);
-		if (ret)
-			break;
-		res = 1;
-	}
-	if(task && ret)
+	if(task)
 		fifo_push_local_task(component,task,1);
 
 	return res;

+ 7 - 2
src/sched_policies/component_heft.c

@@ -90,12 +90,17 @@ static int heft_progress_one(struct starpu_sched_component *component)
 			min_exp_end_with_task[n] = DBL_MAX;
 			max_exp_end_with_task[n] = 0.0;
 
-			nsuitable_components[n] = starpu_mct_compute_expected_times(component, tasks[n],
+			nsuitable_components[n] = starpu_mct_compute_execution_times(component, tasks[n],
+					estimated_lengths + offset,
+					estimated_transfer_length + offset,
+					suitable_components + offset);
+
+			starpu_mct_compute_expected_times(component, tasks[n],
 					estimated_lengths + offset,
 					estimated_transfer_length + offset,
 					estimated_ends_with_task + offset,
 					&min_exp_end_with_task[n], &max_exp_end_with_task[n],
-					suitable_components + offset);
+							  suitable_components + offset, nsuitable_components[n]);
 		}
 
 		int best_task = 0;

+ 24 - 5
src/sched_policies/component_mct.c

@@ -53,9 +53,8 @@ static int mct_push_task(struct starpu_sched_component * component, struct starp
 	int suitable_components[component->nchildren];
 	int nsuitable_components = 0;
 
-	nsuitable_components = starpu_mct_compute_expected_times(component, task,
-			estimated_lengths, estimated_transfer_length, estimated_ends_with_task,
-			&min_exp_end_with_task, &max_exp_end_with_task, suitable_components);
+	nsuitable_components = starpu_mct_compute_execution_times(component, task,
+								  estimated_lengths, estimated_transfer_length, suitable_components);
 
 	/* If no suitable components were found, it means that the perfmodel of
 	 * the task had been purged since it has been pushed on the mct component.
@@ -64,6 +63,15 @@ static int mct_push_task(struct starpu_sched_component * component, struct starp
 	if(nsuitable_components == 0)
 		return 1;
 
+
+	/* Entering critical section to make sure no two workers
+	   make scheduling decisions at the same time */
+	STARPU_PTHREAD_MUTEX_LOCK(&d->scheduling_mutex); 
+
+
+	starpu_mct_compute_expected_times(component, task, estimated_lengths, estimated_transfer_length, 
+					  estimated_ends_with_task, &min_exp_end_with_task, &max_exp_end_with_task, suitable_components, nsuitable_components);
+
 	double best_fitness = DBL_MAX;
 	int best_icomponent = -1;
 	for(i = 0; i < nsuitable_components; i++)
@@ -73,7 +81,7 @@ static int mct_push_task(struct starpu_sched_component * component, struct starp
 #warning FIXME: take energy consumption into account
 #endif
 		double tmp = starpu_mct_compute_fitness(d,
-					     estimated_ends_with_task[icomponent] - estimated_transfer_length[icomponent],
+					     estimated_ends_with_task[icomponent],
 					     min_exp_end_with_task,
 					     max_exp_end_with_task,
 					     estimated_transfer_length[icomponent],
@@ -90,8 +98,10 @@ static int mct_push_task(struct starpu_sched_component * component, struct starp
 	 * the task had been purged since it has been pushed on the mct component.
 	 * We should send a push_fail message to its parent so that it will
 	 * be able to reschedule the task properly. */
-	if(best_icomponent == -1)
+	if(best_icomponent == -1) {
+		STARPU_PTHREAD_MUTEX_UNLOCK(&d->scheduling_mutex); 
 		return 1;
+	}
 
 	best_component = component->children[best_icomponent];
 
@@ -101,11 +111,18 @@ static int mct_push_task(struct starpu_sched_component * component, struct starp
 	if(starpu_sched_component_is_worker(best_component))
 	{
 		best_component->can_pull(best_component);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&d->scheduling_mutex); 
+		
 		return 1;
 	}
 
 	_STARPU_TASK_BREAK_ON(task, sched);
 	int ret = starpu_sched_component_push_task(component, best_component, task);
+	
+	/* I can now exit the critical section: Pushing the task below ensures that its execution 
+	   time will be taken into account for subsequent scheduling decisions */
+	STARPU_PTHREAD_MUTEX_UNLOCK(&d->scheduling_mutex); 
+
 	return ret;
 }
 
@@ -113,6 +130,7 @@ static void mct_component_deinit_data(struct starpu_sched_component * component)
 {
 	STARPU_ASSERT(starpu_sched_component_is_mct(component));
 	struct _starpu_mct_data * d = component->data;
+	STARPU_PTHREAD_MUTEX_DESTROY(&d->scheduling_mutex); 
 	free(d);
 }
 
@@ -127,6 +145,7 @@ struct starpu_sched_component * starpu_sched_component_mct_create(struct starpu_
 	struct _starpu_mct_data *data = starpu_mct_init_parameters(params);
 
 	component->data = data;
+	STARPU_PTHREAD_MUTEX_INIT(&data->scheduling_mutex, NULL); 
 
 	component->push_task = mct_push_task;
 	component->deinit_data = mct_component_deinit_data;

+ 1 - 0
src/sched_policies/component_perfmodel_select.c

@@ -88,6 +88,7 @@ struct starpu_sched_component * starpu_sched_component_perfmodel_select_create(s
 	data->perfmodel_component = params->perfmodel_component;
 
 	component->data = data;
+	component->can_pull = starpu_sched_component_send_can_push_to_parents; 
 	component->push_task = perfmodel_select_push_task;
 	component->deinit_data = perfmodel_select_component_deinit_data;
 	component->estimated_end = starpu_sched_component_estimated_end_min;

+ 23 - 29
src/sched_policies/component_prio.c

@@ -142,6 +142,16 @@ static int prio_push_local_task(struct starpu_sched_component * component, struc
 			starpu_sched_component_prefetch_on_node(component, task);
 			STARPU_TRACE_SCHED_COMPONENT_PUSH_PRIO(component, prio->ntasks, exp_len);
 		}
+		
+		if(!isnan(task->predicted_transfer)) {
+			double end = prio_estimated_end(component); 
+			double tfer_end = starpu_timing_now() + task->predicted_transfer; 
+			if(tfer_end < end) 
+				task->predicted_transfer = 0.0; 
+			else 
+				task->predicted_transfer = tfer_end - end; 
+			exp_len += task->predicted_transfer; 
+		}
 
 		if(!isnan(task->predicted))
 		{
@@ -152,9 +162,9 @@ static int prio_push_local_task(struct starpu_sched_component * component, struc
 		STARPU_ASSERT(!isnan(prio->exp_len));
 		STARPU_ASSERT(!isnan(prio->exp_start));
 		
+		STARPU_COMPONENT_MUTEX_UNLOCK(mutex);
 		if(!is_pushback)
 			component->can_pull(component);
-		STARPU_COMPONENT_MUTEX_UNLOCK(mutex);
 	}
 
 	return ret;
@@ -192,6 +202,12 @@ static struct starpu_task * prio_pull_task(struct starpu_sched_component * compo
 			}
 		}
 		STARPU_ASSERT_MSG(prio->exp_len>=0, "prio->exp_len=%lf\n",prio->exp_len);
+		if(!isnan(task->predicted_transfer)){
+			prio->exp_start += task->predicted_transfer; 
+			prio->exp_len -= task->predicted_transfer; 
+			STARPU_ASSERT_MSG(prio->exp_len>=0, "prio->exp_len=%lf\n",prio->exp_len);
+		}
+
 		prio->exp_end = prio->exp_start + prio->exp_len;
 		if(prio->ntasks == 0)
 			prio->exp_len = 0.0;
@@ -205,18 +221,8 @@ static struct starpu_task * prio_pull_task(struct starpu_sched_component * compo
 
 	// When a pop is called, a can_push is called for pushing tasks onto
 	// the empty place of the queue left by the popped task.
-	int i,ret;
-	for(i=0; i < component->nparents; i++)
-	{
-		if(component->parents[i] == NULL)
-			continue;
-		else
-		{
-			ret = component->parents[i]->can_push(component->parents[i]);
-			if(ret)
-				break;
-		}
-	}
+
+	starpu_sched_component_send_can_push_to_parents(component); 
 	
 	if(task)
 		return task;
@@ -232,25 +238,13 @@ static struct starpu_task * prio_pull_task(struct starpu_sched_component * compo
 static int prio_can_push(struct starpu_sched_component * component)
 {
 	STARPU_ASSERT(component && starpu_sched_component_is_prio(component));
-	int ret = 0;
 	int res = 0;
-
-	STARPU_ASSERT(component->nchildren == 1);
-	struct starpu_sched_component * child = component->children[0];
 	struct starpu_task * task;
 
-	while (1)
-	{
-		task = starpu_sched_component_pull_task(component,component);
-		if (!task)
-			break;
-		ret = starpu_sched_component_push_task(component,child,task);	
-		if (ret)
-			break;
-		res = 1;
-	}
-	if(task && ret)
-		prio_push_local_task(component,task,1);
+	task = starpu_sched_component_pump_downstream(component, &res); 
+
+	if(task)
+		STARPU_ASSERT(!prio_push_local_task(component,task,1));
 
 	return res;
 }

+ 57 - 1
src/sched_policies/component_sched.c

@@ -263,8 +263,8 @@ void _starpu_sched_component_update_workers(struct starpu_sched_component * comp
 	{
 		_starpu_sched_component_update_workers(component->children[i]);
 		starpu_bitmap_or(component->workers, component->children[i]->workers);
-		component->notify_change_workers(component);
 	}
+	component->notify_change_workers(component);
 }
 
 /* recursively set the component->workers_in_ctx in component's subtree
@@ -283,6 +283,7 @@ void _starpu_sched_component_update_workers_in_ctx(struct starpu_sched_component
 	{
 		struct starpu_sched_component * child = component->children[i];
 		_starpu_sched_component_update_workers_in_ctx(child, sched_ctx_id);
+		starpu_bitmap_or(component->workers_in_ctx, child->workers_in_ctx);
 	}
 	set_properties(component);
 	component->notify_change_workers(component);
@@ -366,6 +367,35 @@ struct starpu_task * starpu_sched_component_pull_task(struct starpu_sched_compon
 	return task;
 }
 
+
+/* Pump mechanic to get the task flow rolling. Takes tasks from component and send them to the child.
+   To be used by components with only one child */
+struct starpu_task* starpu_sched_component_pump_downstream(struct starpu_sched_component *component, int* success) 
+{
+	int ret = 0;
+	
+	STARPU_ASSERT(component->nchildren == 1);
+	struct starpu_sched_component * child = component->children[0];
+	struct starpu_task * task;
+
+	while (1)
+	{
+		task = starpu_sched_component_pull_task(component,component);
+		if (!task)
+			break;
+		ret = starpu_sched_component_push_task(component,child,task);	
+		if (ret)
+			break;
+		if(success) 
+			* success = 1; 
+	}
+	if(task && ret)
+		return task;
+
+	return NULL;
+	
+}
+
 void starpu_sched_tree_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
 {
 	STARPU_ASSERT(sched_ctx_id < STARPU_NMAX_SCHED_CTXS);
@@ -555,6 +585,32 @@ static void starpu_sched_component_can_pull(struct starpu_sched_component * comp
 		component->children[i]->can_pull(component->children[i]);
 }
 
+
+/* Alternative can_pull which says that this component does not want
+   to pull but prefers that you push. It can be used by decision
+   components, in which decisions are usually taken in their push()
+   functions */
+void starpu_sched_component_send_can_push_to_parents(struct starpu_sched_component * component)
+{
+	STARPU_ASSERT(component);
+	STARPU_ASSERT(!starpu_sched_component_is_worker(component));
+	
+	int i,ret;
+	for(i=0; i < component->nparents; i++)
+	{
+		if(component->parents[i] == NULL)
+			continue;
+		else
+		{
+			ret = component->parents[i]->can_push(component->parents[i]);
+			if(ret)
+				break;
+		}
+	}
+	
+}
+
+
 double starpu_sched_component_estimated_load(struct starpu_sched_component * component)
 {
 	double sum = 0.0;

+ 3 - 3
src/sched_policies/component_worker.c

@@ -502,10 +502,10 @@ static double simple_worker_estimated_end(struct starpu_sched_component * compon
 {
 	struct _starpu_worker_component_data * data = component->data;
 	double now = starpu_timing_now();
-	if (now > data->list->exp_start)
+	if (now + data->list->pipeline_len > data->list->exp_start )
 	{
-		data->list->exp_start = now;
-		data->list->exp_end = now + data->list->exp_len;
+		data->list->exp_start = now + data->list->pipeline_len;
+		data->list->exp_end = data->list->exp_start + data->list->exp_len;
 	}
 	return data->list->exp_end;
 }

+ 36 - 23
src/sched_policies/helper_mct.c

@@ -82,26 +82,28 @@ struct _starpu_mct_data *starpu_mct_init_parameters(struct starpu_sched_componen
 
 /* compute predicted_end by taking into account the case of the predicted transfer and the predicted_end overlap
  */
-static double compute_expected_time(double now, double predicted_end, double predicted_length, double *predicted_transfer)
+static double compute_expected_time(double now, double predicted_end, double predicted_length, double predicted_transfer)
 {
-	STARPU_ASSERT(!isnan(now + predicted_end + predicted_length + *predicted_transfer));
-	STARPU_ASSERT_MSG(now >= 0.0 && predicted_end >= 0.0 && predicted_length >= 0.0 && *predicted_transfer >= 0.0, "now=%lf, predicted_end=%lf, predicted_length=%lf, *predicted_transfer=%lf\n", now, predicted_end, predicted_length, *predicted_transfer);
+	STARPU_ASSERT(!isnan(now + predicted_end + predicted_length + predicted_transfer));
+	STARPU_ASSERT_MSG(now >= 0.0 && predicted_end >= 0.0 && predicted_length >= 0.0 && predicted_transfer >= 0.0, "now=%lf, predicted_end=%lf, predicted_length=%lf, predicted_transfer=%lf\n", now, predicted_end, predicted_length, predicted_transfer);
 
 	/* TODO: actually schedule transfers */
-	if (now + *predicted_transfer < predicted_end)
+	/* Compute the transfer time which will not be overlapped */
+	/* However, no modification in calling function so that the whole transfer time is counted as a penalty */
+	if (now + predicted_transfer < predicted_end)
 	{
 		/* We may hope that the transfer will be finished by
 		 * the start of the task. */
-		*predicted_transfer = 0;
+		predicted_transfer = 0;
 	}
 	else
 	{
 		/* The transfer will not be finished by then, take the
 		 * remainder into account */
-		*predicted_transfer -= (predicted_end - now);
+		predicted_transfer -= (predicted_end - now);
 	}
 
-	predicted_end += *predicted_transfer;
+	predicted_end += predicted_transfer;
 	predicted_end += predicted_length;
 
 	return predicted_end;
@@ -117,12 +119,10 @@ double starpu_mct_compute_fitness(struct _starpu_mct_data * d, double exp_end, d
 		+ d->_gamma * d->idle_power * (exp_end - max_exp_end);
 }
 
-int starpu_mct_compute_expected_times(struct starpu_sched_component *component, struct starpu_task *task,
-		double *estimated_lengths, double *estimated_transfer_length, double *estimated_ends_with_task,
-		double *min_exp_end_with_task, double *max_exp_end_with_task, int *suitable_components)
+int starpu_mct_compute_execution_times(struct starpu_sched_component *component, struct starpu_task *task,
+				       double *estimated_lengths, double *estimated_transfer_length, int *suitable_components) 
 {
 	int nsuitable_components = 0;
-	double now = starpu_timing_now();
 
 	int i;
 	for(i = 0; i < component->nchildren; i++)
@@ -136,21 +136,34 @@ int starpu_mct_compute_expected_times(struct starpu_sched_component *component,
 				continue;
 			STARPU_ASSERT_MSG(estimated_lengths[i]>=0, "component=%p, child[%d]=%p, estimated_lengths[%d]=%lf\n", component, i, c, i, estimated_lengths[i]);
 
-			/* Estimated availability of worker */
-			double estimated_end = c->estimated_end(c);
-			if (estimated_end < now)
-				estimated_end = now;
 			estimated_transfer_length[i] = starpu_sched_component_transfer_length(c, task);
-			estimated_ends_with_task[i] = compute_expected_time(now,
-									    estimated_end,
-									    estimated_lengths[i],
-									    &estimated_transfer_length[i]);
-			if(estimated_ends_with_task[i] < *min_exp_end_with_task)
-				*min_exp_end_with_task = estimated_ends_with_task[i];
-			if(estimated_ends_with_task[i] > *max_exp_end_with_task)
-				*max_exp_end_with_task = estimated_ends_with_task[i];
 			suitable_components[nsuitable_components++] = i;
 		}
 	}
 	return nsuitable_components;
 }
+
+void starpu_mct_compute_expected_times(struct starpu_sched_component *component, struct starpu_task *task,
+		double *estimated_lengths, double *estimated_transfer_length, double *estimated_ends_with_task,
+				       double *min_exp_end_with_task, double *max_exp_end_with_task, int *suitable_components, int nsuitable_components)
+{
+	int i;
+	double now = starpu_timing_now();
+	for(i = 0; i < nsuitable_components; i++)
+	{
+		int icomponent = suitable_components[i];
+		struct starpu_sched_component * c = component->children[icomponent];
+		/* Estimated availability of worker */
+		double estimated_end = c->estimated_end(c);
+		if (estimated_end < now)
+			estimated_end = now;
+		estimated_ends_with_task[icomponent] = compute_expected_time(now,
+								    estimated_end,
+								    estimated_lengths[icomponent],
+								    estimated_transfer_length[icomponent]);
+		if(estimated_ends_with_task[icomponent] < *min_exp_end_with_task)
+			*min_exp_end_with_task = estimated_ends_with_task[icomponent];
+		if(estimated_ends_with_task[icomponent] > *max_exp_end_with_task)
+			*max_exp_end_with_task = estimated_ends_with_task[icomponent];
+	}
+}

+ 7 - 2
src/sched_policies/helper_mct.h

@@ -20,12 +20,17 @@ struct _starpu_mct_data
 	double beta;
 	double _gamma;
 	double idle_power;
+	starpu_pthread_mutex_t scheduling_mutex; 
 };
 
 struct _starpu_mct_data *starpu_mct_init_parameters(struct starpu_sched_component_mct_data *params);
 
-int starpu_mct_compute_expected_times(struct starpu_sched_component *component, struct starpu_task *task,
+int starpu_mct_compute_execution_times(struct starpu_sched_component *component, struct starpu_task *task,
+				       double *estimated_lengths, double *estimated_transfer_length, int *suitable_components);
+
+
+void starpu_mct_compute_expected_times(struct starpu_sched_component *component, struct starpu_task *task,
 		double *estimated_lengths, double *estimated_transfer_length, double *estimated_ends_with_task,
-		double *min_exp_end_with_task, double *max_exp_end_with_task, int *suitable_components);
+				       double *min_exp_end_with_task, double *max_exp_end_with_task, int *suitable_components, int nsuitable_components);
 
 double starpu_mct_compute_fitness(struct _starpu_mct_data * d, double exp_end, double min_exp_end, double max_exp_end, double transfer_len, double local_energy);

+ 5 - 0
src/sched_policies/prio_deque.h

@@ -25,6 +25,11 @@ struct _starpu_prio_deque
 	int size_array;
 	unsigned ntasks;
 	unsigned nprocessed;
+	// Assumptions: 
+	// exp_len is the sum of predicted_length + predicted_tansfer of all tasks in list
+	// exp_start is the time at which the first task of list can start
+	// exp_end = exp_start + exp_end
+	// Careful: those are NOT maintained by the prio_queue operations
 	double exp_start, exp_end, exp_len;
 };
 

+ 1 - 0
tests/Makefile.am

@@ -154,6 +154,7 @@ myPROGRAMS +=					\
 	errorcheck/starpu_init_noworker		\
 	errorcheck/invalid_tasks		\
 	helper/cublas_init			\
+	helper/cusparse_init			\
 	helper/pinned_memory			\
 	helper/execute_on_all			\
 	microbenchs/display_structures_size	\

+ 72 - 0
tests/helper/cusparse_init.c

@@ -0,0 +1,72 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014, 2016-2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <starpu.h>
+#include <stdlib.h>
+#include "../helper.h"
+
+/*
+ * Test initializing cusparse, and how much time that takes
+ */
+
+static double start;
+static double end;
+
+//static float *data = NULL;
+
+int main(int argc, char **argv)
+{
+	int ret;
+
+	ret = starpu_initialize(NULL, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	unsigned ngpus = starpu_cuda_worker_get_count();
+
+	double init_timing;
+	double shutdown_timing;
+
+	start = starpu_timing_now();
+	starpu_cusparse_init();
+	end = starpu_timing_now();
+	init_timing = end - start;
+
+	start = starpu_timing_now();
+	starpu_cusparse_shutdown();
+	end = starpu_timing_now();
+	shutdown_timing = end - start;
+
+	FPRINTF(stderr, "Total:\n");
+	FPRINTF(stderr, "\tinit: %2.2f us\n", init_timing/(1000));
+	FPRINTF(stderr, "\tshutdown: %2.2f us\n", shutdown_timing/(1000));
+
+	if (ngpus != 0)
+	{
+		FPRINTF(stderr, "per-GPU (#gpu = %u):\n", ngpus);
+
+		FPRINTF(stderr, "\tinit: %2.2f us\n", init_timing/(1000*ngpus));
+		FPRINTF(stderr, "\tshutdown: %2.2f us\n", shutdown_timing/(1000*ngpus));
+	}
+
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+}