浏览代码

merge trunk

Nathalie Furmento 10 年之前
父节点
当前提交
f885c06876
共有 70 个文件被更改,包括 918 次插入506 次删除
  1. 11 1
      ChangeLog
  2. 9 1
      Makefile.am
  3. 15 1
      configure.ac
  4. 20 0
      doc/doxygen/chapters/40environment_variables.doxy
  5. 4 0
      doc/doxygen/chapters/api/data_interfaces.doxy
  6. 7 5
      examples/interface/complex_interface.c
  7. 5 2
      examples/pi/SobolQRNG/sobol_gold.c
  8. 1 1
      examples/pi/SobolQRNG/sobol_primitives.c
  9. 2 2
      examples/sched_ctx/sched_ctx_without_sched_policy.c
  10. 2 0
      include/pthread_win32/pthread.h
  11. 0 8
      include/starpu_config.h.in
  12. 2 0
      include/starpu_data_interfaces.h
  13. 4 1
      include/starpu_profiling.h
  14. 2 2
      include/starpu_task.h
  15. 8 8
      include/starpu_thread.h
  16. 20 0
      include/starpu_util.h
  17. 2 2
      mpi/examples/mpi_lu/plu_outofcore_example.c
  18. 1 1
      mpi/src/starpu_mpi.c
  19. 4 4
      mpi/src/starpu_mpi_task_insert.c
  20. 3 3
      socl/src/cl_enqueuecopybuffer.c
  21. 2 2
      socl/src/cl_enqueuereadbuffer.c
  22. 2 2
      socl/src/cl_enqueuewritebuffer.c
  23. 5 1
      src/common/fxt.c
  24. 3 2
      src/common/fxt.h
  25. 8 2
      src/common/thread.c
  26. 34 5
      src/common/timing.c
  27. 5 3
      src/common/timing.h
  28. 92 3
      src/common/utils.c
  29. 9 0
      src/common/utils.h
  30. 1 1
      src/core/combined_workers.c
  31. 8 8
      src/core/disk_ops/disk_leveldb.cpp
  32. 8 9
      src/core/disk_ops/disk_stdio.c
  33. 4 2
      src/core/disk_ops/disk_unistd.c
  34. 4 2
      src/core/disk_ops/disk_unistd_o_direct.c
  35. 12 10
      src/core/disk_ops/unistd/disk_unistd_global.c
  36. 3 1
      src/core/jobs.h
  37. 2 0
      src/core/perfmodel/perfmodel.c
  38. 65 40
      src/core/perfmodel/perfmodel_bus.c
  39. 22 1
      src/core/perfmodel/perfmodel_history.c
  40. 8 2
      src/core/perfmodel/perfmodel_nan.c
  41. 2 2
      src/core/sched_ctx.c
  42. 6 0
      src/core/simgrid.c
  43. 4 0
      src/core/task.c
  44. 2 2
      src/core/topology.c
  45. 6 2
      src/core/workers.c
  46. 8 1
      src/core/workers.h
  47. 33 2
      src/datawizard/coherency.c
  48. 23 23
      src/datawizard/copy_driver.c
  49. 7 3
      src/datawizard/data_request.c
  50. 4 1
      src/datawizard/filters.c
  51. 8 6
      src/datawizard/interfaces/block_interface.c
  52. 4 2
      src/datawizard/interfaces/data_interface.c
  53. 11 123
      src/datawizard/interfaces/matrix_interface.c
  54. 7 7
      src/debug/traces/starpu_fxt.c
  55. 141 79
      src/drivers/cuda/driver_cuda.c
  56. 17 4
      src/drivers/driver_common/driver_common.c
  57. 1 1
      src/drivers/driver_common/driver_common.h
  58. 107 56
      src/drivers/opencl/driver_opencl.c
  59. 5 2
      src/drivers/opencl/driver_opencl_utils.c
  60. 2 2
      src/profiling/bound.c
  61. 2 2
      src/profiling/profiling.h
  62. 9 4
      src/top/starpu_top_connection.c
  63. 0 1
      src/top/starpu_top_task.c
  64. 4 5
      tests/loader.c
  65. 6 0
      tests/main/driver_api/run_driver.c
  66. 34 8
      tests/overlap/gpu_concurrency.c
  67. 41 25
      tools/gdbinit
  68. 1 1
      tools/starpu_calibrate_bus.c
  69. 2 2
      tools/starpu_perfmodel_display.c
  70. 2 2
      tools/starpu_perfmodel_plot.c

+ 11 - 1
ChangeLog

@@ -47,6 +47,8 @@ New features:
     CUDA and OpenCL kernel execution.
   * Add CUDA concurrent kernel execution support through
     the STARPU_NWORKER_PER_CUDA environment variable.
+  * Add CUDA and OpenCL kernel submission pipelining, to overlap costs and allow
+    concurrent kernel execution on Fermi cards.
   * New locality work stealing scheduler (lws).
   * Add STARPU_VARIABLE_NBUFFERS to be set in cl.nbuffers, and nbuffers and
     modes field to the task structure, which permit to define codelets taking a
@@ -100,8 +102,16 @@ The scheduling context release
 New features:
   * One can register an existing on-GPU buffer to be used by a handle.
   * Add the starpu_paje_summary statistics tool.
+  * Enable gpu-gpu transfers for matrices.
+  * Let interfaces declare which transfers they allow with the can_copy
+    methode.
 
-StarPU 1.1.2 (svn revision xxx)
+Small changes:
+  * Lock performance model files while writing and reading them to avoid
+    issues on parallel launches, MPI runs notably.
+  * Lots of build fixes for icc on Windows.
+
+StarPU 1.1.2 (svn revision 13011)
 ==============================================
 The scheduling context release
 

+ 9 - 1
Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2013  Université de Bordeaux 1
+# Copyright (C) 2009-2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -97,6 +97,13 @@ if BUILD_STARPU_TOP
 starpu-top/starpu_top$(EXEEXT): all-local
 all-local:
 	cd starpu-top ; $(QMAKE) ; $(MAKE)
+if STARPU_DEVEL
+	@if grep -r sys/time.h $$( find $(srcdir)/src $(srcdir)/mpi/src $(srcdir)/include -name \*.[ch] -a \! -name starpu_util.h ) ; \
+	then \
+		echo "Please do not include sys/time, it is not available on Windows, include starpu_util.h and use starpu_timing_now() instead" ; \
+		false ; \
+	fi
+endif
 clean-local:
 	cd starpu-top ; $(QMAKE) ; $(MAKE) clean ; $(RM) Makefile
 	$(RM) starpu-top/starpu_top.1 starpu-top/starpu_top$(EXEEXT)
@@ -127,6 +134,7 @@ EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION STARPU-REVISION build-au
 
 DISTCLEANFILES = STARPU-REVISION
 
+
 include starpu-top/extradist
 
 showcheck:

+ 15 - 1
configure.ac

@@ -1012,7 +1012,7 @@ if test x$enable_simgrid = xyes ; then
 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
 		]
 	)
-   	AC_CHECK_FUNCS([MSG_process_join MSG_get_as_by_name MSG_environment_get_routing_root])
+   	AC_CHECK_FUNCS([MSG_process_join MSG_get_as_by_name MSG_environment_get_routing_root xbt_mutex_try_acquire])
 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
 		    		[[#include <msg/msg.h>]],
@@ -1945,7 +1945,9 @@ if test "x$STARPU_DEVEL" != x; then
 	IS_SUPPORTED_CFLAG(-Wunused)
 	IS_SUPPORTED_CFLAG(-Wundef)
 	IS_SUPPORTED_CFLAG(-Wshadow)
+	IS_SUPPORTED_CFLAG(-Werror=pointer-arith)
 fi
+AM_CONDITIONAL([STARPU_DEVEL],[test "x$STARPU_DEVEL" != x])
 
 AC_SUBST(GLOBAL_AM_CFLAGS)
 
@@ -2489,6 +2491,18 @@ AC_SUBST(SOCL_VENDORS)
 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
 AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h gcc-plugin/include/starpu-gcc/config.h starpu-top/config.h)
 
+AH_BOTTOM([
+#if defined(STARPU_DEVEL) && defined(BUILDING_STARPU)
+#  ifndef STARPU_CHECKED_UNISTD_H
+#    define STARPU_CHECKED_UNISTD_H
+#    ifdef _UNISTD_H
+#      define _UNISTD_H PLEASE_DONT_INCLUDE_IT
+#      error Please do not unconditionally include unistd.h, it is not available on Windows, include config.h and test for HAVE_UNISTD_H
+#    endif
+#  endif
+#endif
+])
+
 AC_OUTPUT([
 	Makefile
 	src/Makefile

+ 20 - 0
doc/doxygen/chapters/40environment_variables.doxy

@@ -51,6 +51,16 @@ Specify the number of workers per CUDA device, and thus the number of kernels
 which will be concurrently running on the devices. The default value is 1.
 </dd>
 
+<dt>STARPU_CUDA_PIPELINE</dt>
+<dd>
+\anchor STARPU_CUDA_PIPELINE
+\addindex __env__STARPU_CUDA_PIPELINE
+Specify how many asynchronous tasks are submitted in advance on CUDA
+devices. This for instance permits to overlap task management with the execution
+of previous tasks, but it also allows concurrent execution on Fermi cards, which
+otherwise bring spurious synchronizations. The default is 2.
+</dd>
+
 <dt>STARPU_NOPENCL</dt>
 <dd>
 \anchor STARPU_NOPENCL
@@ -58,6 +68,16 @@ which will be concurrently running on the devices. The default value is 1.
 OpenCL equivalent of the environment variable \ref STARPU_NCUDA.
 </dd>
 
+<dt>STARPU_OPENCL_PIPELINE</dt>
+<dd>
+\anchor STARPU_OPENCL_PIPELINE
+\addindex __env__STARPU_OPENCL_PIPELINE
+Specify how many asynchronous tasks are submitted in advance on OpenCL
+devices. This for instance permits to overlap task management with the execution
+of previous tasks, but it also allows concurrent execution on Fermi cards, which
+otherwise bring spurious synchronizations. The default is 2.
+</dd>
+
 <dt>STARPU_NMICDEVS</dt>
 <dd>
 \anchor STARPU_NMICDEVS

+ 4 - 0
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -55,6 +55,10 @@ provided, it will be used by default if no more specific method is
 provided. It can still be useful to provide more specific method in
 case of e.g. available particular CUDA or OpenCL support.
 \ingroup API_Data_Interfaces
+\var starpu_data_copy_methods::can_copy
+If defined, allows the interface to declare whether it supports transferring
+from \p src_interface on node \p src_node to \p dst_interface on node \p. If not
+defined, it is assumed that the interface supports all transfers.
 \var starpu_data_copy_methods::ram_to_ram
 Define how to copy data from the \p src_interface interface on the \p
 src_node CPU node to the \p dst_interface interface on the \p dst_node

+ 7 - 5
examples/interface/complex_interface.c

@@ -128,9 +128,10 @@ static int complex_pack_data(starpu_data_handle_t handle, unsigned node, void **
 	*count = complex_get_size(handle);
 	if (ptr != NULL)
 	{
-		*ptr = malloc(*count);
-		memcpy(*ptr, complex_interface->real, complex_interface->nx*sizeof(double));
-		memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary, complex_interface->nx*sizeof(double));
+		char *data;
+		data = *ptr = malloc(*count);
+		memcpy(data, complex_interface->real, complex_interface->nx*sizeof(double));
+		memcpy(data+complex_interface->nx*sizeof(double), complex_interface->imaginary, complex_interface->nx*sizeof(double));
 	}
 
 	return 0;
@@ -138,13 +139,14 @@ static int complex_pack_data(starpu_data_handle_t handle, unsigned node, void **
 
 static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
 {
+	char *data = ptr;
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
 	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
 		starpu_data_get_interface_on_node(handle, node);
 
-	memcpy(complex_interface->real, ptr, complex_interface->nx*sizeof(double));
-	memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));
+	memcpy(complex_interface->real, data, complex_interface->nx*sizeof(double));
+	memcpy(complex_interface->imaginary, data+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));
 
 	return 0;
 }

+ 5 - 2
examples/pi/SobolQRNG/sobol_gold.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -55,7 +55,6 @@
 #include <stdlib.h>
 #include <math.h>
 #include <string.h>
-#include <strings.h>
 
 #include "sobol.h"
 #include "sobol_gold.h"
@@ -63,6 +62,10 @@
 
 #define k_2powneg32 2.3283064E-10F
 
+#if defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MINGW32__)
+#define ffs(arg) _bit_scan_forward(arg)
+#endif
+
 /* Create the direction numbers, based on the primitive polynomials. */
 void initSobolDirectionVectors(int n_dimensions, unsigned int *directions)
 {

+ 1 - 1
examples/pi/SobolQRNG/sobol_primitives.c

@@ -66,7 +66,7 @@
 const struct primitive sobol_primitives[] =
 {
     /* First dimension is a special case so this entry is actually ignored */
-    {1, 0, 0, {}},
+    {1, 0, 0 },
     {2, 1, 0, {1}},
     {3, 2, 1, {1, 3}},
     {4, 3, 1, {1, 3, 1}},

+ 2 - 2
examples/sched_ctx/sched_ctx_without_sched_policy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -18,7 +18,7 @@
 #include <starpu.h>
 #include <omp.h>
 
-#ifdef STARPU_QUICK_CHECK
+#ifndef STARPU_QUICK_CHECK
 #define NTASKS 64
 #else
 #define NTASKS 10

+ 2 - 0
include/pthread_win32/pthread.h

@@ -254,7 +254,9 @@ typedef pthread_mutex_t pthread_rwlock_t;
 typedef int pthread_rwlockattr_t;
 #define pthread_rwlock_init(lock, attr) pthread_mutex_init(lock, NULL)
 #define pthread_rwlock_wrlock(lock) pthread_mutex_lock(lock)
+#define pthread_rwlock_trywrlock(lock) pthread_mutex_trylock(lock)
 #define pthread_rwlock_rdlock(lock) pthread_mutex_lock(lock)
+#define pthread_rwlock_tryrdlock(lock) pthread_mutex_trylock(lock)
 #define pthread_rwlock_unlock(lock) pthread_mutex_unlock(lock)
 #define pthread_rwlock_destroy(lock) pthread_mutex_destroy(lock)
 

+ 0 - 8
include/starpu_config.h.in

@@ -109,14 +109,6 @@ typedef ssize_t starpu_ssize_t;
 #  define __starpu_inline __inline__
 #endif
 
-#ifdef _MSC_VER
-struct timespec
-{
-  time_t  tv_sec;  /* Seconds */
-  long    tv_nsec; /* Nanoseconds */
-};
-#endif
-
 #undef STARPU_QUICK_CHECK
 #undef STARPU_USE_DRAND48
 #undef STARPU_USE_ERAND48_R

+ 2 - 0
include/starpu_data_interfaces.h

@@ -37,6 +37,8 @@ extern "C"
 
 struct starpu_data_copy_methods
 {
+	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+
 	int (*ram_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_opencl)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);

+ 4 - 1
include/starpu_profiling.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -21,6 +21,7 @@
 #include <starpu.h>
 #include <errno.h>
 #include <time.h>
+#include <starpu_util.h>
 
 #ifdef __cplusplus
 extern "C"
@@ -87,6 +88,7 @@ int starpu_profiling_status_get(void);
 
 #ifdef BUILDING_STARPU
 #include <common/utils.h>
+#ifdef __GNUC__
 extern int _starpu_profiling;
 #define starpu_profiling_status_get() ({ \
 	int __ret; \
@@ -96,6 +98,7 @@ extern int _starpu_profiling;
 	__ret; \
 })
 #endif
+#endif
 
 int starpu_profiling_worker_get_info(int workerid, struct starpu_profiling_worker_info *worker_info);
 

+ 2 - 2
include/starpu_task.h

@@ -166,12 +166,12 @@ struct starpu_task
 	unsigned destroy:1;
 	unsigned regenerate:1;
 
+	unsigned workerid;
+
 	unsigned scheduled:1;
 
 	unsigned int mf_skip:1;
 
-	unsigned workerid;
-
 	int priority;
 
 	enum starpu_task_status status;

+ 8 - 8
include/starpu_thread.h

@@ -28,7 +28,7 @@ extern "C"
 #ifdef STARPU_SIMGRID
 #include <xbt/synchro_core.h>
 #include <msg/msg.h>
-#elif !defined(_MSC_VER)
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU)
 #include <pthread.h>
 #endif
 #include <stdint.h>
@@ -50,7 +50,7 @@ int starpu_pthread_attr_init(starpu_pthread_attr_t *attr);
 int starpu_pthread_attr_destroy(starpu_pthread_attr_t *attr);
 int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachstate);
 
-#elif !defined(_MSC_VER) /* STARPU_SIMGRID */
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* STARPU_SIMGRID */
 
 typedef pthread_t starpu_pthread_t;
 typedef pthread_attr_t starpu_pthread_attr_t;
@@ -85,7 +85,7 @@ int starpu_pthread_mutexattr_settype(starpu_pthread_mutexattr_t *attr, int type)
 int starpu_pthread_mutexattr_destroy(starpu_pthread_mutexattr_t *attr);
 int starpu_pthread_mutexattr_init(starpu_pthread_mutexattr_t *attr);
 
-#elif !defined(_MSC_VER) /* !STARPU_SIMGRID */
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
 
 typedef pthread_mutex_t starpu_pthread_mutex_t;
 typedef pthread_mutexattr_t starpu_pthread_mutexattr_t;
@@ -116,7 +116,7 @@ int starpu_pthread_key_delete(starpu_pthread_key_t key);
 int starpu_pthread_setspecific(starpu_pthread_key_t key, const void *pointer);
 void *starpu_pthread_getspecific(starpu_pthread_key_t key);
 
-#elif !defined(_MSC_VER) /* STARPU_SIMGRID */
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
 
 typedef pthread_key_t starpu_pthread_key_t;
 
@@ -144,7 +144,7 @@ int starpu_pthread_cond_wait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t
 int starpu_pthread_cond_timedwait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex, const struct timespec *abstime);
 int starpu_pthread_cond_destroy(starpu_pthread_cond_t *cond);
 
-#elif !defined(_MSC_VER) /* STARPU_SIMGRID */
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
 
 typedef pthread_cond_t starpu_pthread_cond_t;
 typedef pthread_condattr_t starpu_pthread_condattr_t;
@@ -178,7 +178,7 @@ int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock);
 
-#elif !defined(_MSC_VER) /* STARPU_SIMGRID */
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
 
 typedef pthread_rwlock_t starpu_pthread_rwlock_t;
 typedef pthread_rwlockattr_t starpu_pthread_rwlockattr_t;
@@ -198,7 +198,7 @@ int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock);
  * Encapsulation of the pthread_barrier_* functions.
  */
 
-#if defined(STARPU_SIMGRID) || !defined(STARPU_HAVE_PTHREAD_BARRIER)
+#if defined(STARPU_SIMGRID) || (!defined(STARPU_HAVE_PTHREAD_BARRIER) && (!defined(_MSC_VER) || defined(BUILDING_STARPU)))
 
 #if defined(STARPU_SIMGRID) && defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)
 typedef xbt_bar_t starpu_pthread_barrier_t;
@@ -270,7 +270,7 @@ typedef pthread_spinlock_t starpu_pthread_spinlock_t;
  * Other needed pthread definitions
  */
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(BUILDING_STARPU)
 typedef void* starpu_pthread_rwlock_t;
 typedef void* starpu_pthread_mutex_t;
 typedef void* starpu_pthread_cond_t;

+ 20 - 0
include/starpu_util.h

@@ -273,6 +273,26 @@ int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_ha
 
 double starpu_timing_now(void);
 
+#ifdef _WIN32
+/* Try to fetch the system definition of timespec */
+#include <time.h>
+#if !defined(_MSC_VER) || defined(BUILDING_STARPU)
+#include <pthread.h>
+#endif
+#ifndef _TIMESPEC_DEFINED
+/* If it didn't get defined in the standard places, then define it ourself */
+#ifndef STARPU_TIMESPEC_DEFINED
+#define STARPU_TIMESPEC_DEFINED 1
+struct timespec {
+  time_t  tv_sec;  /* Seconds */
+  long    tv_nsec; /* Nanoseconds */
+};
+#endif /* STARPU_TIMESPEC_DEFINED */
+#endif
+#else
+#include <sys/time.h>
+#endif /* _WIN32 */
+
 #ifdef __cplusplus
 }
 #endif

+ 2 - 2
mpi/examples/mpi_lu/plu_outofcore_example.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -128,7 +128,7 @@ static void create_matrix()
 
 	filename = malloc(filename_length);
 
-	allocated_memory += nblocks*nblocks*blocksize*sizeof(TYPE *);
+	allocated_memory += nblocks*nblocks*blocksize;
 
 	/* Create the whole matrix on the disk */
 	unsigned i,j;

+ 1 - 1
mpi/src/starpu_mpi.c

@@ -1348,7 +1348,7 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 	argc_argv->argv = argv;
 
 #ifdef STARPU_MPI_ACTIVITY
-	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
+	hookid = starpu_progression_hook_register(_starpu_mpi_progression_hook_func, NULL);
 	STARPU_ASSERT_MSG(hookid >= 0, "starpu_progression_hook_register failed");
 #endif /* STARPU_MPI_ACTIVITY */
 

+ 4 - 4
mpi/src/starpu_mpi_task_insert.c

@@ -422,11 +422,11 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 		}
 		else if (arg_type==STARPU_CALLBACK)
 		{
-			va_arg(varg_list_copy, void (*)(void *));
+			va_arg(varg_list_copy, _starpu_callback_func_t);
 		}
 		else if (arg_type==STARPU_CALLBACK_WITH_ARG)
 		{
-			va_arg(varg_list_copy, void (*)(void *));
+			va_arg(varg_list_copy, _starpu_callback_func_t);
 			va_arg(varg_list_copy, void *);
 		}
 		else if (arg_type==STARPU_CALLBACK_ARG)
@@ -566,11 +566,11 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, struct starpu_codelet *codelet,
 		}
 		else if (arg_type==STARPU_CALLBACK)
 		{
-			va_arg(varg_list_copy, void (*)(void *));
+			va_arg(varg_list_copy, _starpu_callback_func_t);
 		}
 		else if (arg_type==STARPU_CALLBACK_WITH_ARG)
 		{
-			va_arg(varg_list_copy, void (*)(void *));
+			va_arg(varg_list_copy, _starpu_callback_func_t);
 			va_arg(varg_list_copy, void *);
 		}
 		else if (arg_type==STARPU_CALLBACK_ARG)

+ 3 - 3
socl/src/cl_enqueuecopybuffer.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010,2011 University of Bordeaux
+ * Copyright (C) 2010,2011, 2014 University of Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -46,8 +46,8 @@ static void soclEnqueueCopyBuffer_cpu_task(void *descr[], void *args) {
   ev->prof_start = _socl_nanotime();
   gc_entity_release(ev);
 
-   void * src = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
-   void * dst = (void*)STARPU_VARIABLE_GET_PTR(descr[1]);
+   char * src = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
+   char * dst = (void*)STARPU_VARIABLE_GET_PTR(descr[1]);
 
    memcpy(dst+cmd->dst_offset, src+cmd->src_offset, cmd->cb);
 

+ 2 - 2
socl/src/cl_enqueuereadbuffer.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010,2011 University of Bordeaux
+ * Copyright (C) 2010,2011, 2014 University of Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,7 +23,7 @@ static void soclEnqueueReadBuffer_cpu_task(void *descr[], void *args) {
   ev->prof_start = _socl_nanotime();
   gc_entity_release(ev);
 
-   void * ptr = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
+   char * ptr = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
    DEBUG_MSG("[Buffer %d] Reading %ld bytes from %p to %p\n", cmd->buffer->id, cmd->cb, ptr+cmd->offset, cmd->ptr);
 
    //This fix is for people who use USE_HOST_PTR and still use ReadBuffer to sync the buffer in host mem at host_ptr.

+ 2 - 2
socl/src/cl_enqueuewritebuffer.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010,2011 University of Bordeaux
+ * Copyright (C) 2010,2011, 2014 University of Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,7 @@ static void soclEnqueueWriteBuffer_cpu_task(void *descr[], void *args) {
   ev->prof_start = _socl_nanotime();
   gc_entity_release(ev);
 
-   void * ptr = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
+   char * ptr = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
    DEBUG_MSG("[Buffer %d] Writing %ld bytes from %p to %p\n", cmd->buffer->id, cmd->cb, cmd->ptr, ptr+cmd->offset);
 
    //FIXME: Fix for people who use USE_HOST_PTR, modify data at host_ptr and use WriteBuffer to commit the change.

+ 5 - 1
src/common/fxt.c

@@ -52,6 +52,10 @@ uint64_t fut_getstamp(void)
 
 long _starpu_gettid(void)
 {
+	/* TODO: test at configure whether __thread is available, and use that
+	 * to cache the value.
+	 * Don't use the TSD, this is getting called before we would have the
+	 * time to allocate it.  */
 #ifdef STARPU_SIMGRID
 	return (uintptr_t) MSG_process_self();
 #else
@@ -61,7 +65,7 @@ long _starpu_gettid(void)
 	long tid;
 	thr_self(&tid);
 	return tid;
-#elif defined(__MINGW32__)
+#elif defined(_WIN32) && !defined(__CYGWIN__)
 	return (long) GetCurrentThreadId();
 #else
 	return (long) pthread_self();

+ 3 - 2
src/common/fxt.h

@@ -23,12 +23,13 @@
 #define _GNU_SOURCE  /* ou _BSD_SOURCE ou _SVID_SOURCE */
 #endif
 
-#include <unistd.h>
-
 #include <string.h>
 #include <sys/types.h>
 #include <stdlib.h>
 #include <common/config.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
 #include <common/utils.h>
 #include <starpu.h>
 

+ 8 - 2
src/common/thread.c

@@ -131,13 +131,19 @@ int starpu_pthread_mutex_unlock(starpu_pthread_mutex_t *mutex)
 
 int starpu_pthread_mutex_trylock(starpu_pthread_mutex_t *mutex)
 {
+	int ret;
 	_STARPU_TRACE_TRYLOCK_MUTEX();
 
-	xbt_mutex_acquire(*mutex);
+#ifdef HAVE_XBT_MUTEX_TRY_ACQUIRE
+	ret = xbt_mutex_try_acquire(*mutex);
+#else
+	ret = simcall_mutex_trylock((smx_mutex_t)*mutex);
+#endif
+	ret = ret ? 0 : EBUSY;
 
 	_STARPU_TRACE_MUTEX_LOCKED();
 
-	return 0;
+	return ret;
 }
 
 int starpu_pthread_mutexattr_gettype(const starpu_pthread_mutexattr_t *attr, int *type)

+ 34 - 5
src/common/timing.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -15,9 +15,9 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <sys/time.h>
 #include <starpu.h>
 #include <common/config.h>
+#include <starpu_util.h>
 #include <profiling/profiling.h>
 #include <common/timing.h>
 #include <math.h>
@@ -26,6 +26,10 @@
 #include <msg/msg.h>
 #endif
 
+#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__CYGWIN__)
+#include <windows.h>
+#endif
+
 #ifdef STARPU_SIMGRID
 void _starpu_timing_init(void)
 {
@@ -125,6 +129,30 @@ static unsigned long long _starpu_residual = 0;
 
 static int _starpu_inited = 0;
 
+#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__CYGWIN__)
+static int mygettimeofday(struct timeval tv, void *tz)
+{
+	if (tv)
+	{
+		FILETIME ft;
+		unsigned long long res;
+		GetSystemTimeAsFileTime(&ft);
+		/* 100-nanosecond intervals since January 1, 1601 */
+		res = ft.dwHighDateTime;
+		res <<= 32;
+		res |= ft.dwLowDateTime;
+		res /= 10;
+		/* Now we have microseconds */
+		res -= (((1970-1601)*365) + 89) * 24ULL * 3600ULL * 1000000ULL;
+		/* Now we are based on epoch */
+		tv->tv_sec = res / 1000000ULL;
+		tv->tv_usec = res % 1000000ULL;
+	}
+}
+#else
+#define mygettimeofday(tv,tz) gettimeofday(tv,tz)
+#endif
+
 void _starpu_timing_init(void)
 {
 	static union starpu_u_tick t1, t2;
@@ -143,12 +171,13 @@ void _starpu_timing_init(void)
 
 	{
 		struct timeval tv1,tv2;
+		struct timespec ts = { .tv_sec = 0, .tv_nsec = 500000000UL };
 
 		STARPU_GET_TICK(t1);
-		gettimeofday(&tv1,0);
-		usleep(500000);
+		mygettimeofday(&tv1,0);
+		_starpu_sleep(ts);
 		STARPU_GET_TICK(t2);
-		gettimeofday(&tv2,0);
+		mygettimeofday(&tv2,0);
 		_starpu_scale = ((tv2.tv_sec*1e6 + tv2.tv_usec) -
 				 (tv1.tv_sec*1e6 + tv1.tv_usec)) /
 			(double)(STARPU_TICK_DIFF(t1, t2));

+ 5 - 3
src/common/timing.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,11 +23,13 @@
  * functions.
  */
 
-#include <sys/time.h>
-#include <unistd.h>
 #include <stdint.h>
 #include <common/config.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
 #include <starpu.h>
+#include <starpu_util.h>
 
 void _starpu_timing_init(void);
 void _starpu_clock_gettime(struct timespec *ts);

+ 92 - 3
src/common/utils.c

@@ -18,13 +18,36 @@
 #include <starpu.h>
 #include <common/config.h>
 #include <common/utils.h>
-#include <libgen.h>
 #include <errno.h>
+#ifdef HAVE_UNISTD_H
 #include <unistd.h>
+#endif
+#include <fcntl.h>
 
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 #include <io.h>
+#include <sys/locking.h>
 #define mkdir(path, mode) mkdir(path)
+#if !defined(__MINGW32__)
+#define ftruncate(fd, length) _chsize(fd, length)
+#endif
+#endif
+
+#if defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MINGW32__)
+#include <direct.h>
+static char * dirname(char * path)
+{
+   char drive[_MAX_DRIVE];
+   char dir[_MAX_DIR];
+   /* Remove trailing slash */
+   while (strlen(path) > 0 && (*(path+strlen(path)-1) == '/' || *(path+strlen(path)-1) == '\\'))
+      *(path+strlen(path)-1) = '\0';
+   _splitpath(path, drive, dir, NULL, NULL);
+   _makepath(path, drive, dir, NULL, NULL);
+   return path;
+}
+#else
+#include <libgen.h>
 #endif
 
 /* Function with behaviour like `mkdir -p'. This function was adapted from
@@ -38,7 +61,7 @@ int _starpu_mkpath(const char *s, mode_t mode)
 
 	rv = -1;
 	if (strcmp(s, ".") == 0 || strcmp(s, "/") == 0
-#ifdef __MINGW32__
+#if defined(_WIN32)
 		/* C:/ or C:\ */
 		|| (s[0] && s[1] == ':' && (s[2] == '/' || s[2] == '\\') && !s[3])
 #endif
@@ -102,6 +125,72 @@ void _starpu_mkpath_and_check(const char *path, mode_t mode)
 	}
 }
 
+int _starpu_ftruncate(FILE *file)
+{
+	return ftruncate(fileno(file), 0);
+}
+
+int _starpu_frdlock(FILE *file)
+{
+#if defined(_WIN32) && !defined(__CYGWIN__)
+	int ret;
+	do {
+		ret = _locking(fileno(file), _LK_RLCK, 10);
+	} while (ret == EDEADLOCK);
+	return ret;
+#else
+	struct flock lock = {
+		.l_type = F_RDLCK,
+		.l_whence = SEEK_SET,
+		.l_start = 0,
+		.l_len = 0
+	};
+	return fcntl(fileno(file), F_SETLKW, &lock);
+#endif
+}
+
+int _starpu_frdunlock(FILE *file)
+{
+#if defined(_WIN32) && !defined(__CYGWIN__)
+#  ifndef _LK_UNLCK
+#    define _LK_UNLCK _LK_UNLOCK
+#  endif
+	return _locking(fileno(file), _LK_UNLCK, 10);
+#else
+	struct flock lock = {
+		.l_type = F_UNLCK,
+		.l_whence = SEEK_SET,
+		.l_start = 0,
+		.l_len = 0
+	};
+	return fcntl(fileno(file), F_SETLKW, &lock);
+#endif
+}
+
+int _starpu_fwrlock(FILE *file)
+{
+#if defined(_WIN32) && !defined(__CYGWIN__)
+	int ret;
+	do {
+		ret = _locking(fileno(file), _LK_LOCK, 10);
+	} while (ret == EDEADLOCK);
+	return ret;
+#else
+	struct flock lock = {
+		.l_type = F_WRLCK,
+		.l_whence = SEEK_SET,
+		.l_start = 0,
+		.l_len = 0
+	};
+	return fcntl(fileno(file), F_SETLKW, &lock);
+#endif
+}
+
+int _starpu_fwrunlock(FILE *file)
+{
+	return _starpu_frdunlock(file);
+}
+
 int _starpu_check_mutex_deadlock(starpu_pthread_mutex_t *mutex)
 {
 	int ret;

+ 9 - 0
src/common/utils.h

@@ -101,10 +101,19 @@
 	} while (0)
 
 
+#ifdef _MSC_VER
+#define _STARPU_IS_ZERO(a) (a == 0.0)
+#else
 #define _STARPU_IS_ZERO(a) (fpclassify(a) == FP_ZERO)
+#endif
 
 int _starpu_mkpath(const char *s, mode_t mode);
 void _starpu_mkpath_and_check(const char *s, mode_t mode);
+int _starpu_ftruncate(FILE *file);
+int _starpu_frdlock(FILE *file);
+int _starpu_frdunlock(FILE *file);
+int _starpu_fwrlock(FILE *file);
+int _starpu_fwrunlock(FILE *file);
 char *_starpu_get_home_path(void);
 void _starpu_gethostname(char *hostname, size_t size);
 

+ 1 - 1
src/core/combined_workers.c

@@ -25,7 +25,7 @@
 #include <sched.h>
 #endif
 
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 #include <windows.h>
 #endif
 

+ 8 - 8
src/core/disk_ops/disk_leveldb.cpp

@@ -266,8 +266,8 @@ get_leveldb_bandwidth_between_disk_and_main_ram(unsigned node)
 
 	unsigned iter;
 	double timing_slowness, timing_latency;
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 	
 	srand (time (NULL)); 
 	char * buf = (char *) malloc(SIZE_DISK_MIN*sizeof(char));
@@ -281,13 +281,13 @@ get_leveldb_bandwidth_between_disk_and_main_ram(unsigned node)
 	struct starpu_leveldb_obj * tmp = (struct starpu_leveldb_obj *) mem;
 
 	/* Measure upload slowness */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; ++iter)
 	{
 		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, 0, SIZE_DISK_MIN, NULL);
 	}
-	gettimeofday(&end, NULL);
-	timing_slowness = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing_slowness = end - start;
 
 
 	/* free memory */
@@ -297,13 +297,13 @@ get_leveldb_bandwidth_between_disk_and_main_ram(unsigned node)
 	STARPU_ASSERT(buf != NULL);
 
 	/* Measure latency */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; ++iter)
 	{
 		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, rand() % (SIZE_DISK_MIN -1) , 1, NULL);
 	}
-	gettimeofday(&end, NULL);
-	timing_latency = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing_latency = end - start;
 
 	_starpu_disk_free(node, mem, SIZE_DISK_MIN);
 	free(buf);

+ 8 - 9
src/core/disk_ops/disk_stdio.c

@@ -18,7 +18,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/stat.h>
-#include <sys/time.h>
 #include <errno.h>
 #include <time.h>
 
@@ -269,8 +268,8 @@ static int get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 {
 	unsigned iter;
 	double timing_slowness, timing_latency;
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	srand (time (NULL));
 	char * buf = malloc(SIZE_DISK_MIN);
@@ -286,7 +285,7 @@ static int get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 	memset(buf, 0, SIZE_DISK_MIN);
 
 	/* Measure upload slowness */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; ++iter)
 	{
 		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, 0, SIZE_DISK_MIN, NULL);
@@ -301,8 +300,8 @@ static int get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 #endif
 		STARPU_ASSERT_MSG(res == 0, "Slowness computation failed \n");
 	}
-	gettimeofday(&end, NULL);
-	timing_slowness = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing_slowness = end - start;
 
 
 	/* free memory */
@@ -314,7 +313,7 @@ static int get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 	*buf = 0;
 
 	/* Measure latency */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; ++iter)
 	{
 		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, rand() % (SIZE_DISK_MIN -1) , 1, NULL);
@@ -329,8 +328,8 @@ static int get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 #endif
 		STARPU_ASSERT_MSG(res == 0, "Latency computation failed");
 	}
-	gettimeofday(&end, NULL);
-	timing_latency = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing_latency = end - start;
 
 	_starpu_disk_free(node, mem, SIZE_DISK_MIN);
 	free(buf);

+ 4 - 2
src/core/disk_ops/disk_unistd.c

@@ -15,12 +15,14 @@
  */
 
 #include <fcntl.h>
-#include <unistd.h>
 #include <stdlib.h>
 #include <sys/stat.h>
-#include <sys/time.h>
 #include <stdint.h>
 
+#include <common/config.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
 #include <starpu.h>
 #include <core/disk.h>
 #include <core/perfmodel/perfmodel.h>

+ 4 - 2
src/core/disk_ops/disk_unistd_o_direct.c

@@ -15,12 +15,14 @@
  */
 
 #include <fcntl.h>
-#include <unistd.h>
 #include <stdlib.h>
 #include <sys/stat.h>
-#include <sys/time.h>
 #include <stdint.h>
 
+#include <common/config.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
 #include <starpu.h>
 #include <core/disk.h>
 #include <core/perfmodel/perfmodel.h>

+ 12 - 10
src/core/disk_ops/unistd/disk_unistd_global.c

@@ -15,16 +15,18 @@
  */
 
 #include <fcntl.h>
-#include <unistd.h>
 #include <stdlib.h>
 #include <sys/stat.h>
-#include <sys/time.h>
 #include <stdint.h>
 #ifdef HAVE_AIO_H
 #include <aio.h>
 #endif
 #include <errno.h>
 
+#include <common/config.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
 #include <starpu.h>
 #include <core/disk.h>
 #include <core/perfmodel/perfmodel.h>
@@ -315,8 +317,8 @@ get_unistd_global_bandwidth_between_disk_and_main_ram(unsigned node)
 	int res;
 	unsigned iter;
 	double timing_slowness, timing_latency;
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	srand (time (NULL)); 
 	char * buf;
@@ -332,7 +334,7 @@ get_unistd_global_bandwidth_between_disk_and_main_ram(unsigned node)
 	struct starpu_unistd_global_obj * tmp = (struct starpu_unistd_global_obj *) mem;
 
 	/* Measure upload slowness */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; ++iter)
 	{
 		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, 0, SIZE_DISK_MIN, NULL);
@@ -345,8 +347,8 @@ get_unistd_global_bandwidth_between_disk_and_main_ram(unsigned node)
 
 		STARPU_ASSERT_MSG(res == 0, "bandwidth computation failed");
 	}
-	gettimeofday(&end, NULL);
-	timing_slowness = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing_slowness = end - start;
 
 
 	/* free memory */
@@ -356,7 +358,7 @@ get_unistd_global_bandwidth_between_disk_and_main_ram(unsigned node)
 	STARPU_ASSERT(buf != NULL);
 
 	/* Measure latency */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; ++iter)
 	{
 		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, rand() % (SIZE_DISK_MIN -1) , MEM_SIZE, NULL);
@@ -369,8 +371,8 @@ get_unistd_global_bandwidth_between_disk_and_main_ram(unsigned node)
 
 		STARPU_ASSERT_MSG(res == 0, "Latency computation failed");
 	}
-	gettimeofday(&end, NULL);
-	timing_latency = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing_latency = end - start;
 
 	_starpu_disk_free(node, mem, SIZE_DISK_MIN);
 	starpu_free(buf);

+ 3 - 1
src/core/jobs.h

@@ -24,10 +24,12 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
-#include <unistd.h>
 #include <string.h>
 #include <stdarg.h>
 #include <common/config.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
 #include <common/timing.h>
 #include <common/list.h>
 #include <common/fxt.h>

+ 2 - 0
src/core/perfmodel/perfmodel.c

@@ -20,7 +20,9 @@
 #include <starpu_profiling.h>
 #include <common/config.h>
 #include <common/utils.h>
+#ifdef HAVE_UNISTD_H
 #include <unistd.h>
+#endif
 #include <sys/stat.h>
 #include <core/perfmodel/perfmodel.h>
 #include <core/jobs.h>

+ 65 - 40
src/core/perfmodel/perfmodel_bus.c

@@ -22,8 +22,6 @@
 #endif
 #include <sched.h>
 #endif
-#include <unistd.h>
-#include <sys/time.h>
 #include <stdlib.h>
 #include <math.h>
 
@@ -31,6 +29,9 @@
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <common/config.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
 #include <core/workers.h>
 #include <core/perfmodel/perfmodel.h>
 #include <core/simgrid.h>
@@ -165,54 +166,54 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
 	unsigned iter;
 	double timing;
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	/* Measure upload bandwidth */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; iter++)
 	{
 		cudaMemcpy(d_buffer, h_buffer, size, cudaMemcpyHostToDevice);
 		cudaThreadSynchronize();
 	}
-	gettimeofday(&end, NULL);
-	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing = end - start;
 
 	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod = timing/NITER/size;
 
 	/* Measure download bandwidth */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; iter++)
 	{
 		cudaMemcpy(h_buffer, d_buffer, size, cudaMemcpyDeviceToHost);
 		cudaThreadSynchronize();
 	}
-	gettimeofday(&end, NULL);
-	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing = end - start;
 
 	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
 
 	/* Measure upload latency */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; iter++)
 	{
 		cudaMemcpy(d_buffer, h_buffer, 1, cudaMemcpyHostToDevice);
 		cudaThreadSynchronize();
 	}
-	gettimeofday(&end, NULL);
-	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing = end - start;
 
 	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_htod = timing/NITER;
 
 	/* Measure download latency */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; iter++)
 	{
 		cudaMemcpy(h_buffer, d_buffer, 1, cudaMemcpyDeviceToHost);
 		cudaThreadSynchronize();
 	}
-	gettimeofday(&end, NULL);
-	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing = end - start;
 
 	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_dtoh = timing/NITER;
 
@@ -286,30 +287,30 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
 	unsigned iter;
 	double timing;
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	/* Measure upload bandwidth */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; iter++)
 	{
 		cudaMemcpyPeer(d_buffer, dst, s_buffer, src, size);
 		cudaThreadSynchronize();
 	}
-	gettimeofday(&end, NULL);
-	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing = end - start;
 
 	cudadev_timing_dtod[src+1][dst+1] = timing/NITER/size;
 
 	/* Measure upload latency */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; iter++)
 	{
 		cudaMemcpyPeer(d_buffer, dst, s_buffer, src, 1);
 		cudaThreadSynchronize();
 	}
-	gettimeofday(&end, NULL);
-	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing = end - start;
 
 	cudadev_latency_dtod[src+1][dst+1] = timing/NITER;
 
@@ -391,58 +392,58 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 
         unsigned iter;
 	double timing;
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	/* Measure upload bandwidth */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; iter++)
 	{
                 err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
                 if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
                 clFinish(queue);
 	}
-	gettimeofday(&end, NULL);
-	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing = end - start;
 
 	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod = timing/NITER/size;
 
 	/* Measure download bandwidth */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; iter++)
 	{
                 err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
                 if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
                 clFinish(queue);
 	}
-	gettimeofday(&end, NULL);
-	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing = end - start;
 
 	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
 
 	/* Measure upload latency */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; iter++)
 	{
 		err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, 1, h_buffer, 0, NULL, NULL);
                 if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
                 clFinish(queue);
 	}
-	gettimeofday(&end, NULL);
-	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing = end - start;
 
 	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_htod = timing/NITER;
 
 	/* Measure download latency */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	for (iter = 0; iter < NITER; iter++)
 	{
 		err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, 1, h_buffer, 0, NULL, NULL);
                 if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
                 clFinish(queue);
 	}
-	gettimeofday(&end, NULL);
-	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	timing = end - start;
 
 	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_dtoh = timing/NITER;
 
@@ -783,6 +784,8 @@ static void load_bus_affinity_file_content(void)
 	f = fopen(path, "r");
 	STARPU_ASSERT(f);
 
+	_starpu_frdlock(f);
+
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	ncpus = _starpu_topology_get_nhwcpu(config);
         unsigned gpu;
@@ -835,6 +838,7 @@ static void load_bus_affinity_file_content(void)
 		STARPU_ASSERT(ret == 0);
 	}
 #endif /* !STARPU_USE_OPENCL */
+	_starpu_frdunlock(f);
 
 	fclose(f);
 #endif /* !(STARPU_USE_CUDA_ || STARPU_USE_OPENCL */
@@ -862,6 +866,7 @@ static void write_bus_affinity_file_content(void)
 		STARPU_ABORT();
 	}
 
+	_starpu_frdlock(f);
 	unsigned cpu;
         unsigned gpu;
 
@@ -897,6 +902,7 @@ static void write_bus_affinity_file_content(void)
 	}
 #endif
 
+	_starpu_frdunlock(f);
 	fclose(f);
 #endif
 }
@@ -1006,6 +1012,7 @@ static int load_bus_latency_file_content(void)
 		fflush(stderr);
 		STARPU_ABORT();
 	}
+	_starpu_frdlock(f);
 
 	for (src = 0; src < STARPU_MAXNODES; src++)
 	{
@@ -1073,13 +1080,14 @@ static int load_bus_latency_file_content(void)
 			break;
 		ungetc(n, f);
 	}
+	_starpu_frdunlock(f);
+	fclose(f);
 
 	/* No more values, take NAN */
 	for ( ; src < STARPU_MAXNODES; src++)
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 			latency_matrix[src][dst] = NAN;
 
-	fclose(f);
 	return 1;
 }
 
@@ -1104,6 +1112,8 @@ static void write_bus_latency_file_content(void)
 		fflush(stderr);
 		STARPU_ABORT();
 	}
+	_starpu_fwrlock(f);
+	_starpu_ftruncate(f);
 
 	fprintf(f, "# ");
 	for (dst = 0; dst < STARPU_MAXNODES; dst++)
@@ -1163,6 +1173,7 @@ static void write_bus_latency_file_content(void)
 
 		fprintf(f, "\n");
 	}
+	_starpu_fwrunlock(f);
 
 	fclose(f);
 }
@@ -1223,6 +1234,7 @@ static int load_bus_bandwidth_file_content(void)
 		fflush(stderr);
 		STARPU_ABORT();
 	}
+	_starpu_frdlock(f);
 
 	for (src = 0; src < STARPU_MAXNODES; src++)
 	{
@@ -1290,13 +1302,14 @@ static int load_bus_bandwidth_file_content(void)
 			break;
 		ungetc(n, f);
 	}
+	_starpu_frdunlock(f);
+	fclose(f);
 
 	/* No more values, take NAN */
 	for ( ; src < STARPU_MAXNODES; src++)
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 			latency_matrix[src][dst] = NAN;
 
-	fclose(f);
 	return 1;
 }
 
@@ -1316,6 +1329,9 @@ static void write_bus_bandwidth_file_content(void)
 	f = fopen(path, "w+");
 	STARPU_ASSERT(f);
 
+	_starpu_fwrlock(f);
+	_starpu_ftruncate(f);
+
 	fprintf(f, "# ");
 	for (dst = 0; dst < STARPU_MAXNODES; dst++)
 		fprintf(f, "to %d\t\t", dst);
@@ -1387,6 +1403,7 @@ static void write_bus_bandwidth_file_content(void)
 		fprintf(f, "\n");
 	}
 
+	_starpu_fwrunlock(f);
 	fclose(f);
 }
 #endif /* STARPU_SIMGRID */
@@ -1551,6 +1568,7 @@ static void check_bus_config_file(void)
                 // Loading configuration from file
                 f = fopen(path, "r");
                 STARPU_ASSERT(f);
+		_starpu_frdlock(f);
                 _starpu_drop_comments(f);
                 ret = fscanf(f, "%u\t", &read_cpus);
 		STARPU_ASSERT(ret == 1);
@@ -1565,6 +1583,7 @@ static void check_bus_config_file(void)
 		if (ret == 0)
 			read_mic = 0;
                 _starpu_drop_comments(f);
+		_starpu_frdunlock(f);
                 fclose(f);
 
                 // Loading current configuration
@@ -1619,6 +1638,8 @@ static void write_bus_config_file_content(void)
 
         f = fopen(path, "w+");
 	STARPU_ASSERT(f);
+	_starpu_fwrlock(f);
+	_starpu_ftruncate(f);
 
         fprintf(f, "# Current configuration\n");
         fprintf(f, "%u # Number of CPUs\n", ncpus);
@@ -1626,6 +1647,7 @@ static void write_bus_config_file_content(void)
         fprintf(f, "%d # Number of OpenCL devices\n", nopencl);
         fprintf(f, "%d # Number of MIC devices\n", nmic);
 
+	_starpu_fwrunlock(f);
         fclose(f);
 }
 
@@ -1664,6 +1686,8 @@ static void write_bus_platform_file_content(void)
 		fflush(stderr);
 		STARPU_ABORT();
 	}
+	_starpu_fwrlock(f);
+	_starpu_ftruncate(f);
 
 	fprintf(f,
 "<?xml version='1.0'?>\n"
@@ -1810,6 +1834,7 @@ static void write_bus_platform_file_content(void)
 " </platform>\n"
 		);
 
+	_starpu_fwrunlock(f);
 	fclose(f);
 }
 

+ 22 - 1
src/core/perfmodel/perfmodel_history.c

@@ -16,11 +16,15 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#if !defined(_WIN32) || defined(__MINGW__) || defined(__CYGWIN__)
 #include <dirent.h>
-#include <unistd.h>
 #include <sys/stat.h>
+#endif
 #include <errno.h>
 #include <common/config.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
 #include <common/utils.h>
 #include <core/perfmodel/perfmodel.h>
 #include <core/jobs.h>
@@ -732,7 +736,10 @@ static void save_history_based_model(struct starpu_perfmodel *model)
 	f = fopen(path, "w+");
 	STARPU_ASSERT_MSG(f, "Could not save performance model %s\n", path);
 
+	_starpu_fwrlock(f);
+	_starpu_ftruncate(f);
 	dump_model_file(f, model);
+	_starpu_fwrunlock(f);
 
 	fclose(f);
 }
@@ -937,7 +944,9 @@ void _starpu_load_history_based_model(struct starpu_perfmodel *model, unsigned s
 				f = fopen(path, "r");
 				STARPU_ASSERT(f);
 
+				_starpu_frdlock(f);
 				parse_model_file(f, model, scan_history);
+				_starpu_frdunlock(f);
 
 				fclose(f);
 			}
@@ -966,6 +975,7 @@ void starpu_perfmodel_directory(FILE *output)
  * the performance model files */
 int starpu_perfmodel_list(FILE *output)
 {
+#if !defined(_WIN32) || defined(__MINGW__) || defined(__CYGWIN__)
         char path[256];
         DIR *dp;
         struct dirent *ep;
@@ -989,6 +999,10 @@ int starpu_perfmodel_list(FILE *output)
 		_STARPU_DISP("Could not open the perfmodel directory <%s>: %s\n", path, strerror(errno));
         }
 	return 0;
+#else
+	fprintf(stderr,"Listing perfmodels is not implemented on pure Windows yet\n");
+	return 1;
+#endif
 }
 
 /* This function is intended to be used by external tools that should read the
@@ -1026,8 +1040,13 @@ int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *mo
 
 	FILE *f = fopen(path, "r");
 	STARPU_ASSERT(f);
+
+	_starpu_frdlock(f);
 	starpu_perfmodel_init(NULL, model);
+	rewind(f);
+
 	parse_model_file(f, model, 1);
+	_starpu_frdunlock(f);
 
 	STARPU_ASSERT(fclose(f) == 0);
 
@@ -1401,6 +1420,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			_STARPU_DISP("Error <%s> when opening file <%s>\n", strerror(errno), per_arch_model->debug_path);
 			STARPU_ABORT();
 		}
+		_starpu_fwrlock(f);
 
 		if (!j->footprint_is_computed)
 			(void) _starpu_compute_buffers_footprint(model, arch, impl, j);
@@ -1420,6 +1440,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			handle->ops->display(handle, f);
 		}
 		fprintf(f, "\n");
+		_starpu_fwrunlock(f);
 		fclose(f);
 #endif
 		STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);

+ 8 - 2
src/core/perfmodel/perfmodel_nan.c

@@ -21,7 +21,7 @@
 #include <stdlib.h>
 #include <math.h>
 #include <string.h>
-#include <config.h>
+#include <common/config.h>
 #include <core/perfmodel/perfmodel.h>
 #include <ctype.h>
 
@@ -54,8 +54,14 @@ int _starpu_read_double(FILE *f, char *format, double *val)
 	     int x3 = getc(f);
 	     if (x2 == 'a' && x3 == 'n')
 	     {
+#ifdef _MSC_VER
+		     unsigned long long _mynan = 0x7fffffffffffffffull;
+		     double mynan = *(double*)&_mynan;
+#else
+		     double mynan = NAN;
+#endif
 		     _starpu_read_spaces(f);
-		     *val = NAN;
+		     *val = mynan;
 		     return 1;
 	     }
 	     else

+ 2 - 2
src/core/sched_ctx.c

@@ -868,10 +868,10 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 		_starpu_delete_sched_ctx(sched_ctx);
 	}
 
+	STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[sched_ctx_id]);
 	/* workerids is malloc-ed in starpu_sched_ctx_get_workers_list, don't forget to free it when
 	   you don't use it anymore */
 	free(workerids);
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[sched_ctx_id]);
 	_starpu_relock_mutex_if_prev_locked();
 	return;
 }
@@ -1806,7 +1806,7 @@ void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid STARPU_ATTRIBU
 		STARPU_ABORT();
 	}
 
-#elif defined(__MINGW32__) || defined(__CYGWIN__)
+#elif defined(_WIN32)
 	DWORD mask = 1 << cpuid;
 	if (!SetThreadAffinityMask(GetCurrentThread(), mask))
 	{

+ 6 - 0
src/core/simgrid.c

@@ -16,7 +16,10 @@
 
 #include <starpu.h>
 #include <datawizard/memory_nodes.h>
+#include <common/config.h>
+#ifdef HAVE_UNISTD_H
 #include <unistd.h>
+#endif
 #include <core/perfmodel/perfmodel.h>
 #include <core/workers.h>
 #include <core/simgrid.h>
@@ -217,6 +220,7 @@ void _starpu_simgrid_init()
 		/* Get XML platform */
 		_starpu_simgrid_get_platform_path(path, sizeof(path));
 		in = fopen(path, "r");
+		_starpu_frdlock(in);
 		STARPU_ASSERT_MSG(in, "Could not open platform file %s", path);
 #ifdef HAVE_MKSTEMPS
 		out = mkstemps(template, strlen(".xml"));
@@ -230,6 +234,8 @@ void _starpu_simgrid_init()
 		snprintf(cmdline, sizeof(cmdline), "xsltproc --novalid --stringparam ASname %s -o %s "STARPU_DATADIR"/starpu/starpu_smpi.xslt %s", asname, template, path);
 		ret = system(cmdline);
 		STARPU_ASSERT_MSG(ret == 0, "running xsltproc to generate SMPI platforms %s from %s failed", template, path);
+		_starpu_frdunlock(in);
+		fclose(in);
 
 		/* And create it */
 		MSG_create_environment(template);

+ 4 - 0
src/core/task.c

@@ -1019,7 +1019,11 @@ static void *watchdog_func(void *foo STARPU_ATTRIBUTE_UNUSED)
 	if (! (timeout_env = getenv("STARPU_WATCHDOG_TIMEOUT")))
 		return NULL;
 
+#ifdef _MSC_VER
+	timeout = (unsigned long long) _atoi64(timeout_env);
+#else
 	timeout = atoll(timeout_env);
+#endif
 	ts.tv_sec = timeout / 1000000;
 	ts.tv_nsec = (timeout % 1000000) * 1000;
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();

+ 2 - 2
src/core/topology.c

@@ -458,7 +458,7 @@ _starpu_init_topology (struct _starpu_machine_config *config)
 
 	config->topology.nhwcpus = config->topology.nhwpus = sysconf(_SC_NPROCESSORS_ONLN);
 
-#elif defined(__MINGW32__) || defined(__CYGWIN__)
+#elif defined(_WIN32)
 	/* Discover the CPUs on Cygwin and MinGW systems. */
 
 	SYSTEM_INFO sysinfo;
@@ -1152,7 +1152,7 @@ _starpu_bind_thread_on_cpu (
 		STARPU_ABORT();
 	}
 
-#elif defined(__MINGW32__) || defined(__CYGWIN__)
+#elif defined(_WIN32)
 	DWORD mask = 1 << cpuid;
 	if (!SetThreadAffinityMask(GetCurrentThread(), mask))
 	{

+ 6 - 2
src/core/workers.c

@@ -42,7 +42,7 @@
 #include <core/simgrid.h>
 #endif
 
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 #include <windows.h>
 #endif
 
@@ -436,6 +436,10 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 	STARPU_PTHREAD_MUTEX_INIT(&workerarg->sched_mutex, NULL);
 	starpu_task_list_init(&workerarg->local_tasks);
 	workerarg->current_task = NULL;
+	workerarg->first_task = 0;
+	workerarg->ntasks = 0;
+	workerarg->pipeline_length = 0;
+	workerarg->pipeline_stuck = 0;
 	workerarg->set = NULL;
 
 	/* if some codelet's termination cannot be handled directly :
@@ -1026,7 +1030,7 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	initialized = CHANGING;
 	STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
 
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 	WSADATA wsadata;
 	WSAStartup(MAKEWORD(1,0), &wsadata);
 #endif

+ 8 - 1
src/core/workers.h

@@ -52,6 +52,8 @@
 
 #include <starpu_parameters.h>
 
+#define STARPU_MAX_PIPELINE 4
+
 /* This is initialized from in _starpu_worker_init */
 LIST_TYPE(_starpu_worker,
 	struct _starpu_machine_config *config;
@@ -73,7 +75,12 @@ LIST_TYPE(_starpu_worker,
 	starpu_pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
         starpu_pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitely submitted to that queue */
-	struct starpu_task *current_task; /* task currently executed by this worker */
+	struct starpu_task *current_task; /* task currently executed by this worker (non-pipelined version) */
+	struct starpu_task *current_tasks[STARPU_MAX_PIPELINE]; /* tasks currently executed by this worker (pipelined version) */
+	unsigned char first_task; /* Index of first task in the pipeline */
+	unsigned char ntasks; /* number of tasks in the pipeline */
+	unsigned char pipeline_length; /* number of tasks to be put in the pipeline */
+	unsigned char pipeline_stuck; /* whether a task prevents us from pipelining */
 	struct _starpu_worker_set *set; /* in case this worker belongs to a set */
 	struct _starpu_job_list *terminated_jobs; /* list of pending jobs which were executed */
 	unsigned worker_is_running;

+ 33 - 2
src/datawizard/coherency.c

@@ -42,6 +42,8 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 	double cost = INFINITY;
 	unsigned src_node_mask = 0;
 
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
 	for (node = 0; node < nnodes; node++)
 	{
 		if (handle->per_node[node].state != STARPU_INVALID)
@@ -73,6 +75,15 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 				double time = starpu_transfer_predict(i, destination, size);
 				unsigned handling_node;
 
+				/* Avoid transfers which the interface does not want */
+				if (copy_methods->can_copy)
+				{
+					void *src_interface = handle->per_node[i].data_interface;
+					void *dst_interface = handle->per_node[destination].data_interface;
+					if (!copy_methods->can_copy(src_interface, i, dst_interface, destination))
+						continue;
+				}
+
 				/* Avoid indirect transfers */
 				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
 					continue;
@@ -105,8 +116,28 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 		
 		if (src_node_mask & (1<<i))
 		{
+			/* Avoid transfers which the interface does not want */
+			if (copy_methods->can_copy)
+			{
+				void *src_interface = handle->per_node[i].data_interface;
+				void *dst_interface = handle->per_node[destination].data_interface;
+				unsigned handling_node;
+
+				if (!copy_methods->can_copy(src_interface, i, dst_interface, destination))
+					continue;
+
+				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
+				{
+					/* Avoid through RAM if the interface does not want it */
+					void *ram_interface = handle->per_node[STARPU_MAIN_RAM].data_interface;
+					if (!copy_methods->can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM)
+					 || !copy_methods->can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination))
+						continue;
+				}
+			}
+
 			/* however GPU are expensive sources, really !
-			 * 	Unless peer transfer is supported.
+			 * 	Unless peer transfer is supported (and it would then have been selected above).
 			 * 	Other should be ok */
 
 			if (starpu_node_get_kind(i) == STARPU_CUDA_RAM ||
@@ -223,7 +254,7 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
 {
 	(void) handle; // unused
 
-	/* XXX That's a hack until we get cudaMemcpy3DPeerAsync to work !
+	/* XXX That's a hack until we fix cudaMemcpy3DPeerAsync in the block interface
 	 * Perhaps not all data interface provide a direct GPU-GPU transfer
 	 * method ! */
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)

+ 23 - 23
src/datawizard/copy_driver.c

@@ -524,30 +524,30 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
 	{
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
-		memcpy((void *) dst + dst_offset, (void *) src + src_offset, size);
+		memcpy((void *) (dst + dst_offset), (void *) (src + src_offset), size);
 		return 0;
 
 #ifdef STARPU_USE_CUDA
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
 		return starpu_cuda_copy_async_sync(
-				(void*) src + src_offset, src_node,
-				(void*) dst + dst_offset, dst_node,
+				(void*) (src + src_offset), src_node,
+				(void*) (dst + dst_offset), dst_node,
 				size,
 				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
 				cudaMemcpyDeviceToHost);
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
 		return starpu_cuda_copy_async_sync(
-				(void*) src + src_offset, src_node,
-				(void*) dst + dst_offset, dst_node,
+				(void*) (src + src_offset), src_node,
+				(void*) (dst + dst_offset), dst_node,
 				size,
 				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
 				cudaMemcpyHostToDevice);
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
 		return starpu_cuda_copy_async_sync(
-				(void*) src + src_offset, src_node,
-				(void*) dst + dst_offset, dst_node,
+				(void*) (src + src_offset), src_node,
+				(void*) (dst + dst_offset), dst_node,
 				size,
 				async_channel?starpu_cuda_get_peer_transfer_stream(src_node, dst_node):NULL,
 				cudaMemcpyDeviceToDevice);
@@ -567,54 +567,54 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM,STARPU_CPU_RAM):
 		if (async_data)
 			return _starpu_mic_copy_mic_to_ram_async(
-					(void*) src + src_offset, src_node,
-					(void*) dst + dst_offset, dst_node,
+					(void*) (src + src_offset), src_node,
+					(void*) (dst + dst_offset), dst_node,
 					size);
 		else
 			return _starpu_mic_copy_mic_to_ram(
-					(void*) src + src_offset, src_node,
-					(void*) dst + dst_offset, dst_node,
+					(void*) (src + src_offset), src_node,
+					(void*) (dst + dst_offset), dst_node,
 					size);
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MIC_RAM):
 		if (async_data)
 			return _starpu_mic_copy_ram_to_mic_async(
-					(void*) src + src_offset, src_node,
-					(void*) dst + dst_offset, dst_node,
+					(void*) (src + src_offset), src_node,
+					(void*) (dst + dst_offset), dst_node,
 					size);
 		else
 			return _starpu_mic_copy_ram_to_mic(
-					(void*) src + src_offset, src_node,
-					(void*) dst + dst_offset, dst_node,
+					(void*) (src + src_offset), src_node,
+					(void*) (dst + dst_offset), dst_node,
 					size);
 #endif
 #ifdef STARPU_USE_SCC
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_CPU_RAM):
 		return _starpu_scc_copy_sink_to_src(
-				(void*) src + src_offset, src_node,
-				(void*) dst + dst_offset, dst_node,
+				(void*) (src + src_offset), src_node,
+				(void*) (dst + dst_offset), dst_node,
 				size);
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_SCC_RAM):
 		return _starpu_scc_copy_src_to_sink(
-				(void*) src + src_offset, src_node,
-				(void*) dst + dst_offset, dst_node,
+				(void*) (src + src_offset), src_node,
+				(void*) (dst + dst_offset), dst_node,
 				size);
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_SCC_RAM):
 		return _starpu_scc_copy_sink_to_sink(
-				(void*) src + src_offset, src_node,
-				(void*) dst + dst_offset, dst_node,
+				(void*) (src + src_offset), src_node,
+				(void*) (dst + dst_offset), dst_node,
 				size);
 #endif
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_DISK_RAM):
 	{
 		return _starpu_disk_copy_src_to_disk(
-			(void*) src + src_offset, src_node,
+			(void*) (src + src_offset), src_node,
 			(void*) dst, dst_offset, dst_node,
 			size, async_channel);
 	}
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM, STARPU_CPU_RAM):
 		return _starpu_disk_copy_disk_to_src(
 			(void*) src, src_offset, src_node,
-			(void*) dst + dst_offset, dst_node,
+			(void*) (dst + dst_offset), dst_node,
 			size, async_channel);
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM, STARPU_DISK_RAM):

+ 7 - 3
src/datawizard/data_request.c

@@ -616,9 +616,13 @@ static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 		return 0;
 
 	empty_list = _starpu_data_request_list_new();
-	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_pending_list_mutex[src_node]) && !force)
-		/* List is busy, do not bother with it */
-		return 0;
+	if (force)
+		/* We really want to handle requests */
+		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
+	else
+		if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_pending_list_mutex[src_node]))
+			/* List is busy, do not bother with it */
+			return 0;
 
 	/* for all entries of the list */
 	struct _starpu_data_request_list *local_list = data_requests_pending[src_node];

+ 4 - 1
src/datawizard/filters.c

@@ -437,12 +437,15 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 	}
 
 	/* there is no child anymore */
-	free(root_handle->children);
+	starpu_data_handle_t children = root_handle->children;
 	root_handle->children = NULL;
 	root_handle->nchildren = 0;
 
 	/* now the parent may be used again so we release the lock */
 	_starpu_spin_unlock(&root_handle->header_lock);
+
+	free(children);
+
 	_STARPU_TRACE_END_UNPARTITION(root_handle, gathering_node);
 }
 

+ 8 - 6
src/datawizard/interfaces/block_interface.c

@@ -225,14 +225,14 @@ static int pack_block_handle(starpu_data_handle_t handle, unsigned node, void **
 	if (ptr != NULL)
 	{
 		uint32_t z, y;
-		void *block = (void *)block_interface->ptr;
+		char *block = (void *)block_interface->ptr;
 
 		*ptr = malloc(*count);
 
-		void *cur = *ptr;
+		char *cur = *ptr;
 		for(z=0 ; z<block_interface->nz ; z++)
 		{
-			void *block_z = block;
+			char *block_z = block;
 			for(y=0 ; y<block_interface->ny ; y++)
 			{
 				memcpy(cur, block, block_interface->nx*block_interface->elemsize);
@@ -256,11 +256,11 @@ static int unpack_block_handle(starpu_data_handle_t handle, unsigned node, void
 	STARPU_ASSERT(count == block_interface->elemsize * block_interface->nx * block_interface->ny * block_interface->nz);
 
 	uint32_t z, y;
-	void *cur = ptr;
-	void *block = (void *)block_interface->ptr;
+	char *cur = ptr;
+	char *block = (void *)block_interface->ptr;
 	for(z=0 ; z<block_interface->nz ; z++)
 	{
-		void *block_z = block;
+		char *block_z = block;
 		for(y=0 ; y<block_interface->ny ; y++)
 		{
 			memcpy(block, cur, block_interface->nx*block_interface->elemsize);
@@ -439,6 +439,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	else
 	{
 		/* Default case: we transfer all blocks one by one: nz transfers */
+		/* TODO: use cudaMemcpy3D now that it works */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
@@ -509,6 +510,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 	else
 	{
 		/* Default case: we transfer all blocks one by one: nz 2D transfers */
+		/* TODO: use cudaMemcpy3D now that it works */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{

+ 4 - 2
src/datawizard/interfaces/data_interface.c

@@ -499,9 +499,10 @@ int _starpu_data_release_tag(starpu_data_handle_t handle)
 		STARPU_ASSERT_MSG((tag_entry != NULL),"Data handle %p with tag %d isn't in the hashmap !",handle,handle->tag);
 
 		HASH_DEL(registered_tag_handles, tag_entry);
-		free(tag_entry);
 
 		_starpu_spin_unlock(&registered_tag_handles_lock);
+
+		free(tag_entry);
 	}
 	return 0;
 }
@@ -530,9 +531,10 @@ void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
 		STARPU_ASSERT(entry != NULL);
 
 		HASH_DEL(registered_handles, entry);
-		free(entry);
 
 		_starpu_spin_unlock(&registered_handles_lock);
+
+		free(entry);
 	}
 }
 

+ 11 - 123
src/datawizard/interfaces/matrix_interface.c

@@ -27,19 +27,14 @@
 #include <drivers/scc/driver_scc_source.h>
 #include <drivers/mic/driver_mic_source.h>
 
-/* If you can promise that there is no stride in your matrices, you can define this */
-// #define NO_STRIDE
-
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
-#ifdef NO_STRIDE
 static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
 #endif
-#endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
@@ -58,17 +53,13 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 	.cuda_to_cuda = copy_cuda_to_cuda,
-#ifdef NO_STRIDE
 	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
-#endif
 #else
 #ifdef STARPU_SIMGRID
-#ifdef NO_STRIDE
 	/* Enable GPU-GPU transfers in simgrid */
 	.cuda_to_cuda_async = 1,
 #endif
 #endif
-#endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
 	.opencl_to_ram = copy_opencl_to_ram,
@@ -225,11 +216,11 @@ static int pack_matrix_handle(starpu_data_handle_t handle, unsigned node, void *
 	if (ptr != NULL)
 	{
 		uint32_t y;
-		void *matrix = (void *)matrix_interface->ptr;
+		char *matrix = (void *)matrix_interface->ptr;
 
 		*ptr = malloc(*count);
 
-		void *cur = *ptr;
+		char *cur = *ptr;
 		for(y=0 ; y<matrix_interface->ny ; y++)
 		{
 			memcpy(cur, matrix, matrix_interface->nx*matrix_interface->elemsize);
@@ -251,8 +242,8 @@ static int unpack_matrix_handle(starpu_data_handle_t handle, unsigned node, void
 	STARPU_ASSERT(count == matrix_interface->elemsize * matrix_interface->nx * matrix_interface->ny);
 
 	uint32_t y;
-	void *cur = ptr;
-	void *matrix = (void *)matrix_interface->ptr;
+	char *cur = ptr;
+	char *matrix = (void *)matrix_interface->ptr;
 	for(y=0 ; y<matrix_interface->ny ; y++)
 	{
 		memcpy(matrix, cur, matrix_interface->nx*matrix_interface->elemsize);
@@ -379,29 +370,6 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	size_t elemsize = src_matrix->elemsize;
 	cudaError_t cures;
 
-#if 0
-	struct cudaMemcpy3DParms p;
-	memset(&p, 0, sizeof(p));
-
-	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->ld * elemsize, src_matrix->ny);
-	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->ld * elemsize, dst_matrix->ny);
-	p.extent = make_cudaExtent(src_matrix->nx * elemsize, src_matrix->ny, 1);
-	p.kind = kind;
-
-	if (is_async)
-	{
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpy3DAsync(&p, stream);
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		if (!cures)
-			return -EAGAIN;
-	}
-
-	cures = cudaMemcpy3D(&p);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-#else
-
 	if (is_async)
 	{
 		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
@@ -422,17 +390,15 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 		if (ret == -EAGAIN) return ret;
 		if (ret) STARPU_CUDA_REPORT_ERROR(cures);
 	}
-#endif
 
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
 	return 0;
 }
 
-/* XXX this is broken : We need to properly call cudaDeviceEnablePeerAccess(), and avoid crossing NUMA nodes... */
-#ifdef NO_STRIDE
 static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, int is_async, cudaStream_t stream)
 {
+#ifdef HAVE_CUDA_MEMCPY_PEER
 	struct starpu_matrix_interface *src_matrix = src_interface;
 	struct starpu_matrix_interface *dst_matrix = dst_interface;
 
@@ -442,70 +408,15 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 	int src_dev = _starpu_memory_node_get_devid(src_node);
 	int dst_dev = _starpu_memory_node_get_devid(dst_node);
 
-
-#if 0
-	/* That code is not even working!! */
-	struct cudaExtent extent = make_cudaExtent(128, 128, 128);
-
-	starpu_cuda_set_device(src_dev);
-
-	struct cudaPitchedPtr mem_device1;
-	cures = cudaMalloc3D(&mem_device1, extent);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	starpu_cuda_set_device(dst_dev);
-
-	struct cudaPitchedPtr mem_device2;
-	cures = cudaMalloc3D(&mem_device2, extent);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	struct cudaMemcpy3DPeerParms p;
-	memset(&p, 0, sizeof(p));
-	p.srcDevice = src_dev;
-	p.dstDevice = dst_dev;
-	p.srcPtr = mem_device1;
-	p.dstPtr = mem_device2;
-	p.extent = extent;
-
-	fprintf(stderr,"%u %u\n", p.srcDevice, p.dstDevice);
-	fprintf(stderr,"%p %p\n", p.srcArray, p.dstArray);
-	fprintf(stderr,"%p %lu %lu %lu\n", p.srcPtr.ptr, p.srcPtr.pitch, p.srcPtr.xsize, p.srcPtr.ysize);
-	fprintf(stderr,"%p %lu %lu %lu\n", p.dstPtr.ptr, p.dstPtr.pitch, p.dstPtr.xsize, p.dstPtr.ysize);
-	fprintf(stderr,"%lu %lu %lu\n", p.srcPos.x, p.srcPos.y, p.srcPos.z);
-	fprintf(stderr,"%lu %lu %lu\n", p.dstPos.x, p.dstPos.y, p.dstPos.z);
-	fprintf(stderr,"%lu %lu %lu\n", p.extent.width, p.extent.height, p.extent.depth);
-	cures = cudaMemcpy3DPeer(&p);
-	if (STARPU_UNLIKELY(cures))
-	        STARPU_CUDA_REPORT_ERROR(cures);
-#endif
-
-#if 0
 	struct cudaMemcpy3DPeerParms p;
 	memset(&p, 0, sizeof(p));
 
 	p.srcDevice = src_dev;
 	p.dstDevice = dst_dev;
-	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx * elemsize, src_matrix->ny);
-	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->nx * elemsize, dst_matrix->ny);
+	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx, src_matrix->ny);
+	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->nx, dst_matrix->ny);
 	p.extent = make_cudaExtent(src_matrix->nx * elemsize, src_matrix->ny, 1);
 
-#if 1
-	fprintf(stderr,"%u %u\n", p.srcDevice, p.dstDevice);
-	fprintf(stderr,"%p %p\n", p.srcArray, p.dstArray);
-	fprintf(stderr,"%p %lu %lu %lu\n", p.srcPtr.ptr, p.srcPtr.pitch, p.srcPtr.xsize, p.srcPtr.ysize);
-	fprintf(stderr,"%p %lu %lu %lu\n", p.dstPtr.ptr, p.dstPtr.pitch, p.dstPtr.xsize, p.dstPtr.ysize);
-	fprintf(stderr,"%lu %lu %lu\n", p.srcPos.x, p.srcPos.y, p.srcPos.z);
-	fprintf(stderr,"%lu %lu %lu\n", p.dstPos.x, p.dstPos.y, p.dstPos.z);
-	fprintf(stderr,"%lu %lu %lu\n", p.extent.width, p.extent.height, p.extent.depth);
-#endif
-
-	cures = cudaMemcpy3DPeerAsync(&p, stream);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-	cudaStreamSynchronize(stream);
-
 	if (is_async)
 	{
 		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
@@ -519,30 +430,13 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-#else
-	/* XXX FIXME !!*/
-	STARPU_ASSERT(src_matrix->nx == src_matrix->ld);
-	STARPU_ASSERT(dst_matrix->nx == dst_matrix->ld);
-
-	if (is_async)
-	{
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpyPeerAsync((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize, stream);
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		if (!cures)
-			return -EAGAIN;
-	}
-
-	cures = cudaMemcpyPeer((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-#endif
-
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
 	return 0;
-}
+#else
+	STARPU_ABORT_MSG("CUDA memcpy peer not available, but core triggered one ?!");
 #endif
+}
 
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
@@ -559,11 +453,7 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRI
 	if (src_node == dst_node)
 		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, 0, 0);
 	else
-	{
-		/* XXX not implemented */
-		STARPU_ABORT();
-		return 0;
-	}
+		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node, 0, 0);
 }
 
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
@@ -576,7 +466,6 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 1, stream);
 }
 
-#ifdef NO_STRIDE
 static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
 {
 	if (src_node == dst_node)
@@ -584,7 +473,6 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU
 	else
 		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node, 1, stream);
 }
-#endif
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL

+ 7 - 7
src/debug/traces/starpu_fxt.c

@@ -2222,7 +2222,7 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 	{
 		unsigned inputfile;
 
-		uint64_t offsets[64];
+		uint64_t offsets[options->ninputfiles];
 
 		/*
 		 * Find the trace offsets:
@@ -2235,11 +2235,11 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 		 *	- psi_k(x) = x - offset_k
 		 */
 
-		int unique_keys[64];
-		int rank_k[64];
-		uint64_t start_k[64];
-		uint64_t sync_k[64];
-		unsigned sync_k_exists[64];
+		int unique_keys[options->ninputfiles];
+		int rank_k[options->ninputfiles];
+		uint64_t start_k[options->ninputfiles];
+		uint64_t sync_k[options->ninputfiles];
+		unsigned sync_k_exists[options->ninputfiles];
 		uint64_t M = 0;
 
 		unsigned found_one_sync_point = 0;
@@ -2310,7 +2310,7 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 #endif
 
 			char file_prefix[32];
-			snprintf(file_prefix, 32, "%d_", filerank);
+			snprintf(file_prefix, sizeof(file_prefix), "%d_", filerank);
 
 			options->file_prefix = file_prefix;
 			options->file_offset = offsets[inputfile];

+ 141 - 79
src/drivers/cuda/driver_cuda.c

@@ -49,7 +49,7 @@ static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
 static cudaStream_t in_peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
 static cudaStream_t out_peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
 static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
-static cudaEvent_t task_events[STARPU_NMAXWORKERS];
+static cudaEvent_t task_events[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE];
 #endif /* STARPU_USE_CUDA */
 
 void
@@ -221,7 +221,7 @@ static void init_context(struct _starpu_worker_set *worker_set, unsigned devid)
 {
 	cudaError_t cures;
 	int workerid;
-	unsigned i;
+	unsigned i, j;
 
 	/* TODO: cudaSetDeviceFlag(cudaDeviceMapHost) */
 
@@ -276,7 +276,8 @@ static void init_context(struct _starpu_worker_set *worker_set, unsigned devid)
 	{
 		workerid = worker_set->workers[i].workerid;
 
-		cures = cudaEventCreateWithFlags(&task_events[workerid], cudaEventDisableTiming);
+		for (j = 0; j < STARPU_MAX_PIPELINE; j++)
+			cures = cudaEventCreateWithFlags(&task_events[workerid][j], cudaEventDisableTiming);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 
@@ -307,7 +308,7 @@ static void init_context(struct _starpu_worker_set *worker_set, unsigned devid)
 static void deinit_context(struct _starpu_worker_set *worker_set)
 {
 	cudaError_t cures;
-	unsigned i;
+	unsigned i, j;
 	int workerid = worker_set->workers[0].workerid;
 	int devid = starpu_worker_get_devid(workerid);
 
@@ -316,7 +317,8 @@ static void deinit_context(struct _starpu_worker_set *worker_set)
 		workerid = worker_set->workers[i].workerid;
 		devid = starpu_worker_get_devid(workerid);
 
-		cudaEventDestroy(task_events[workerid]);
+		for (j = 0; j < STARPU_MAX_PIPELINE; j++)
+			cudaEventDestroy(task_events[workerid][j]);
 		cudaStreamDestroy(streams[workerid]);
 	}
 
@@ -396,7 +398,11 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
+	if (worker->ntasks == 1)
+	{
+		/* We are alone in the pipeline, the kernel will start now, record it */
+		_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
+	}
 
 #if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 	/* We make sure we do manipulate the proper device */
@@ -427,7 +433,9 @@ static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *wor
 	int profiling = starpu_profiling_status_get();
 
 	_starpu_set_current_task(NULL);
-	worker->current_task = NULL;
+	worker->current_tasks[worker->first_task] = NULL;
+	worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
+	worker->ntasks--;
 
 	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0, profiling);
 
@@ -444,37 +452,93 @@ static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *wor
 	_starpu_handle_job_termination(j);
 }
 
+/* Execute a job, up to completion for synchronous jobs */
+static void execute_job_on_cuda(struct starpu_task *task, struct _starpu_worker *worker)
+{
+	int workerid = worker->workerid;
+	int res;
+
+	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
+
+	res = start_job_on_cuda(j, worker);
+
+	if (res)
+	{
+		switch (res)
+		{
+			case -EAGAIN:
+				_STARPU_DISP("ouch, CUDA could not actually run task %p, putting it back...\n", task);
+				_starpu_push_task_to_workers(task);
+				STARPU_ABORT();
+			default:
+				STARPU_ABORT();
+		}
+	}
+
+#ifndef STARPU_SIMGRID
+	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+	{
+		/* Record event to synchronize with task termination later */
+		cudaEventRecord(task_events[workerid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE], starpu_cuda_get_local_stream());
+#ifdef STARPU_USE_FXT
+		int k;
+		for (k = 0; k < (int) worker->set->nworkers; k++)
+			if (worker->set->workers[k].ntasks == worker->set->workers[k].pipeline_length)
+				break;
+		if (k == (int) worker->set->nworkers)
+			/* Everybody busy */
+			_STARPU_TRACE_START_EXECUTING()
+#endif
+	}
+	else
+#else
+#ifdef STARPU_DEVEL
+#warning No CUDA asynchronous execution with simgrid yet.
+#endif
+#endif
+	/* Synchronous execution */
+	{
+#if defined(STARPU_DEBUG) && !defined(STARPU_SIMGRID)
+		STARPU_ASSERT_MSG(cudaStreamQuery(starpu_cuda_get_local_stream()) == cudaSuccess, "CUDA codelets have to wait for termination of their kernels on the starpu_cuda_get_local_stream() stream");
+#endif
+		finish_job_on_cuda(j, worker);
+	}
+}
+
 /* XXX Should this be merged with _starpu_init_cuda ? */
 int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 {
-	struct _starpu_worker *worker = &worker_set->workers[0];
-	unsigned devid = worker->devid;
+	struct _starpu_worker *worker0 = &worker_set->workers[0];
+	unsigned devid = worker0->devid;
 	unsigned i;
 
-	_starpu_worker_start(worker, _STARPU_FUT_CUDA_KEY);
+	_starpu_worker_start(worker0, _STARPU_FUT_CUDA_KEY);
 
 #ifdef STARPU_USE_FXT
-	unsigned memnode = worker->memory_node;
+	unsigned memnode = worker0->memory_node;
 	for (i = 1; i < worker_set->nworkers; i++)
 	{
-		struct _starpu_worker *_worker = &worker_set->workers[i];
-		_STARPU_TRACE_WORKER_INIT_START(_STARPU_FUT_CUDA_KEY, _worker->workerid, devid, memnode);
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		_STARPU_TRACE_WORKER_INIT_START(_STARPU_FUT_CUDA_KEY, worker->workerid, devid, memnode);
 	}
 #endif
 
 #ifndef STARPU_SIMGRID
 	init_context(worker_set, devid);
+
+	if (worker_set->nworkers > 1 && props[devid].concurrentKernels == 0)
+		_STARPU_DISP("Warning: STARPU_NWORKER_PER_CUDA is %u, but the device does not support concurrent kernel execution!\n", worker_set->nworkers);
 #endif
 
 	_starpu_cuda_limit_gpu_mem_if_needed(devid);
-	_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_cuda_get_global_mem_size(devid));
+	_starpu_memory_manager_set_global_memory_size(worker0->memory_node, _starpu_cuda_get_global_mem_size(devid));
 
-	_starpu_malloc_init(worker->memory_node);
+	_starpu_malloc_init(worker0->memory_node);
 
 	/* one more time to avoid hacks from third party lib :) */
-	_starpu_bind_thread_on_cpu(worker->config, worker->bindid);
+	_starpu_bind_thread_on_cpu(worker0->config, worker0->bindid);
 
-	worker->status = STATUS_UNKNOWN;
+	worker0->status = STATUS_UNKNOWN;
 
 	float size = (float) global_mem[devid] / (1<<30);
 #ifdef STARPU_SIMGRID
@@ -485,29 +549,31 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 	strncpy(devname, props[devid].name, 128);
 #endif
 
+	for (i = 0; i < worker_set->nworkers; i++)
+	{
+		struct _starpu_worker *worker = &worker_set->workers[i];
 #if defined(STARPU_HAVE_BUSID) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_HAVE_DOMAINID) && !defined(STARPU_SIMGRID)
-	if (props[devid].pciDomainID)
-		snprintf(worker->name, sizeof(worker->name), "CUDA %u (%s %.1f GiB %04x:%02x:%02x.0)", devid, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
-	else
+		if (props[devid].pciDomainID)
+			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %04x:%02x:%02x.0)", devid, i, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
+		else
 #endif
-		snprintf(worker->name, sizeof(worker->name), "CUDA %u (%s %.1f GiB %02x:%02x.0)", devid, devname, size, props[devid].pciBusID, props[devid].pciDeviceID);
+			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %02x:%02x.0)", devid, i, devname, size, props[devid].pciBusID, props[devid].pciDeviceID);
 #else
-	snprintf(worker->name, sizeof(worker->name), "CUDA %u (%s %.1f GiB)", devid, devname, size);
+		snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB)", devid, i, devname, size);
 #endif
-	snprintf(worker->short_name, sizeof(worker->short_name), "CUDA %u", devid);
-	_STARPU_DEBUG("cuda (%s) dev id %u thread is ready to run on CPU %d !\n", devname, devid, worker->bindid);
+		snprintf(worker->short_name, sizeof(worker->short_name), "CUDA %u.%u", devid, i);
+		_STARPU_DEBUG("cuda (%s) dev id %u worker %u thread is ready to run on CPU %d !\n", devname, devid, i, worker->bindid);
 
-	for (i = 0; i < worker_set->nworkers; i++)
-	{
+		worker->pipeline_length = starpu_get_env_number_default("STARPU_CUDA_PIPELINE", 2);
 		_STARPU_TRACE_WORKER_INIT_END(worker_set->workers[i].workerid);
 	}
 
 	/* tell the main thread that this one is ready */
-	STARPU_PTHREAD_MUTEX_LOCK(&worker->mutex);
-	worker->worker_is_initialized = 1;
-	STARPU_PTHREAD_COND_SIGNAL(&worker->ready_cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&worker0->mutex);
+	worker0->worker_is_initialized = 1;
+	STARPU_PTHREAD_COND_SIGNAL(&worker0->ready_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker0->mutex);
 
 	/* tell the main thread that this one is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
@@ -536,16 +602,17 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		struct _starpu_worker *worker = &worker_set->workers[i];
 		int workerid = worker->workerid;
 
-		task = worker->current_task;
-
-		if (!task)
+		if (!worker->ntasks)
 		{
 			idle++;
+			/* Even nothing to test */
 			continue;
 		}
 
+		task = worker->current_tasks[worker->first_task];
+
 		/* On-going asynchronous task, check for its termination first */
-		cudaError_t cures = cudaEventQuery(task_events[workerid]);
+		cudaError_t cures = cudaEventQuery(task_events[workerid][worker->first_task]);
 
 		if (cures != cudaSuccess)
 		{
@@ -556,17 +623,44 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 			/* Asynchronous task completed! */
 			_starpu_set_local_worker_key(worker);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
-			idle++;
+			/* See next task if any */
+			if (worker->ntasks)
+			{
+				task = worker->current_tasks[worker->first_task];
+				j = _starpu_get_job_associated_to_task(task);
+				if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+				{
+					/* An asynchronous task, it was already
+					 * queued, it's now running, record its start time.  */
+					_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, starpu_profiling_status_get());
+				}
+				else
+				{
+					/* A synchronous task, we have finished
+					 * flushing the pipeline, we can now at
+					 * last execute it.  */
+
+					_STARPU_TRACE_END_PROGRESS(memnode);
+					_STARPU_TRACE_EVENT("sync_task");
+					execute_job_on_cuda(task, worker);
+					_STARPU_TRACE_EVENT("end_sync_task");
+					_STARPU_TRACE_START_PROGRESS(memnode);
+					worker->pipeline_stuck = 0;
+				}
+			}
 #ifdef STARPU_USE_FXT
 			int k;
 			for (k = 0; k < (int) worker_set->nworkers; k++)
-				if (worker_set->workers[k].current_task)
+				if (worker_set->workers[k].ntasks)
 					break;
 			if (k == (int) worker_set->nworkers)
 				/* Everybody busy */
 				_STARPU_TRACE_END_EXECUTING()
 #endif
 		}
+
+		if (worker->ntasks < worker->pipeline_length)
+			idle++;
 	}
 
 	if (!idle)
@@ -589,13 +683,11 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 	for (i = 0; i < (int) worker_set->nworkers; i++)
 	{
 		struct _starpu_worker *worker = &worker_set->workers[i];
-		int workerid = worker->workerid;
 
 		task = tasks[i];
 		if (!task)
 			continue;
 
-		_starpu_set_local_worker_key(worker);
 
 		j = _starpu_get_job_associated_to_task(task);
 
@@ -603,54 +695,24 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		if (!_STARPU_CUDA_MAY_PERFORM(j))
 		{
 			/* this is neither a cuda or a cublas task */
+			worker->ntasks--;
 			_starpu_push_task_to_workers(task);
 			continue;
 		}
 
-		_STARPU_TRACE_END_PROGRESS(memnode);
-		res = start_job_on_cuda(j, worker);
-
-		if (res)
+		if (worker->ntasks > 1 && !(task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC))
 		{
-			switch (res)
-			{
-				case -EAGAIN:
-					_STARPU_DISP("ouch, CUDA could not actually run task %p, putting it back...\n", task);
-					_starpu_push_task_to_workers(task);
-					STARPU_ABORT();
-				default:
-					STARPU_ABORT();
-			}
+			/* We have to execute a non-asynchronous task but we
+			 * still have tasks in the pipeline...  Record it to
+			 * prevent more tasks from coming, and do it later */
+			worker->pipeline_stuck = 1;
+			continue;
 		}
 
-#ifndef STARPU_SIMGRID
-		if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
-		{
-			/* Record event to synchronize with task termination later */
-			cudaEventRecord(task_events[workerid], starpu_cuda_get_local_stream());
-#ifdef STARPU_USE_FXT
-			int k;
-			for (k = 0; k < (int) worker_set->nworkers; k++)
-				if (worker_set->workers[k].current_task)
-					break;
-			if (k < (int) worker_set->nworkers)
-				/* Everybody busy */
-				_STARPU_TRACE_START_EXECUTING()
-#endif
-		}
-		else
-#else
-#ifdef STARPU_DEVEL
-#warning No CUDA asynchronous execution with simgrid yet.
-#endif
-#endif
-		/* Synchronous execution */
-		{
-#if defined(STARPU_DEBUG) && !defined(STARPU_SIMGRID)
-			STARPU_ASSERT_MSG(cudaStreamQuery(starpu_cuda_get_local_stream()) == cudaSuccess, "CUDA codelets have to wait for termination of their kernels on the starpu_cuda_get_local_stream() stream");
-#endif
-			finish_job_on_cuda(j, worker);
-		}
+		_starpu_set_local_worker_key(worker);
+
+		_STARPU_TRACE_END_PROGRESS(memnode);
+		execute_job_on_cuda(task, worker);
 		_STARPU_TRACE_START_PROGRESS(memnode);
 	}
 

+ 17 - 4
src/drivers/driver_common/driver_common.c

@@ -275,6 +275,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 	STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
 	struct starpu_task *task;
 	unsigned needed = 1;
+
 	_starpu_worker_set_status_scheduling(workerid);
 	while(needed)
 	{
@@ -328,7 +329,10 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 		needed = !needed;
 	}
 
-	task = _starpu_pop_task(worker);
+	if (worker->pipeline_length && (worker->ntasks == worker->pipeline_length || worker->pipeline_stuck))
+		task = NULL;
+	else
+		task = _starpu_pop_task(worker);
 
 	if (task == NULL)
 	{
@@ -394,8 +398,11 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 	/*for each worker*/
 	for (i = 0; i < nworkers; i++)
 	{
-		/*if the worker is already executinf a task then */
-		if(workers[i].current_task)
+		/*if the worker is already executing a task then */
+		if((workers[i].pipeline_length == 0 && workers[i].current_task)
+			|| (workers[i].pipeline_length != 0 &&
+				(workers[i].ntasks == workers[i].pipeline_length
+				 || workers[i].pipeline_stuck)))
 		{
 			tasks[i] = NULL;
 		}
@@ -415,7 +422,13 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 				count ++;
 				j = _starpu_get_job_associated_to_task(tasks[i]);
 				is_parallel_task = (j->task_size > 1);
-				workers[i].current_task = j->task;
+				if (workers[i].pipeline_length)
+				{
+					workers[i].current_tasks[(workers[i].first_task + workers[i].ntasks)%STARPU_MAX_PIPELINE] = tasks[i];
+					workers[i].ntasks++;
+				}
+				else
+					workers[i].current_task = j->task;
 				/* Get the rank in case it is a parallel task */
 				if (is_parallel_task)
 				{

+ 1 - 1
src/drivers/driver_common/driver_common.h

@@ -18,8 +18,8 @@
 #ifndef __DRIVER_COMMON_H__
 #define __DRIVER_COMMON_H__
 
-#include <sys/time.h>
 #include <starpu.h>
+#include <starpu_util.h>
 #include <core/jobs.h>
 #include <common/utils.h>
 

+ 107 - 56
src/drivers/opencl/driver_opencl.c

@@ -50,7 +50,7 @@ static cl_command_queue in_transfer_queues[STARPU_MAXOPENCLDEVS];
 static cl_command_queue out_transfer_queues[STARPU_MAXOPENCLDEVS];
 static cl_command_queue peer_transfer_queues[STARPU_MAXOPENCLDEVS];
 static cl_command_queue alloc_queues[STARPU_MAXOPENCLDEVS];
-static cl_event task_events[STARPU_MAXOPENCLDEVS];
+static cl_event task_events[STARPU_MAXOPENCLDEVS][STARPU_MAX_PIPELINE];
 #endif
 
 void
@@ -376,7 +376,7 @@ cl_int starpu_opencl_copy_async_sync(uintptr_t src, size_t src_offset, unsigned
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
 		err = starpu_opencl_copy_opencl_to_ram(
 				(cl_mem) src, src_node,
-				(void*) dst + dst_offset, dst_node,
+				(void*) (dst + dst_offset), dst_node,
 				size, src_offset, event, &ret);
 		if (STARPU_UNLIKELY(err))
 			STARPU_OPENCL_REPORT_ERROR(err);
@@ -384,7 +384,7 @@ cl_int starpu_opencl_copy_async_sync(uintptr_t src, size_t src_offset, unsigned
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
 		err = starpu_opencl_copy_ram_to_opencl(
-				(void*) src + src_offset, src_node,
+				(void*) (src + src_offset), src_node,
 				(cl_mem) dst, dst_node,
 				size, dst_offset, event, &ret);
 		if (STARPU_UNLIKELY(err))
@@ -564,6 +564,7 @@ static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname);
 #endif
 static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *worker);
 static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *worker);
+static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_worker *worker);
 
 int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 {
@@ -596,6 +597,8 @@ int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 	snprintf(worker->name, sizeof(worker->name), "OpenCL %u (%s %.1f GiB)", devid, devname, size);
 	snprintf(worker->short_name, sizeof(worker->short_name), "OpenCL %u", devid);
 
+	worker->pipeline_length = starpu_get_env_number_default("STARPU_OPENCL_PIPELINE", 2);
+
 	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, worker->bindid);
 
 	_STARPU_TRACE_WORKER_INIT_END(worker->workerid);
@@ -616,19 +619,19 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 
 	struct _starpu_job *j;
 	struct starpu_task *task;
-	int res;
 
 #ifndef STARPU_SIMGRID
-	task = starpu_task_get_current();
-
-	if (task)
+	if (worker->ntasks)
 	{
 		cl_int status;
 		size_t size;
 		int err;
+
 		/* On-going asynchronous task, check for its termination first */
 
-		err = clGetEventInfo(task_events[worker->devid], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, &size);
+		task = worker->current_tasks[worker->first_task];
+
+		err = clGetEventInfo(task_events[worker->devid][worker->first_task], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, &size);
 		STARPU_ASSERT(size == sizeof(cl_int));
 		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -640,16 +643,38 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 			return 0;
 		}
 
+		task_events[worker->devid][worker->first_task] = 0;
+
 		/* Asynchronous task completed! */
-		_STARPU_TRACE_END_EXECUTING();
 		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), worker);
+		/* See next task if any */
+		if (worker->ntasks)
+		{
+			task = worker->current_tasks[worker->first_task];
+			j = _starpu_get_job_associated_to_task(task);
+			if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
+			{
+				/* An asynchronous task, it was already queued,
+				 * it's now running, record its start time.  */
+				_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, starpu_profiling_status_get());
+			}
+			else
+			{
+				/* A synchronous task, we have finished flushing the pipeline, we can now at last execute it.  */
+				_STARPU_TRACE_END_PROGRESS(memnode);
+				_STARPU_TRACE_EVENT("sync_task");
+				_starpu_opencl_execute_job(task, worker);
+				_STARPU_TRACE_EVENT("end_sync_task");
+				_STARPU_TRACE_START_PROGRESS(memnode);
+				worker->pipeline_stuck = 0;
+			}
+		}
+		_STARPU_TRACE_END_EXECUTING();
 	}
 #endif /* STARPU_SIMGRID */
 
 	__starpu_datawizard_progress(memnode, 1, 1);
 
-	_STARPU_TRACE_END_PROGRESS(memnode);
-
 	task = _starpu_get_worker_task(worker, workerid, memnode);
 
 	if (task == NULL)
@@ -665,52 +690,20 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 		return 0;
 	}
 
-	res = _starpu_opencl_start_job(j, worker);
+	worker->current_tasks[(worker->first_task  + worker->ntasks)%STARPU_MAX_PIPELINE] = task;
+	worker->ntasks++;
 
-	if (res)
+	if (worker->ntasks > 1 && !(task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC))
 	{
-		switch (res)
-		{
-			case -EAGAIN:
-				_STARPU_DISP("ouch, OpenCL could not actually run task %p, putting it back...\n", task);
-				_starpu_push_task_to_workers(task);
-				STARPU_ABORT();
-				return 0;
-			default:
-				STARPU_ABORT();
-		}
+		/* We have to execute a non-asynchronous task but we
+		 * still have tasks in the pipeline...  Record it to
+		 * prevent more tasks from coming, and do it later */
+		worker->pipeline_stuck = 1;
+		return 0;
 	}
 
-#ifndef STARPU_SIMGRID
-	if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
-	{
-		/* Record event to synchronize with task termination later */
-		int err;
-		cl_command_queue queue;
-		starpu_opencl_get_queue(worker->devid, &queue);
-		/* the function clEnqueueMarker is deprecated from
-		 * OpenCL version 1.2. We would like to use the new
-		 * function clEnqueueMarkerWithWaitList. We could do
-		 * it by checking its availability through our own
-		 * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
-		 * and the OpenCL macro CL_VERSION_1_2. However these
-		 * 2 macros detect the function availability in the
-		 * ICD and not in the device implementation.
-		 */
-		err = clEnqueueMarker(queue, &task_events[worker->devid]);
-		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
-		_STARPU_TRACE_START_EXECUTING();
-	}
-	else
-#else
-#ifdef STARPU_DEVEL
-#warning No OpenCL asynchronous execution with simgrid yet.
-#endif
-#endif
-	/* Synchronous execution */
-	{
-		_starpu_opencl_stop_job(j, worker);
-	}
+	_STARPU_TRACE_END_PROGRESS(memnode);
+	_starpu_opencl_execute_job(task, worker);
 	_STARPU_TRACE_START_PROGRESS(memnode);
 
 	return 0;
@@ -816,7 +809,6 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 	STARPU_ASSERT(cl);
 
 	_starpu_set_current_task(j->task);
-	worker->current_task = j->task;
 
 	ret = _starpu_fetch_task_input(j);
 	if (ret != 0)
@@ -827,7 +819,11 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
+	if (worker->ntasks == 1)
+	{
+		/* We are alone in the pipeline, the kernel will start now, record it */
+		_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
+	}
 
 	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
 	STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");
@@ -865,7 +861,9 @@ static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker
 	int profiling = starpu_profiling_status_get();
 
 	_starpu_set_current_task(NULL);
-	worker->current_task = NULL;
+	worker->current_tasks[worker->first_task] = NULL;
+	worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
+	worker->ntasks--;
 
 	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0, profiling);
 
@@ -882,6 +880,59 @@ static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker
 
 }
 
+static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_worker *worker)
+{
+	int res;
+
+	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
+
+	res = _starpu_opencl_start_job(j, worker);
+
+	if (res)
+	{
+		switch (res)
+		{
+			case -EAGAIN:
+				_STARPU_DISP("ouch, OpenCL could not actually run task %p, putting it back...\n", task);
+				_starpu_push_task_to_workers(task);
+				STARPU_ABORT();
+			default:
+				STARPU_ABORT();
+		}
+	}
+
+#ifndef STARPU_SIMGRID
+	if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
+	{
+		/* Record event to synchronize with task termination later */
+		int err;
+		cl_command_queue queue;
+		starpu_opencl_get_queue(worker->devid, &queue);
+		/* the function clEnqueueMarker is deprecated from
+		 * OpenCL version 1.2. We would like to use the new
+		 * function clEnqueueMarkerWithWaitList. We could do
+		 * it by checking its availability through our own
+		 * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
+		 * and the OpenCL macro CL_VERSION_1_2. However these
+		 * 2 macros detect the function availability in the
+		 * ICD and not in the device implementation.
+		 */
+		err = clEnqueueMarker(queue, &task_events[worker->devid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE]);
+		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
+		_STARPU_TRACE_START_EXECUTING();
+	}
+	else
+#else
+#ifdef STARPU_DEVEL
+#warning No OpenCL asynchronous execution with simgrid yet.
+#endif
+#endif
+	/* Synchronous execution */
+	{
+		_starpu_opencl_stop_job(j, worker);
+	}
+}
+
 #ifdef STARPU_USE_OPENCL
 int _starpu_run_opencl(struct _starpu_worker *workerarg)
 {

+ 5 - 2
src/drivers/opencl/driver_opencl_utils.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,9 +20,12 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
-#include <unistd.h>
 #include <sys/types.h>
 
+#include <common/config.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
 #include <starpu_opencl.h>
 #include <starpu_profiling.h>
 #include <core/workers.h>

+ 2 - 2
src/profiling/bound.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -227,7 +227,7 @@ static void new_task(struct _starpu_job *j)
 	t->tag_id = j->task->tag_id;
 	t->use_tag = j->task->use_tag;
 	t->cl = j->task->cl;
-	t->footprint = _starpu_compute_buffers_footprint(NULL, STARPU_CPU_WORKER, 0, j);
+	t->footprint = _starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, STARPU_CPU_WORKER, 0, j);
 	t->priority = j->task->priority;
 	t->deps = NULL;
 	t->depsn = 0;

+ 2 - 2
src/profiling/profiling.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -18,9 +18,9 @@
 #ifndef __PROFILING_H__
 #define __PROFILING_H__
 
-#include <sys/time.h>
 #include <starpu.h>
 #include <starpu_profiling.h>
+#include <starpu_util.h>
 #include <common/config.h>
 
 /* Create a task profiling info structure (with the proper time stamps) in case

+ 9 - 4
src/top/starpu_top_connection.c

@@ -15,13 +15,16 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#define WINVER WindowsXP
-
 #include <starpu_config.h>
 
-#ifdef STARPU_HAVE_WINDOWS
+#ifdef __MINGW__
 #  include <w32api.h>
+#  define WINVER WindowsXP
+#endif
+
+#ifdef STARPU_HAVE_WINDOWS
 #  include <ws2tcpip.h>
+#  include <io.h>
 #else
 #  include <sys/socket.h>
 #  include <netinet/in.h>
@@ -37,7 +40,9 @@
 #include <string.h>
 #include <sys/types.h>
 #include <stdlib.h>
+#ifdef HAVE_UNISTD_H
 #include <unistd.h>
+#endif
 
 const char *STARPU_TOP_PORT = "2011";
 const int STARPU_TOP_BUFFER_SIZE=1024;
@@ -122,7 +127,7 @@ void _starpu_top_communications_threads_launcher(void)
    	}
   	int sock=socket(ans->ai_family, ans->ai_socktype, ans->ai_protocol);
 	int optval = 1;
-	setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval));
+	setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void*) &optval, sizeof(optval));
 
 	if (bind(sock, ans->ai_addr, ans->ai_addrlen) < 0)
 	{

+ 0 - 1
src/top/starpu_top_task.c

@@ -22,7 +22,6 @@
 #include <core/task.h>
 #include <stdio.h>
 #include <string.h>
-#include <sys/time.h>
 #include <common/timing.h>
 
 /********************************************

+ 4 - 5
tests/loader.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -136,14 +136,13 @@ static int _decode(char **src, char *motif, const char *value)
 	found = strstr(*src, motif);
 	if (found == NULL) return 0;
 
-	char *new_src = malloc((strlen(*src)+strlen(value))*sizeof(char));
-	strcpy(new_src, "");
+	char *new_src = malloc(strlen(*src)-strlen(motif)+strlen(value)+1);
 
-	strncat(new_src, *src, strlen(*src)-strlen(found));
+	strncpy(new_src, *src, found - *src);
 	strcat(new_src, value);
 	strcat(new_src, found+strlen(motif));
 
-	*src = strdup(new_src);
+	*src = new_src;
 	return 1;
 }
 

+ 6 - 0
tests/main/driver_api/run_driver.c

@@ -80,6 +80,8 @@ test_cpu(void)
 	if (ret == -ENODEV || starpu_cpu_worker_get_count() == 0)
 	{
 		FPRINTF(stderr, "WARNING: No CPU worker found\n");
+		if (ret == 0)
+			starpu_shutdown();
 		return STARPU_TEST_SKIPPED;
 	}
 
@@ -138,6 +140,8 @@ test_cuda(void)
 	if (ret == -ENODEV || starpu_cuda_worker_get_count() == 0)
 	{
 		FPRINTF(stderr, "WARNING: No CUDA worker found\n");
+		if (ret == 0)
+			starpu_shutdown();
 		return STARPU_TEST_SKIPPED;
 	}
 
@@ -222,6 +226,8 @@ test_opencl(void)
 	if (ret == -ENODEV || starpu_opencl_worker_get_count() == 0)
 	{
 		FPRINTF(stderr, "WARNING: No OpenCL workers found\n");
+		if (ret == 0)
+			starpu_shutdown();
 		return STARPU_TEST_SKIPPED;
 	}
 

+ 34 - 8
tests/overlap/gpu_concurrency.c

@@ -24,27 +24,48 @@
 #include <common/thread.h>
 
 #define NITERS 1000000
-#define NTASKS 128
+#define NTASKS 64
+#define SYNC 16
 
 #ifdef STARPU_USE_CUDA
 extern void long_kernel_cuda(unsigned long niters);
-void codelet_long_kernel(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+
+void codelet_long_kernel_async(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	long_kernel_cuda(NITERS);
+}
+
+void codelet_long_kernel_sync(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	long_kernel_cuda(NITERS);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
-static struct starpu_perfmodel model =
+static struct starpu_perfmodel model_async =
 {
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "long_kernel",
+	.symbol = "long_kernel_async",
 };
 
-static struct starpu_codelet cl =
+static struct starpu_perfmodel model_sync =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "long_kernel_sync",
+};
+
+static struct starpu_codelet cl_async =
 {
-	.cuda_funcs = {codelet_long_kernel, NULL},
+	.cuda_funcs = {codelet_long_kernel_async, NULL},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 0,
-	.model =  &model
+	.model =  &model_async,
+};
+
+static struct starpu_codelet cl =
+{
+	.cuda_funcs = {codelet_long_kernel_sync, NULL},
+	.nbuffers = 0,
+	.model =  &model_sync,
 };
 #endif
 
@@ -53,6 +74,7 @@ int main(int argc, char **argv)
 #ifndef STARPU_USE_CUDA
 	return STARPU_TEST_SKIPPED;
 #else
+	setenv("STARPU_NWORKER_PER_CUDA", "4", 1);
 	int ret = starpu_initialize(NULL, &argc, &argv);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
@@ -66,7 +88,11 @@ int main(int argc, char **argv)
 	for (iter = 0; iter < NTASKS; iter++)
 	{
 		struct starpu_task *task = starpu_task_create();
-		task->cl = &cl;
+
+		if (!(iter % SYNC))
+			task->cl = &cl;
+		else
+			task->cl = &cl_async;
 
 		ret = starpu_task_submit(task);
 		if (ret == -ENODEV) goto enodev;

+ 41 - 25
tools/gdbinit

@@ -1,7 +1,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010-2013  Université de Bordeaux 1
+# Copyright (C) 2010-2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -30,7 +30,7 @@ define starpu-print-job
     printf "\tsubmitted:\t\t\t<%d>\n", $job->submitted
     printf "\tterminated:\t\t\t<%d>\n", $job->terminated
     printf "\tjob_id:\t\t\t\t<%d>\n", $job->job_id
-    if $job->task
+    if $job->task && $job->task->name
         printf "\tname:\t\t\t\t<%s>\n", $job->task->name
     end
   end
@@ -71,7 +71,9 @@ define starpu-print-task
   end
 
   printf "StarPU Task (%p)\n", $task
-  printf "\tname:\t\t\t\t<%s>\n", $task->name
+  if $task->name
+    printf "\tname:\t\t\t\t<%s>\n", $task->name
+  end
   printf "\tcodelet:\t\t\t<%p>\n", $task->cl
   printf "\tcallback:\t\t\t<%p>\n", $task->callback_func
   printf "\tsynchronous:\t\t\t<%d>\n", $task->synchronous
@@ -90,6 +92,32 @@ define starpu-print-task
   end
 end
 
+define starpu-print-task-and-successor
+  set language c
+  set $t = (struct starpu_task *) ($arg0)
+  starpu-print-task $t
+  set $j = (struct _starpu_job *) $t->starpu_private
+  set $nsuccs = $j->job_successors.nsuccs
+  set $i = 0
+  while $i < $nsuccs
+    set $cg = $j->job_successors.succ[$i]
+    if ($cg->cg_type == 1)
+      # STARPU_CG_APPS
+      printf "waited for by application"
+    end
+    if ($cg->cg_type == 2)
+      # STARPU_CG_TAG
+      printf "will produce tag %x\n", $cg->succ.tag
+    end
+    if ($cg->cg_type == 4)
+      # STARPU_CG_TASK
+      printf "dep of task %p\n", $cg->succ.job
+      starpu-print-task $cg->succ.job->task
+    end
+    set $i = $i + 1
+  end
+end
+
 document starpu-print-task
 Prints a StarPU task
 end
@@ -150,30 +178,18 @@ define starpu-tasks
   printf "Tasks being run:\n"
   set $n = 0
   while $n < config.topology.nworkers
+    printf "worker %d %s:\n", $n, config.workers[$n].short_name
+    if config.workers[$n].pipeline_length > 0
+      set $m = 0
+      while $m < config.workers[$n].ntasks
+        set $t = config.workers[$n].current_tasks[(config.workers[$n].first_task + $m) % (sizeof(config.workers[$n].current_tasks)/sizeof(config.workers[$n].current_tasks[0]))]
+        starpu-print-task-and-successor $t
+        set $m = $m + 1
+      end
+    end
     set $task = config.workers[$n].current_task
     if ($task)
-      printf "worker %d:\n", $n
-      starpu-print-task $task
-      set $j = (struct _starpu_job *) $task->starpu_private
-      set $nsuccs = $j->job_successors.nsuccs
-      set $i = 0
-      while $i < $nsuccs
-        set $cg = $j->job_successors.succ[$i]
-	if ($cg->cg_type == 1)
-	  # STARPU_CG_APPS
-	  printf "waited for by application"
-	end
-	if ($cg->cg_type == 2)
-	  # STARPU_CG_TAG
-	  printf "will produce tag %x\n", $cg->succ.tag
-	end
-	if ($cg->cg_type == 4)
-	  # STARPU_CG_TASK
-	  printf "dep of task %p\n", $cg->succ.job
-	  starpu-print-task $cg->succ.job->task
-	end
-        set $i = $i + 1
-      end
+      starpu-print-task-and-successor $task
     end
     set $n = $n + 1
   end

+ 1 - 1
tools/starpu_calibrate_bus.c

@@ -17,7 +17,7 @@
 #include <config.h>
 #include <starpu.h>
 #include <stdio.h>
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 #include <windows.h>
 #endif
 

+ 2 - 2
tools/starpu_perfmodel_display.c

@@ -24,7 +24,7 @@
 
 #include <starpu.h>
 
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 #include <windows.h>
 #endif
 
@@ -141,7 +141,7 @@ static void parse_args(int argc, char **argv)
 
 int main(int argc, char **argv)
 {
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 	WSADATA wsadata;
 	WSAStartup(MAKEWORD(1,0), &wsadata);
 #endif

+ 2 - 2
tools/starpu_perfmodel_plot.c

@@ -31,7 +31,7 @@
 #include <core/perfmodel/perfmodel.h> // we need to browse the list associated to history-based models
 #include <core/workers.h>
 
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 #include <windows.h>
 #endif
 
@@ -427,7 +427,7 @@ int main(int argc, char **argv)
 	char gnuplot_file_name[256];
 	struct _perfmodel_plot_options options;
 
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 	WSADATA wsadata;
 	WSAStartup(MAKEWORD(1,0), &wsadata);
 #endif