Samuel Thibault лет назад: 10
Родитель
Сommit
0eb9ac3d71
75 измененных файлов с 1018 добавлено и 625 удалено
  1. 10 1
      ChangeLog
  2. 20 0
      doc/doxygen/chapters/40environment_variables.doxy
  3. 4 0
      doc/doxygen/chapters/api/data_interfaces.doxy
  4. 5 5
      examples/audio/starpu_audio_processing.c
  5. 5 6
      examples/axpy/axpy.c
  6. 6 6
      examples/cg/cg.c
  7. 11 11
      examples/heat/dw_factolu.c
  8. 6 6
      examples/heat/dw_factolu_grain.c
  9. 6 6
      examples/heat/dw_factolu_tag.c
  10. 5 6
      examples/incrementer/incrementer.c
  11. 6 6
      examples/lu/lu_example.c
  12. 5 5
      examples/lu/xlu.c
  13. 5 5
      examples/lu/xlu_implicit.c
  14. 7 6
      examples/lu/xlu_implicit_pivot.c
  15. 5 5
      examples/lu/xlu_pivot.c
  16. 5 5
      examples/mandelbrot/mandelbrot.c
  17. 5 2
      examples/pi/SobolQRNG/sobol_gold.c
  18. 1 1
      examples/pi/SobolQRNG/sobol_primitives.c
  19. 6 6
      examples/pi/pi.c
  20. 5 5
      examples/pi/pi_redux.c
  21. 10 10
      examples/ppm_downscaler/yuv_downscaler.c
  22. 2 2
      examples/sched_ctx/sched_ctx_without_sched_policy.c
  23. 18 18
      examples/spmv/dw_block_spmv.c
  24. 4 4
      examples/spmv/spmv.c
  25. 2 0
      include/pthread_win32/pthread.h
  26. 0 8
      include/starpu_config.h.in
  27. 2 0
      include/starpu_data_interfaces.h
  28. 4 1
      include/starpu_profiling.h
  29. 2 2
      include/starpu_task.h
  30. 7 7
      include/starpu_thread.h
  31. 5 5
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  32. 2 2
      mpi/examples/mpi_lu/plu_outofcore_example.c
  33. 6 6
      mpi/examples/mpi_lu/pxlu.c
  34. 5 5
      mpi/examples/mpi_lu/pxlu_implicit.c
  35. 4 4
      mpi/src/starpu_mpi_task_insert.c
  36. 5 1
      src/common/fxt.c
  37. 2 2
      src/common/fxt.h
  38. 6 2
      src/common/thread.c
  39. 3 2
      src/common/timing.c
  40. 88 3
      src/common/utils.c
  41. 9 0
      src/common/utils.h
  42. 1 1
      src/core/combined_workers.c
  43. 26 2
      src/core/perfmodel/perfmodel_bus.c
  44. 19 3
      src/core/perfmodel/perfmodel_history.c
  45. 8 2
      src/core/perfmodel/perfmodel_nan.c
  46. 2 2
      src/core/sched_ctx.c
  47. 3 0
      src/core/simgrid.c
  48. 4 0
      src/core/task.c
  49. 2 2
      src/core/topology.c
  50. 5 1
      src/core/workers.c
  51. 8 1
      src/core/workers.h
  52. 33 2
      src/datawizard/coherency.c
  53. 4 1
      src/datawizard/filters.c
  54. 2 0
      src/datawizard/interfaces/block_interface.c
  55. 4 2
      src/datawizard/interfaces/data_interface.c
  56. 7 119
      src/datawizard/interfaces/matrix_interface.c
  57. 2 1
      src/datawizard/memory_nodes.c
  58. 11 7
      src/debug/traces/starpu_fxt.c
  59. 158 96
      src/drivers/cuda/driver_cuda.c
  60. 48 35
      src/drivers/driver_common/driver_common.c
  61. 140 90
      src/drivers/opencl/driver_opencl.c
  62. 2 2
      src/profiling/bound.c
  63. 7 2
      src/top/starpu_top_connection.c
  64. 14 18
      src/worker_collection/worker_tree.c
  65. 6 0
      tests/main/driver_api/run_driver.c
  66. 41 0
      tests/main/subgraph_repeat_regenerate_tag.c
  67. 28 0
      tests/main/subgraph_repeat_tag.c
  68. 2 2
      tests/microbenchs/matrix_as_vector.c
  69. 34 8
      tests/overlap/gpu_concurrency.c
  70. 41 25
      tools/gdbinit
  71. 1 1
      tools/starpu_calibrate_bus.c
  72. 4 4
      tools/starpu_fxt_stats.c
  73. 18 12
      tools/starpu_fxt_tool.c
  74. 2 2
      tools/starpu_perfmodel_display.c
  75. 17 5
      tools/starpu_perfmodel_plot.c

+ 10 - 1
ChangeLog

@@ -51,6 +51,8 @@ New features:
     CUDA and OpenCL kernel execution.
   * Add CUDA concurrent kernel execution support through
     the STARPU_NWORKER_PER_CUDA environment variable.
+  * Add CUDA and OpenCL kernel submission pipelining, to overlap costs and allow
+    concurrent kernel execution on Fermi cards.
   * New locality work stealing scheduler (lws).
   * Add STARPU_VARIABLE_NBUFFERS to be set in cl.nbuffers, and nbuffers and
     modes field to the task structure, which permit to define codelets taking a
@@ -104,8 +106,15 @@ The scheduling context release
 New features:
   * One can register an existing on-GPU buffer to be used by a handle.
   * Add the starpu_paje_summary statistics tool.
+  * Enable gpu-gpu transfers for matrices.
+  * Let interfaces declare which transfers they allow with the can_copy
+    methode.
 
-StarPU 1.1.2 (svn revision xxx)
+Small changes:
+  * Lock performance model files while writing and reading them to avoid
+    issues on parallel launches, MPI runs notably.
+
+StarPU 1.1.2 (svn revision 13011)
 ==============================================
 The scheduling context release
 

+ 20 - 0
doc/doxygen/chapters/40environment_variables.doxy

@@ -51,6 +51,16 @@ Specify the number of workers per CUDA device, and thus the number of kernels
 which will be concurrently running on the devices. The default value is 1.
 </dd>
 
+<dt>STARPU_CUDA_PIPELINE</dt>
+<dd>
+\anchor STARPU_CUDA_PIPELINE
+\addindex __env__STARPU_CUDA_PIPELINE
+Specify how many asynchronous tasks are submitted in advance on CUDA
+devices. This for instance permits to overlap task management with the execution
+of previous tasks, but it also allows concurrent execution on Fermi cards, which
+otherwise bring spurious synchronizations. The default is 2.
+</dd>
+
 <dt>STARPU_NOPENCL</dt>
 <dd>
 \anchor STARPU_NOPENCL
@@ -58,6 +68,16 @@ which will be concurrently running on the devices. The default value is 1.
 OpenCL equivalent of the environment variable \ref STARPU_NCUDA.
 </dd>
 
+<dt>STARPU_OPENCL_PIPELINE</dt>
+<dd>
+\anchor STARPU_OPENCL_PIPELINE
+\addindex __env__STARPU_OPENCL_PIPELINE
+Specify how many asynchronous tasks are submitted in advance on OpenCL
+devices. This for instance permits to overlap task management with the execution
+of previous tasks, but it also allows concurrent execution on Fermi cards, which
+otherwise bring spurious synchronizations. The default is 2.
+</dd>
+
 <dt>STARPU_NMICDEVS</dt>
 <dd>
 \anchor STARPU_NMICDEVS

+ 4 - 0
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -55,6 +55,10 @@ provided, it will be used by default if no more specific method is
 provided. It can still be useful to provide more specific method in
 case of e.g. available particular CUDA or OpenCL support.
 \ingroup API_Data_Interfaces
+\var starpu_data_copy_methods::can_copy
+If defined, allows the interface to declare whether it supports transferring
+from \p src_interface on node \p src_node to \p dst_interface on node \p. If not
+defined, it is assumed that the interface supports all transfers.
 \var starpu_data_copy_methods::ram_to_ram
 Define how to copy data from the \p src_interface interface on the \p
 src_node CPU node to the \p dst_interface interface on the \p dst_node

+ 5 - 5
examples/audio/starpu_audio_processing.c

@@ -59,8 +59,8 @@ float *A;
 starpu_data_handle_t A_handle;
 
 /* For performance evaluation */
-static struct timeval start;
-static struct timeval end;
+static double start;
+static double end;
 static unsigned task_per_worker[STARPU_NMAXWORKERS] = {0};
 
 /* 
@@ -426,7 +426,7 @@ int main(int argc, char **argv)
 	for (iter = 0; iter < niter; iter++)
 		starpu_data_set_wt_mask(starpu_data_get_sub_data(A_handle, 1, iter), 1<<0);
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	for (iter = 0; iter < niter; iter++)
 	{
@@ -435,9 +435,9 @@ int main(int argc, char **argv)
 
 	starpu_task_wait_for_all();
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	fprintf(stderr, "Computation took %2.2f ms\n", timing/1000);
 
 	int worker;

+ 5 - 6
examples/axpy/axpy.c

@@ -166,10 +166,10 @@ int main(int argc, char **argv)
 	starpu_data_partition(_handle_x, &block_filter);
 	starpu_data_partition(_handle_y, &block_filter);
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	unsigned b;
 	for (b = 0; b < NBLOCKS; b++)
@@ -202,9 +202,8 @@ enodev:
 	starpu_data_unregister(_handle_x);
 	starpu_data_unregister(_handle_y);
 
-	gettimeofday(&end, NULL);
-        double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
-                                        (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+        double timing = end - start;
 
 	FPRINTF(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(TYPE)/timing);
 

+ 6 - 6
examples/cg/cg.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -294,9 +294,9 @@ static int cg(void)
 	FPRINTF(stderr, "*************** INITIAL ************ \n");
 	FPRINTF(stderr, "Delta 0: %e\n", delta_new);
 
-	struct timeval start;
-	struct timeval end;
-	gettimeofday(&start, NULL);
+	double start;
+	double end;
+	start = starpu_timing_now();
 
 	while ((i < i_max) && ((double)delta_new > (double)(eps*eps*delta_0)))
 	{
@@ -351,9 +351,9 @@ static int cg(void)
 		i++;
 	}
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)(((double)end.tv_sec - (double)start.tv_sec)*10e6 + ((double)end.tv_usec - (double)start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
 	FPRINTF(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
 	return 0;

+ 11 - 11
examples/heat/dw_factolu.c

@@ -30,12 +30,12 @@ struct starpu_perfmodel model_12;
 struct starpu_perfmodel model_21;
 struct starpu_perfmodel model_22;
 
-unsigned *advance_11; /* size nblocks, whether the 11 task is done */
-unsigned *advance_12_21; /* size nblocks*nblocks */
-unsigned *advance_22; /* array of nblocks *nblocks*nblocks */
+static unsigned *advance_11; /* size nblocks, whether the 11 task is done */
+static unsigned *advance_12_21; /* size nblocks*nblocks */
+static unsigned *advance_22; /* array of nblocks *nblocks*nblocks */
 
-struct timeval start;
-struct timeval end;
+static double start;
+static double end;
 
 static unsigned no_prio = 0;
 
@@ -618,7 +618,7 @@ void dw_codelet_facto(starpu_data_handle_t dataA, unsigned nblocks)
 	args->nblocks = nblocks;
 	args->dataA = dataA;
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	/* inject a new task with this codelet into the system */ 
 	struct starpu_task *task = starpu_task_create();
@@ -635,9 +635,9 @@ void dw_codelet_facto(starpu_data_handle_t dataA, unsigned nblocks)
 
 	starpu_task_wait_for_all();
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
@@ -664,7 +664,7 @@ void dw_codelet_facto_v2(starpu_data_handle_t dataA, unsigned nblocks)
 	args->nblocks = nblocks;
 	args->dataA = dataA;
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	/* inject a new task with this codelet into the system */ 
 	struct starpu_task *task = starpu_task_create();
@@ -685,9 +685,9 @@ void dw_codelet_facto_v2(starpu_data_handle_t dataA, unsigned nblocks)
 
 	starpu_task_wait_for_all();
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 

+ 6 - 6
examples/heat/dw_factolu_grain.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011, 2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -345,18 +345,18 @@ void dw_factoLU_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks,
 	memcpy(Asaved, matA, ld*ld*sizeof(float));
 #endif
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	/* schedule the codelet */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	/* that's only ok for powers of 2 yet ! */
 	dw_factoLU_grain_inner(matA, size, (size/nblocks) * nbigblocks, ld, size/nblocks, 0);
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 

+ 6 - 6
examples/heat/dw_factolu_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011, 2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -222,8 +222,8 @@ static void dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 {
 	int ret;
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	struct starpu_task *entry_task = NULL;
 
@@ -261,7 +261,7 @@ static void dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 	}
 
 	/* schedule the codelet */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	ret = starpu_task_submit(entry_task);
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
@@ -274,9 +274,9 @@ static void dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 	/* stall the application until the end of computations */
 	starpu_tag_wait(TAG11(nblocks-1));
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	printf("%2.2f\n", timing/1000);
 

+ 5 - 6
examples/incrementer/incrementer.c

@@ -80,10 +80,10 @@ int main(int argc, char **argv)
 		.name = "increment"
 	};
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	unsigned i;
 	for (i = 0; i < niter; i++)
@@ -109,7 +109,7 @@ int main(int argc, char **argv)
 	/* update the array in RAM */
 	starpu_data_unregister(float_array_handle);
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
 	FPRINTF(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
                 float_array[1], float_array[2], float_array[3]);
@@ -120,8 +120,7 @@ int main(int argc, char **argv)
 		ret = 1;
 	}
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
-					(end.tv_usec - start.tv_usec));
+	double timing = end - start;
 
 	FPRINTF(stderr, "%u elems took %f ms\n", niter, timing/1000);
 

+ 6 - 6
examples/lu/lu_example.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -346,16 +346,16 @@ int main(int argc, char **argv)
 		}
 		else
 		{
-			struct timeval start;
-			struct timeval end;
+			double start;
+			double end;
 
-			gettimeofday(&start, NULL);
+			start = starpu_timing_now();
 
 			ret = STARPU_LU(lu_decomposition_pivot)(A, ipiv, size, size, nblocks);
 
-			gettimeofday(&end, NULL);
+			end = starpu_timing_now();
 
-			double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+			double timing = end - start;
 
 			unsigned n = size;
 			double flop = (2.0f*n*n*n)/3.0f;

+ 5 - 5
examples/lu/xlu.c

@@ -170,8 +170,8 @@ static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, un
 static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 {
 	int ret;
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	struct starpu_task *entry_task = NULL;
 
@@ -213,7 +213,7 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 	}
 
 	/* schedule the codelet */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	ret = starpu_task_submit(entry_task);
 	if (ret == -ENODEV) return ret;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
@@ -221,9 +221,9 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 	/* stall the application until the end of computations */
 	starpu_tag_wait(TAG11(nblocks-1));
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 

+ 5 - 5
examples/lu/xlu_implicit.c

@@ -110,14 +110,14 @@ static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, un
 
 static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 	int ret;
 
 	/* create all the DAG nodes */
 	unsigned i,j,k;
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	for (k = 0; k < nblocks; k++)
 	{
@@ -142,9 +142,9 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 	/* stall the application until the end of computations */
 	starpu_task_wait_for_all();
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 

+ 7 - 6
examples/lu/xlu_implicit_pivot.c

@@ -155,14 +155,15 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 				  starpu_data_handle_t (* get_block)(starpu_data_handle_t *, unsigned, unsigned, unsigned),
 				  double *timing)
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 	int ret;
 
-	gettimeofday(&start, NULL);
-
 	/* create all the DAG nodes */
 	unsigned i,j,k;
+
+	start = starpu_timing_now();
+
 	for (k = 0; k < nblocks; k++)
 	{
 	     ret = create_task_11_pivot(dataAp, nblocks, k, piv_description, get_block);
@@ -196,9 +197,9 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 	/* stall the application until the end of computations */
 	starpu_task_wait_for_all();
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	*timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	*timing = end - start;
 	return 0;
 }
 

+ 5 - 5
examples/lu/xlu_pivot.c

@@ -232,8 +232,8 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 {
 	int ret;
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	struct starpu_task *entry_task = NULL;
 
@@ -298,7 +298,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 	}
 
 	/* schedule the codelet */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	ret = starpu_task_submit(entry_task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
@@ -307,9 +307,9 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 /*	starpu_task_wait_for_all(); */
 	free(tags);
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	*timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	*timing = end - start;
 	return 0;
 }
 

+ 5 - 5
examples/mandelbrot/mandelbrot.c

@@ -506,10 +506,10 @@ int main(int argc, char **argv)
 
 	unsigned iter = 0;
 
-	struct timeval start, end;
+	double start, end;
 
 	if (demo)
-		gettimeofday(&start, NULL);
+		start = starpu_timing_now();
 
 	while (niter-- != 0)
 	{
@@ -573,15 +573,15 @@ int main(int argc, char **argv)
 				topY = -49.35016705749115;
 				bottomY = 49.64891691946615;
 
-				gettimeofday(&end, NULL);
-				double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+				end = starpu_timing_now();
+				double timing = end - start;
 
 				fprintf(stderr, "Time to generate %u frames : %f s\n", iter, timing/1000000.0);
 				fprintf(stderr, "Average FPS: %f\n", ((double)iter*1e+6)/timing);
 
 				/* Reset counters */
 				iter = 0;
-				gettimeofday(&start, NULL);
+				start = starpu_timing_now();
 			}
 			else
 			{

+ 5 - 2
examples/pi/SobolQRNG/sobol_gold.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -55,7 +55,6 @@
 #include <stdlib.h>
 #include <math.h>
 #include <string.h>
-#include <strings.h>
 
 #include "sobol.h"
 #include "sobol_gold.h"
@@ -63,6 +62,10 @@
 
 #define k_2powneg32 2.3283064E-10F
 
+#if defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MINGW32__)
+#define ffs(arg) _bit_scan_forward(arg)
+#endif
+
 /* Create the direction numbers, based on the primitive polynomials. */
 void initSobolDirectionVectors(int n_dimensions, unsigned int *directions)
 {

+ 1 - 1
examples/pi/SobolQRNG/sobol_primitives.c

@@ -66,7 +66,7 @@
 const struct primitive sobol_primitives[] =
 {
     /* First dimension is a special case so this entry is actually ignored */
-    {1, 0, 0, {}},
+    {1, 0, 0, 0},
     {2, 1, 0, {1}},
     {3, 2, 1, {1, 3}},
     {4, 3, 1, {1, 3, 1}},

+ 6 - 6
examples/pi/pi.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -156,10 +156,10 @@ int main(int argc, char **argv)
 	
 	starpu_data_partition(cnt_array_handle, &f);
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	for (i = 0; i < ntasks; i++)
 	{
@@ -188,9 +188,9 @@ int main(int argc, char **argv)
 	for (i = 0; i < ntasks; i++)
 		total_cnt += cnt_array[i];
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 
 	unsigned long total_shot_cnt = ntasks * nshot_per_task;
 

+ 5 - 5
examples/pi/pi_redux.c

@@ -340,8 +340,8 @@ int main(int argc, char **argv)
 	starpu_data_set_reduction_methods(shot_cnt_handle,
 					&redux_codelet, &init_codelet);
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	for (i = 0; i < ntasks_warmup; i++)
 	{
@@ -357,7 +357,7 @@ int main(int argc, char **argv)
 	}
 
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	for (i = 0; i < ntasks; i++)
 	{
@@ -375,8 +375,8 @@ int main(int argc, char **argv)
 	starpu_data_unregister(shot_cnt_handle);
 	starpu_data_unregister(xy_scratchpad_handle);
 
-	gettimeofday(&end, NULL);
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	end = starpu_timing_now();
+	double timing = end - start;
 	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4,
 	 * probability to impact the disk: pi/4 */
 	unsigned long total = (ntasks + ntasks_warmup)*nshot_per_task;

+ 10 - 10
examples/ppm_downscaler/yuv_downscaler.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -27,13 +27,13 @@
 
 #include "yuv_downscaler.h"
 
-struct timeval start;
-struct timeval end;
+static double start;
+static double end;
 
-const char *filename_in_default = "hugefile.2s.yuv";
-const char *filename_out_default = "hugefile.2s.out.yuv";
-char filename_in[1024];
-char filename_out[1024];
+static const char *filename_in_default = "hugefile.2s.yuv";
+static const char *filename_out_default = "hugefile.2s.out.yuv";
+static char filename_in[1024];
+static char filename_out[1024];
 
 void parse_args(int argc, char **argv)
 {
@@ -206,7 +206,7 @@ int main(int argc, char **argv)
 	unsigned ntasks = (nblocks_y + 2*nblocks_uv)*nframes;
 
 	fprintf(stderr, "Start computation: there will be %u tasks for %u frames\n", ntasks, nframes);
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	/* do the computation */
 	for (frame = 0; frame < nframes; frame++)
@@ -275,9 +275,9 @@ int main(int argc, char **argv)
 	/* There is an implicit barrier: the unregister methods will block
 	 * until the computation is done and that the result was put back into
 	 * memory. */
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	fprintf(stderr, "Computation took %f seconds\n", timing/1000000);
 	fprintf(stderr, "FPS %f\n", (1000000*nframes)/timing);
 

+ 2 - 2
examples/sched_ctx/sched_ctx_without_sched_policy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -18,7 +18,7 @@
 #include <starpu.h>
 #include <omp.h>
 
-#ifdef STARPU_QUICK_CHECK
+#ifndef STARPU_QUICK_CHECK
 #define NTASKS 64
 #else
 #define NTASKS 10

+ 18 - 18
examples/spmv/dw_block_spmv.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,25 +21,25 @@
 #include "matrix_market/mm_to_bcsr.h"
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
-struct timeval start;
-struct timeval end;
+static double start;
+static double end;
 
-sem_t sem;
+static sem_t sem;
 
-unsigned c = 256;
-unsigned r = 256;
+static unsigned c = 256;
+static unsigned r = 256;
 
-unsigned remainingtasks = -1;
+static unsigned remainingtasks = -1;
 
-starpu_data_handle_t sparse_matrix;
-starpu_data_handle_t vector_in, vector_out;
+static starpu_data_handle_t sparse_matrix;
+static starpu_data_handle_t vector_in, vector_out;
 
-uint32_t size;
-char *inputfile;
-bcsr_t *bcsr_matrix;
+static uint32_t size;
+static char *inputfile;
+static bcsr_t *bcsr_matrix;
 
-float *vector_in_ptr;
-float *vector_out_ptr;
+static float *vector_in_ptr;
+static float *vector_out_ptr;
 
 void create_data(void)
 {
@@ -96,7 +96,7 @@ void init_problem_callback(void *arg)
 	if ( val == 0 )
 	{
 		printf("DONE ...\n");
-		gettimeofday(&end, NULL);
+		end = starpu_timing_now();
 
 /*		starpu_data_unpartition(sparse_matrix, STARPU_MAIN_RAM); */
 		starpu_data_unpartition(vector_out, STARPU_MAIN_RAM);
@@ -181,7 +181,7 @@ void launch_spmv_codelets(void)
 	uint32_t *rowptr = starpu_bcsr_get_local_rowptr(sparse_matrix);
 	uint32_t *colind = starpu_bcsr_get_local_colind(sparse_matrix);
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	unsigned loop;
 	for (loop = 0; loop < NSPMV; loop++)
@@ -318,7 +318,7 @@ int main(STARPU_ATTRIBUTE_UNUSED int argc,
 
 	double totalflop = 2.0*c*r*totaltasks;
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 	FPRINTF(stderr, "Flop %e\n", totalflop);

+ 4 - 4
examples/spmv/spmv.c

@@ -115,7 +115,7 @@ int main(int argc, char **argv)
 	int ret;
 	unsigned part;
 	double timing;
-	struct timeval start, end;
+	double start, end;
 	unsigned row, pos;
 	unsigned ind;
 
@@ -213,7 +213,7 @@ int main(int argc, char **argv)
 	compile_spmv_opencl_kernel();
 #endif
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	/*
 	 *	Create and submit StarPU tasks
@@ -236,7 +236,7 @@ int main(int argc, char **argv)
 	}
 
 	starpu_task_wait_for_all();
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
 	/*
 	 *	Unregister the CSR matrix and the output vector
@@ -270,7 +270,7 @@ int main(int argc, char **argv)
 	 */
 	starpu_shutdown();
 
-	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 

+ 2 - 0
include/pthread_win32/pthread.h

@@ -254,7 +254,9 @@ typedef pthread_mutex_t pthread_rwlock_t;
 typedef int pthread_rwlockattr_t;
 #define pthread_rwlock_init(lock, attr) pthread_mutex_init(lock, NULL)
 #define pthread_rwlock_wrlock(lock) pthread_mutex_lock(lock)
+#define pthread_rwlock_trywrlock(lock) pthread_mutex_trylock(lock)
 #define pthread_rwlock_rdlock(lock) pthread_mutex_lock(lock)
+#define pthread_rwlock_tryrdlock(lock) pthread_mutex_trylock(lock)
 #define pthread_rwlock_unlock(lock) pthread_mutex_unlock(lock)
 #define pthread_rwlock_destroy(lock) pthread_mutex_destroy(lock)
 

+ 0 - 8
include/starpu_config.h.in

@@ -109,14 +109,6 @@ typedef ssize_t starpu_ssize_t;
 #  define __starpu_inline __inline__
 #endif
 
-#ifdef _MSC_VER
-struct timespec
-{
-  time_t  tv_sec;  /* Seconds */
-  long    tv_nsec; /* Nanoseconds */
-};
-#endif
-
 #undef STARPU_QUICK_CHECK
 #undef STARPU_USE_DRAND48
 #undef STARPU_USE_ERAND48_R

+ 2 - 0
include/starpu_data_interfaces.h

@@ -37,6 +37,8 @@ extern "C"
 
 struct starpu_data_copy_methods
 {
+	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+
 	int (*ram_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_opencl)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);

+ 4 - 1
include/starpu_profiling.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -21,6 +21,7 @@
 #include <starpu.h>
 #include <errno.h>
 #include <time.h>
+#include <sys/time.h>
 
 #ifdef __cplusplus
 extern "C"
@@ -87,6 +88,7 @@ int starpu_profiling_status_get(void);
 
 #ifdef BUILDING_STARPU
 #include <common/utils.h>
+#ifdef __GNUC__
 extern int _starpu_profiling;
 #define starpu_profiling_status_get() ({ \
 	int __ret; \
@@ -96,6 +98,7 @@ extern int _starpu_profiling;
 	__ret; \
 })
 #endif
+#endif
 
 int starpu_profiling_worker_get_info(int workerid, struct starpu_profiling_worker_info *worker_info);
 

+ 2 - 2
include/starpu_task.h

@@ -166,12 +166,12 @@ struct starpu_task
 	unsigned destroy:1;
 	unsigned regenerate:1;
 
+	unsigned workerid;
+
 	unsigned scheduled:1;
 
 	unsigned int mf_skip:1;
 
-	unsigned workerid;
-
 	int priority;
 
 	enum starpu_task_status status;

+ 7 - 7
include/starpu_thread.h

@@ -28,7 +28,7 @@ extern "C"
 #ifdef STARPU_SIMGRID
 #include <xbt/synchro_core.h>
 #include <msg/msg.h>
-#elif !defined(_MSC_VER)
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU)
 #include <pthread.h>
 #endif
 #include <stdint.h>
@@ -50,7 +50,7 @@ int starpu_pthread_attr_init(starpu_pthread_attr_t *attr);
 int starpu_pthread_attr_destroy(starpu_pthread_attr_t *attr);
 int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachstate);
 
-#elif !defined(_MSC_VER) /* STARPU_SIMGRID */
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* STARPU_SIMGRID */
 
 typedef pthread_t starpu_pthread_t;
 typedef pthread_attr_t starpu_pthread_attr_t;
@@ -85,7 +85,7 @@ int starpu_pthread_mutexattr_settype(starpu_pthread_mutexattr_t *attr, int type)
 int starpu_pthread_mutexattr_destroy(starpu_pthread_mutexattr_t *attr);
 int starpu_pthread_mutexattr_init(starpu_pthread_mutexattr_t *attr);
 
-#elif !defined(_MSC_VER) /* !STARPU_SIMGRID */
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
 
 typedef pthread_mutex_t starpu_pthread_mutex_t;
 typedef pthread_mutexattr_t starpu_pthread_mutexattr_t;
@@ -116,7 +116,7 @@ int starpu_pthread_key_delete(starpu_pthread_key_t key);
 int starpu_pthread_setspecific(starpu_pthread_key_t key, const void *pointer);
 void *starpu_pthread_getspecific(starpu_pthread_key_t key);
 
-#elif !defined(_MSC_VER) /* STARPU_SIMGRID */
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
 
 typedef pthread_key_t starpu_pthread_key_t;
 
@@ -144,7 +144,7 @@ int starpu_pthread_cond_wait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t
 int starpu_pthread_cond_timedwait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex, const struct timespec *abstime);
 int starpu_pthread_cond_destroy(starpu_pthread_cond_t *cond);
 
-#elif !defined(_MSC_VER) /* STARPU_SIMGRID */
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
 
 typedef pthread_cond_t starpu_pthread_cond_t;
 typedef pthread_condattr_t starpu_pthread_condattr_t;
@@ -178,7 +178,7 @@ int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock);
 
-#elif !defined(_MSC_VER) /* STARPU_SIMGRID */
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
 
 typedef pthread_rwlock_t starpu_pthread_rwlock_t;
 typedef pthread_rwlockattr_t starpu_pthread_rwlockattr_t;
@@ -270,7 +270,7 @@ typedef pthread_spinlock_t starpu_pthread_spinlock_t;
  * Other needed pthread definitions
  */
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(BUILDING_STARPU)
 typedef void* starpu_pthread_rwlock_t;
 typedef void* starpu_pthread_mutex_t;
 typedef void* starpu_pthread_cond_t;

+ 5 - 5
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -67,8 +67,8 @@ static struct starpu_codelet cl22 =
  */
 void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing, double *flops)
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 	starpu_data_handle_t **data_handles;
 	unsigned x,y,i,j,k;
 
@@ -104,7 +104,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	}
 
 	starpu_mpi_barrier(MPI_COMM_WORLD);
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	for (k = 0; k < nblocks; k++)
 	{
@@ -161,11 +161,11 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	free(data_handles);
 
 	starpu_mpi_barrier(MPI_COMM_WORLD);
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
 	if (rank == 0)
 	{
-		*timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+		*timing = end - start;
 		*flops = (1.0f*size*size*size)/3.0f;
 	}
 }

+ 2 - 2
mpi/examples/mpi_lu/plu_outofcore_example.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -128,7 +128,7 @@ static void create_matrix()
 
 	filename = malloc(filename_length);
 
-	allocated_memory += nblocks*nblocks*blocksize*sizeof(TYPE *);
+	allocated_memory += nblocks*nblocks*blocksize;
 
 	/* Create the whole matrix on the disk */
 	unsigned i,j;

+ 6 - 6
mpi/examples/mpi_lu/pxlu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -821,8 +821,8 @@ static void wait_termination(void)
 
 double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	nblocks = _nblocks;
 	rank = _rank;
@@ -854,15 +854,15 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 
 	/* schedule the codelet */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	starpu_tag_notify_from_apps(STARPU_TAG_INIT);
 
 	wait_termination();
 	
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	
 //	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
 	

+ 5 - 5
mpi/examples/mpi_lu/pxlu_implicit.c

@@ -115,8 +115,8 @@ static void create_task_22(unsigned k, unsigned i, unsigned j)
 
 double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	nblocks = _nblocks;
 	rank = _rank;
@@ -127,7 +127,7 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	for (k = 0; k < nblocks; k++)
 	{
@@ -160,9 +160,9 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	
 //	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
 	

+ 4 - 4
mpi/src/starpu_mpi_task_insert.c

@@ -422,11 +422,11 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 		}
 		else if (arg_type==STARPU_CALLBACK)
 		{
-			va_arg(varg_list_copy, void (*)(void *));
+			va_arg(varg_list_copy, _starpu_callback_func_t);
 		}
 		else if (arg_type==STARPU_CALLBACK_WITH_ARG)
 		{
-			va_arg(varg_list_copy, void (*)(void *));
+			va_arg(varg_list_copy, _starpu_callback_func_t);
 			va_arg(varg_list_copy, void *);
 		}
 		else if (arg_type==STARPU_CALLBACK_ARG)
@@ -566,11 +566,11 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, struct starpu_codelet *codelet,
 		}
 		else if (arg_type==STARPU_CALLBACK)
 		{
-			va_arg(varg_list_copy, void (*)(void *));
+			va_arg(varg_list_copy, _starpu_callback_func_t);
 		}
 		else if (arg_type==STARPU_CALLBACK_WITH_ARG)
 		{
-			va_arg(varg_list_copy, void (*)(void *));
+			va_arg(varg_list_copy, _starpu_callback_func_t);
 			va_arg(varg_list_copy, void *);
 		}
 		else if (arg_type==STARPU_CALLBACK_ARG)

+ 5 - 1
src/common/fxt.c

@@ -52,6 +52,10 @@ uint64_t fut_getstamp(void)
 
 long _starpu_gettid(void)
 {
+	/* TODO: test at configure whether __thread is available, and use that
+	 * to cache the value.
+	 * Don't use the TSD, this is getting called before we would have the
+	 * time to allocate it.  */
 #ifdef STARPU_SIMGRID
 	return (uintptr_t) MSG_process_self();
 #else
@@ -61,7 +65,7 @@ long _starpu_gettid(void)
 	long tid;
 	thr_self(&tid);
 	return tid;
-#elif defined(__MINGW32__)
+#elif defined(_WIN32) && !defined(__CYGWIN__)
 	return (long) GetCurrentThreadId();
 #else
 	return (long) pthread_self();

+ 2 - 2
src/common/fxt.h

@@ -410,8 +410,8 @@ do {									\
 #define _STARPU_TRACE_WORKER_INIT_START(workerkind, workerid, devid, memnode)	\
 	FUT_DO_PROBE5(_STARPU_FUT_WORKER_INIT_START, workerkind, workerid, devid, memnode, _starpu_gettid());
 
-#define _STARPU_TRACE_WORKER_INIT_END(workerid)				\
-	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (workerid));
+#define _STARPU_TRACE_WORKER_INIT_END(__workerid)				\
+	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (__workerid));
 
 #define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype, workerid)				\
 do {									\

+ 6 - 2
src/common/thread.c

@@ -131,13 +131,17 @@ int starpu_pthread_mutex_unlock(starpu_pthread_mutex_t *mutex)
 
 int starpu_pthread_mutex_trylock(starpu_pthread_mutex_t *mutex)
 {
+	int ret;
 	_STARPU_TRACE_TRYLOCK_MUTEX();
 
-	xbt_mutex_acquire(*mutex);
+	/* TODO: use what simgrid will provide some day */
+	/* xbt_mutex_try_acquire(*mutex); */
+	ret = simcall_mutex_trylock((smx_mutex_t)*mutex);
+	ret = ret ? 0 : -EBUSY;
 
 	_STARPU_TRACE_MUTEX_LOCKED();
 
-	return 0;
+	return ret;
 }
 
 int starpu_pthread_mutexattr_gettype(const starpu_pthread_mutexattr_t *attr, int *type)

+ 3 - 2
src/common/timing.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -143,10 +143,11 @@ void _starpu_timing_init(void)
 
 	{
 		struct timeval tv1,tv2;
+		struct timespec ts = { .tv_sec = 0, .tv_nsec = 500000000UL };
 
 		STARPU_GET_TICK(t1);
 		gettimeofday(&tv1,0);
-		usleep(500000);
+		_starpu_sleep(&ts);
 		STARPU_GET_TICK(t2);
 		gettimeofday(&tv2,0);
 		_starpu_scale = ((tv2.tv_sec*1e6 + tv2.tv_usec) -

+ 88 - 3
src/common/utils.c

@@ -18,15 +18,34 @@
 #include <starpu.h>
 #include <common/config.h>
 #include <common/utils.h>
-#include <libgen.h>
 #include <errno.h>
 #include <unistd.h>
+#include <fcntl.h>
 
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 #include <io.h>
+#include <sys/locking.h>
 #define mkdir(path, mode) mkdir(path)
 #endif
 
+#if defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MINGW32__)
+#include <direct.h>
+static char * dirname(char * path)
+{
+   char drive[_MAX_DRIVE];
+   char dir[_MAX_DIR];
+   /* Remove trailing slash */
+   while (strlen(path) > 0 && (*(path+strlen(path)-1) == '/' || *(path+strlen(p
+th)-1) == '\\'))
+      *(path+strlen(path)-1) = '\0';
+   _splitpath(path, drive, dir, NULL, NULL);
+   _makepath(path, drive, dir, NULL, NULL);
+   return path;
+}
+#else
+#include <libgen.h>
+#endif
+
 /* Function with behaviour like `mkdir -p'. This function was adapted from
  * http://niallohiggins.com/2009/01/08/mkpath-mkdir-p-alike-in-c-for-unix/ */
 
@@ -38,7 +57,7 @@ int _starpu_mkpath(const char *s, mode_t mode)
 
 	rv = -1;
 	if (strcmp(s, ".") == 0 || strcmp(s, "/") == 0
-#ifdef __MINGW32__
+#if defined(_WIN32)
 		/* C:/ or C:\ */
 		|| (s[0] && s[1] == ':' && (s[2] == '/' || s[2] == '\\') && !s[3])
 #endif
@@ -102,6 +121,72 @@ void _starpu_mkpath_and_check(const char *path, mode_t mode)
 	}
 }
 
+int _starpu_ftruncate(FILE *file)
+{
+	return ftruncate(fileno(file), 0);
+}
+
+int _starpu_frdlock(FILE *file)
+{
+#if defined(_WIN32) && !defined(__CYGWIN__)
+	int ret;
+	do {
+		ret = _locking(fileno(file), _LK_RLCK, 10);
+	} while (ret == EDEADLOCK);
+	return ret;
+#else
+	struct flock lock = {
+		.l_type = F_RDLCK,
+		.l_whence = SEEK_SET,
+		.l_start = 0,
+		.l_len = 0
+	};
+	return fcntl(fileno(file), F_SETLKW, &lock);
+#endif
+}
+
+int _starpu_frdunlock(FILE *file)
+{
+#if defined(_WIN32) && !defined(__CYGWIN__)
+#  ifndef _LK_UNLCK
+#    define _LK_UNLCK _LK_UNLOCK
+#  endif
+	return _locking(fileno(file), _LK_UNLCK, 10);
+#else
+	struct flock lock = {
+		.l_type = F_UNLCK,
+		.l_whence = SEEK_SET,
+		.l_start = 0,
+		.l_len = 0
+	};
+	return fcntl(fileno(file), F_SETLKW, &lock);
+#endif
+}
+
+int _starpu_fwrlock(FILE *file)
+{
+#if defined(_WIN32) && !defined(__CYGWIN__)
+	int ret;
+	do {
+		ret = _locking(fileno(file), _LK_LOCK, 10);
+	} while (ret == EDEADLOCK);
+	return ret;
+#else
+	struct flock lock = {
+		.l_type = F_WRLCK,
+		.l_whence = SEEK_SET,
+		.l_start = 0,
+		.l_len = 0
+	};
+	return fcntl(fileno(file), F_SETLKW, &lock);
+#endif
+}
+
+int _starpu_fwrunlock(FILE *file)
+{
+	return _starpu_frdunlock(file);
+}
+
 int _starpu_check_mutex_deadlock(starpu_pthread_mutex_t *mutex)
 {
 	int ret;

+ 9 - 0
src/common/utils.h

@@ -101,10 +101,19 @@
 	} while (0)
 
 
+#ifdef _MSC_VER
+#define _STARPU_IS_ZERO(a) (a == 0.0)
+#else
 #define _STARPU_IS_ZERO(a) (fpclassify(a) == FP_ZERO)
+#endif
 
 int _starpu_mkpath(const char *s, mode_t mode);
 void _starpu_mkpath_and_check(const char *s, mode_t mode);
+int _starpu_ftruncate(FILE *file);
+int _starpu_frdlock(FILE *file);
+int _starpu_frdunlock(FILE *file);
+int _starpu_fwrlock(FILE *file);
+int _starpu_fwrunlock(FILE *file);
 char *_starpu_get_home_path(void);
 void _starpu_gethostname(char *hostname, size_t size);
 

+ 1 - 1
src/core/combined_workers.c

@@ -25,7 +25,7 @@
 #include <sched.h>
 #endif
 
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 #include <windows.h>
 #endif
 

+ 26 - 2
src/core/perfmodel/perfmodel_bus.c

@@ -783,6 +783,8 @@ static void load_bus_affinity_file_content(void)
 	f = fopen(path, "r");
 	STARPU_ASSERT(f);
 
+	_starpu_frdlock(f);
+
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	ncpus = _starpu_topology_get_nhwcpu(config);
         unsigned gpu;
@@ -835,6 +837,7 @@ static void load_bus_affinity_file_content(void)
 		STARPU_ASSERT(ret == 0);
 	}
 #endif /* !STARPU_USE_OPENCL */
+	_starpu_frdunlock(f);
 
 	fclose(f);
 #endif /* !(STARPU_USE_CUDA_ || STARPU_USE_OPENCL */
@@ -862,6 +865,7 @@ static void write_bus_affinity_file_content(void)
 		STARPU_ABORT();
 	}
 
+	_starpu_frdlock(f);
 	unsigned cpu;
         unsigned gpu;
 
@@ -897,6 +901,7 @@ static void write_bus_affinity_file_content(void)
 	}
 #endif
 
+	_starpu_frdunlock(f);
 	fclose(f);
 #endif
 }
@@ -1006,6 +1011,7 @@ static int load_bus_latency_file_content(void)
 		fflush(stderr);
 		STARPU_ABORT();
 	}
+	_starpu_frdlock(f);
 
 	for (src = 0; src < STARPU_MAXNODES; src++)
 	{
@@ -1073,13 +1079,14 @@ static int load_bus_latency_file_content(void)
 			break;
 		ungetc(n, f);
 	}
+	_starpu_frdunlock(f);
+	fclose(f);
 
 	/* No more values, take NAN */
 	for ( ; src < STARPU_MAXNODES; src++)
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 			latency_matrix[src][dst] = NAN;
 
-	fclose(f);
 	return 1;
 }
 
@@ -1104,6 +1111,8 @@ static void write_bus_latency_file_content(void)
 		fflush(stderr);
 		STARPU_ABORT();
 	}
+	_starpu_fwrlock(f);
+	_starpu_ftruncate(f);
 
 	fprintf(f, "# ");
 	for (dst = 0; dst < STARPU_MAXNODES; dst++)
@@ -1163,6 +1172,7 @@ static void write_bus_latency_file_content(void)
 
 		fprintf(f, "\n");
 	}
+	_starpu_fwrunlock(f);
 
 	fclose(f);
 }
@@ -1223,6 +1233,7 @@ static int load_bus_bandwidth_file_content(void)
 		fflush(stderr);
 		STARPU_ABORT();
 	}
+	_starpu_frdlock(f);
 
 	for (src = 0; src < STARPU_MAXNODES; src++)
 	{
@@ -1290,13 +1301,14 @@ static int load_bus_bandwidth_file_content(void)
 			break;
 		ungetc(n, f);
 	}
+	_starpu_frdunlock(f);
+	fclose(f);
 
 	/* No more values, take NAN */
 	for ( ; src < STARPU_MAXNODES; src++)
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 			latency_matrix[src][dst] = NAN;
 
-	fclose(f);
 	return 1;
 }
 
@@ -1316,6 +1328,9 @@ static void write_bus_bandwidth_file_content(void)
 	f = fopen(path, "w+");
 	STARPU_ASSERT(f);
 
+	_starpu_fwrlock(f);
+	_starpu_ftruncate(f);
+
 	fprintf(f, "# ");
 	for (dst = 0; dst < STARPU_MAXNODES; dst++)
 		fprintf(f, "to %d\t\t", dst);
@@ -1387,6 +1402,7 @@ static void write_bus_bandwidth_file_content(void)
 		fprintf(f, "\n");
 	}
 
+	_starpu_fwrunlock(f);
 	fclose(f);
 }
 #endif /* STARPU_SIMGRID */
@@ -1551,6 +1567,7 @@ static void check_bus_config_file(void)
                 // Loading configuration from file
                 f = fopen(path, "r");
                 STARPU_ASSERT(f);
+		_starpu_frdlock(f);
                 _starpu_drop_comments(f);
                 ret = fscanf(f, "%u\t", &read_cpus);
 		STARPU_ASSERT(ret == 1);
@@ -1565,6 +1582,7 @@ static void check_bus_config_file(void)
 		if (ret == 0)
 			read_mic = 0;
                 _starpu_drop_comments(f);
+		_starpu_frdunlock(f);
                 fclose(f);
 
                 // Loading current configuration
@@ -1619,6 +1637,8 @@ static void write_bus_config_file_content(void)
 
         f = fopen(path, "w+");
 	STARPU_ASSERT(f);
+	_starpu_fwrlock(f);
+	_starpu_ftruncate(f);
 
         fprintf(f, "# Current configuration\n");
         fprintf(f, "%u # Number of CPUs\n", ncpus);
@@ -1626,6 +1646,7 @@ static void write_bus_config_file_content(void)
         fprintf(f, "%d # Number of OpenCL devices\n", nopencl);
         fprintf(f, "%d # Number of MIC devices\n", nmic);
 
+	_starpu_fwrunlock(f);
         fclose(f);
 }
 
@@ -1664,6 +1685,8 @@ static void write_bus_platform_file_content(void)
 		fflush(stderr);
 		STARPU_ABORT();
 	}
+	_starpu_fwrlock(f);
+	_starpu_ftruncate(f);
 
 	fprintf(f,
 "<?xml version='1.0'?>\n"
@@ -1810,6 +1833,7 @@ static void write_bus_platform_file_content(void)
 " </platform>\n"
 		);
 
+	_starpu_fwrunlock(f);
 	fclose(f);
 }
 

+ 19 - 3
src/core/perfmodel/perfmodel_history.c

@@ -16,9 +16,11 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <dirent.h>
 #include <unistd.h>
+#if !defined(_WIN32) || defined(__MINGW__) || defined(__CYGWIN__)
+#include <dirent.h>
 #include <sys/stat.h>
+#endif
 #include <errno.h>
 #include <common/config.h>
 #include <common/utils.h>
@@ -653,10 +655,10 @@ static void initialize_model_with_file(FILE*f, struct starpu_perfmodel *model)
 
 void starpu_perfmodel_init(struct starpu_perfmodel *model)
 {
-	STARPU_ASSERT(model && model->symbol);
-
 	int already_init;
 
+	STARPU_ASSERT(model);
+
 	STARPU_PTHREAD_RWLOCK_RDLOCK(&registered_models_rwlock);
 	already_init = model->is_init;
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&registered_models_rwlock);
@@ -834,7 +836,10 @@ static void save_history_based_model(struct starpu_perfmodel *model)
 	f = fopen(path, "w+");
 	STARPU_ASSERT_MSG(f, "Could not save performance model %s\n", path);
 
+	_starpu_fwrlock(f);
+	_starpu_ftruncate(f);
 	dump_model_file(f, model);
+	_starpu_fwrunlock(f);
 
 	fclose(f);
 }
@@ -1009,7 +1014,9 @@ void _starpu_load_history_based_model(struct starpu_perfmodel *model, unsigned s
 				f = fopen(path, "r");
 				STARPU_ASSERT(f);
 
+				_starpu_frdlock(f);
 				parse_model_file(f, model, scan_history);
+				_starpu_frdunlock(f);
 
 				fclose(f);
 			}
@@ -1038,6 +1045,7 @@ void starpu_perfmodel_directory(FILE *output)
  * the performance model files */
 int starpu_perfmodel_list(FILE *output)
 {
+#if !defined(_WIN32) || defined(__MINGW__) || defined(__CYGWIN__)
         char path[256];
         DIR *dp;
         struct dirent *ep;
@@ -1061,6 +1069,10 @@ int starpu_perfmodel_list(FILE *output)
 		_STARPU_DISP("Could not open the perfmodel directory <%s>: %s\n", path, strerror(errno));
         }
 	return 0;
+#else
+	fprintf(stderr,"Listing perfmodels is not implemented on pure Windows yet\n");
+	return 1;
+#endif
 }
 
 /* This function is intended to be used by external tools that should read the
@@ -1099,10 +1111,12 @@ int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *mo
 	FILE *f = fopen(path, "r");
 	STARPU_ASSERT(f);
 
+	_starpu_frdlock(f);
 	starpu_perfmodel_init_with_file(f, model);
 	rewind(f);
 
 	parse_model_file(f, model, 1);
+	_starpu_frdunlock(f);
 
 	STARPU_ASSERT(fclose(f) == 0);
 
@@ -1412,6 +1426,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			_STARPU_DISP("Error <%s> when opening file <%s>\n", strerror(errno), per_arch_model->debug_path);
 			STARPU_ABORT();
 		}
+		_starpu_fwrlock(f);
 
 		if (!j->footprint_is_computed)
 			(void) _starpu_compute_buffers_footprint(model, arch, nimpl, j);
@@ -1431,6 +1446,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			handle->ops->display(handle, f);
 		}
 		fprintf(f, "\n");
+		_starpu_fwrunlock(f);
 		fclose(f);
 #endif
 		STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);

+ 8 - 2
src/core/perfmodel/perfmodel_nan.c

@@ -21,7 +21,7 @@
 #include <stdlib.h>
 #include <math.h>
 #include <string.h>
-#include <config.h>
+#include <common/config.h>
 #include <core/perfmodel/perfmodel.h>
 #include <ctype.h>
 
@@ -54,8 +54,14 @@ int _starpu_read_double(FILE *f, char *format, double *val)
 	     int x3 = getc(f);
 	     if (x2 == 'a' && x3 == 'n')
 	     {
+#ifdef _MSC_VER
+		     unsigned long long _mynan = 0x7fffffffffffffffull;
+		     double mynan = *(double*)&_mynan;
+#else
+		     double mynan = NAN;
+#endif
 		     _starpu_read_spaces(f);
-		     *val = NAN;
+		     *val = mynan;
 		     return 1;
 	     }
 	     else

+ 2 - 2
src/core/sched_ctx.c

@@ -679,10 +679,10 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 		_starpu_delete_sched_ctx(sched_ctx);
 
 	}
+	STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[sched_ctx_id]);
 	/* workerids is malloc-ed in starpu_sched_ctx_get_workers_list, don't forget to free it when
 	   you don't use it anymore */
 	free(workerids);
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[sched_ctx_id]);
 	_starpu_relock_mutex_if_prev_locked();
 	return;
 }
@@ -1613,7 +1613,7 @@ void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid STARPU_ATTRIBU
 		STARPU_ABORT();
 	}
 
-#elif defined(__MINGW32__) || defined(__CYGWIN__)
+#elif defined(_WIN32)
 	DWORD mask = 1 << cpuid;
 	if (!SetThreadAffinityMask(GetCurrentThread(), mask))
 	{

+ 3 - 0
src/core/simgrid.c

@@ -217,6 +217,7 @@ void _starpu_simgrid_init()
 		/* Get XML platform */
 		_starpu_simgrid_get_platform_path(path, sizeof(path));
 		in = fopen(path, "r");
+		_starpu_frdlock(in);
 		STARPU_ASSERT_MSG(in, "Could not open platform file %s", path);
 #ifdef HAVE_MKSTEMPS
 		out = mkstemps(template, strlen(".xml"));
@@ -230,6 +231,8 @@ void _starpu_simgrid_init()
 		snprintf(cmdline, sizeof(cmdline), "xsltproc --novalid --stringparam ASname %s -o %s "STARPU_DATADIR"/starpu/starpu_smpi.xslt %s", asname, template, path);
 		ret = system(cmdline);
 		STARPU_ASSERT_MSG(ret == 0, "running xsltproc to generate SMPI platforms %s from %s failed", template, path);
+		_starpu_frdunlock(in);
+		fclose(in);
 
 		/* And create it */
 		MSG_create_environment(template);

+ 4 - 0
src/core/task.c

@@ -1018,7 +1018,11 @@ static void *watchdog_func(void *foo STARPU_ATTRIBUTE_UNUSED)
 	if (! (timeout_env = getenv("STARPU_WATCHDOG_TIMEOUT")))
 		return NULL;
 
+#ifdef _MSC_VER
+	timeout = (unsigned long long) _atoi64(timeout_env);
+#else
 	timeout = atoll(timeout_env);
+#endif
 	ts.tv_sec = timeout / 1000000;
 	ts.tv_nsec = (timeout % 1000000) * 1000;
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();

+ 2 - 2
src/core/topology.c

@@ -461,7 +461,7 @@ _starpu_init_topology (struct _starpu_machine_config *config)
 
 	config->topology.nhwcpus = config->topology.nhwpus = sysconf(_SC_NPROCESSORS_ONLN);
 
-#elif defined(__MINGW32__) || defined(__CYGWIN__)
+#elif defined(_WIN32)
 	/* Discover the CPUs on Cygwin and MinGW systems. */
 
 	SYSTEM_INFO sysinfo;
@@ -1148,7 +1148,7 @@ _starpu_bind_thread_on_cpu (
 		STARPU_ABORT();
 	}
 
-#elif defined(__MINGW32__) || defined(__CYGWIN__)
+#elif defined(_WIN32)
 	DWORD mask = 1 << cpuid;
 	if (!SetThreadAffinityMask(GetCurrentThread(), mask))
 	{

+ 5 - 1
src/core/workers.c

@@ -42,7 +42,7 @@
 #include <core/simgrid.h>
 #endif
 
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 #include <windows.h>
 #endif
 
@@ -428,6 +428,10 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 	STARPU_PTHREAD_MUTEX_INIT(&workerarg->sched_mutex, NULL);
 	starpu_task_list_init(&workerarg->local_tasks);
 	workerarg->current_task = NULL;
+	workerarg->first_task = 0;
+	workerarg->ntasks = 0;
+	workerarg->pipeline_length = 0;
+	workerarg->pipeline_stuck = 0;
 	workerarg->set = NULL;
 
 	/* if some codelet's termination cannot be handled directly :

+ 8 - 1
src/core/workers.h

@@ -52,6 +52,8 @@
 
 #include <starpu_parameters.h>
 
+#define STARPU_MAX_PIPELINE 4
+
 /* This is initialized from in _starpu_worker_init */
 LIST_TYPE(_starpu_worker,
 	struct _starpu_machine_config *config;
@@ -73,7 +75,12 @@ LIST_TYPE(_starpu_worker,
 	starpu_pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
         starpu_pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitely submitted to that queue */
-	struct starpu_task *current_task; /* task currently executed by this worker */
+	struct starpu_task *current_task; /* task currently executed by this worker (non-pipelined version) */
+	struct starpu_task *current_tasks[STARPU_MAX_PIPELINE]; /* tasks currently executed by this worker (pipelined version) */
+	unsigned char first_task; /* Index of first task in the pipeline */
+	unsigned char ntasks; /* number of tasks in the pipeline */
+	unsigned char pipeline_length; /* number of tasks to be put in the pipeline */
+	unsigned char pipeline_stuck; /* whether a task prevents us from pipelining */
 	struct _starpu_worker_set *set; /* in case this worker belongs to a set */
 	struct _starpu_job_list *terminated_jobs; /* list of pending jobs which were executed */
 	unsigned worker_is_running;

+ 33 - 2
src/datawizard/coherency.c

@@ -41,6 +41,8 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 	double cost = INFINITY;
 	unsigned src_node_mask = 0;
 
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
 	for (node = 0; node < nnodes; node++)
 	{
 		if (handle->per_node[node].state != STARPU_INVALID)
@@ -72,6 +74,15 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 				double time = starpu_transfer_predict(i, destination, size);
 				unsigned handling_node;
 
+				/* Avoid transfers which the interface does not want */
+				if (copy_methods->can_copy)
+				{
+					void *src_interface = handle->per_node[i].data_interface;
+					void *dst_interface = handle->per_node[destination].data_interface;
+					if (!copy_methods->can_copy(src_interface, i, dst_interface, destination))
+						continue;
+				}
+
 				/* Avoid indirect transfers */
 				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
 					continue;
@@ -104,8 +115,28 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 		
 		if (src_node_mask & (1<<i))
 		{
+			/* Avoid transfers which the interface does not want */
+			if (copy_methods->can_copy)
+			{
+				void *src_interface = handle->per_node[i].data_interface;
+				void *dst_interface = handle->per_node[destination].data_interface;
+				unsigned handling_node;
+
+				if (!copy_methods->can_copy(src_interface, i, dst_interface, destination))
+					continue;
+
+				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
+				{
+					/* Avoid through RAM if the interface does not want it */
+					void *ram_interface = handle->per_node[STARPU_MAIN_RAM].data_interface;
+					if (!copy_methods->can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM)
+					 || !copy_methods->can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination))
+						continue;
+				}
+			}
+
 			/* however GPU are expensive sources, really !
-			 * 	Unless peer transfer is supported.
+			 * 	Unless peer transfer is supported (and it would then have been selected above).
 			 * 	Other should be ok */
 
 			if (starpu_node_get_kind(i) == STARPU_CUDA_RAM ||
@@ -222,7 +253,7 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
 {
 	(void) handle; // unused
 
-	/* XXX That's a hack until we get cudaMemcpy3DPeerAsync to work !
+	/* XXX That's a hack until we fix cudaMemcpy3DPeerAsync in the block interface
 	 * Perhaps not all data interface provide a direct GPU-GPU transfer
 	 * method ! */
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)

+ 4 - 1
src/datawizard/filters.c

@@ -437,12 +437,15 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 	}
 
 	/* there is no child anymore */
-	free(root_handle->children);
+	starpu_data_handle_t children = root_handle->children;
 	root_handle->children = NULL;
 	root_handle->nchildren = 0;
 
 	/* now the parent may be used again so we release the lock */
 	_starpu_spin_unlock(&root_handle->header_lock);
+
+	free(children);
+
 	_STARPU_TRACE_END_UNPARTITION(root_handle, gathering_node);
 }
 

+ 2 - 0
src/datawizard/interfaces/block_interface.c

@@ -439,6 +439,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	else
 	{
 		/* Default case: we transfer all blocks one by one: nz transfers */
+		/* TODO: use cudaMemcpy3D now that it works */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
@@ -509,6 +510,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 	else
 	{
 		/* Default case: we transfer all blocks one by one: nz 2D transfers */
+		/* TODO: use cudaMemcpy3D now that it works */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{

+ 4 - 2
src/datawizard/interfaces/data_interface.c

@@ -499,9 +499,10 @@ int _starpu_data_release_tag(starpu_data_handle_t handle)
 		STARPU_ASSERT_MSG((tag_entry != NULL),"Data handle %p with tag %d isn't in the hashmap !",handle,handle->tag);
 
 		HASH_DEL(registered_tag_handles, tag_entry);
-		free(tag_entry);
 
 		_starpu_spin_unlock(&registered_tag_handles_lock);
+
+		free(tag_entry);
 	}
 	return 0;
 }
@@ -530,9 +531,10 @@ void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
 		STARPU_ASSERT(entry != NULL);
 
 		HASH_DEL(registered_handles, entry);
-		free(entry);
 
 		_starpu_spin_unlock(&registered_handles_lock);
+
+		free(entry);
 	}
 }
 

+ 7 - 119
src/datawizard/interfaces/matrix_interface.c

@@ -27,19 +27,14 @@
 #include <drivers/scc/driver_scc_source.h>
 #include <drivers/mic/driver_mic_source.h>
 
-/* If you can promise that there is no stride in your matrices, you can define this */
-// #define NO_STRIDE
-
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
-#ifdef NO_STRIDE
 static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
 #endif
-#endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
@@ -58,17 +53,13 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 	.cuda_to_cuda = copy_cuda_to_cuda,
-#ifdef NO_STRIDE
 	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
-#endif
 #else
 #ifdef STARPU_SIMGRID
-#ifdef NO_STRIDE
 	/* Enable GPU-GPU transfers in simgrid */
 	.cuda_to_cuda_async = 1,
 #endif
 #endif
-#endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
 	.opencl_to_ram = copy_opencl_to_ram,
@@ -379,29 +370,6 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 	size_t elemsize = src_matrix->elemsize;
 	cudaError_t cures;
 
-#if 0
-	struct cudaMemcpy3DParms p;
-	memset(&p, 0, sizeof(p));
-
-	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->ld * elemsize, src_matrix->ny);
-	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->ld * elemsize, dst_matrix->ny);
-	p.extent = make_cudaExtent(src_matrix->nx * elemsize, src_matrix->ny, 1);
-	p.kind = kind;
-
-	if (is_async)
-	{
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpy3DAsync(&p, stream);
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		if (!cures)
-			return -EAGAIN;
-	}
-
-	cures = cudaMemcpy3D(&p);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-#else
-
 	if (is_async)
 	{
 		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
@@ -422,17 +390,15 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 		if (ret == -EAGAIN) return ret;
 		if (ret) STARPU_CUDA_REPORT_ERROR(cures);
 	}
-#endif
 
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
 	return 0;
 }
 
-/* XXX this is broken : We need to properly call cudaDeviceEnablePeerAccess(), and avoid crossing NUMA nodes... */
-#ifdef NO_STRIDE
 static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, int is_async, cudaStream_t stream)
 {
+#ifdef HAVE_CUDA_MEMCPY_PEER
 	struct starpu_matrix_interface *src_matrix = src_interface;
 	struct starpu_matrix_interface *dst_matrix = dst_interface;
 
@@ -442,70 +408,15 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 	int src_dev = _starpu_memory_node_get_devid(src_node);
 	int dst_dev = _starpu_memory_node_get_devid(dst_node);
 
-
-#if 0
-	/* That code is not even working!! */
-	struct cudaExtent extent = make_cudaExtent(128, 128, 128);
-
-	starpu_cuda_set_device(src_dev);
-
-	struct cudaPitchedPtr mem_device1;
-	cures = cudaMalloc3D(&mem_device1, extent);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	starpu_cuda_set_device(dst_dev);
-
-	struct cudaPitchedPtr mem_device2;
-	cures = cudaMalloc3D(&mem_device2, extent);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	struct cudaMemcpy3DPeerParms p;
-	memset(&p, 0, sizeof(p));
-	p.srcDevice = src_dev;
-	p.dstDevice = dst_dev;
-	p.srcPtr = mem_device1;
-	p.dstPtr = mem_device2;
-	p.extent = extent;
-
-	fprintf(stderr,"%u %u\n", p.srcDevice, p.dstDevice);
-	fprintf(stderr,"%p %p\n", p.srcArray, p.dstArray);
-	fprintf(stderr,"%p %lu %lu %lu\n", p.srcPtr.ptr, p.srcPtr.pitch, p.srcPtr.xsize, p.srcPtr.ysize);
-	fprintf(stderr,"%p %lu %lu %lu\n", p.dstPtr.ptr, p.dstPtr.pitch, p.dstPtr.xsize, p.dstPtr.ysize);
-	fprintf(stderr,"%lu %lu %lu\n", p.srcPos.x, p.srcPos.y, p.srcPos.z);
-	fprintf(stderr,"%lu %lu %lu\n", p.dstPos.x, p.dstPos.y, p.dstPos.z);
-	fprintf(stderr,"%lu %lu %lu\n", p.extent.width, p.extent.height, p.extent.depth);
-	cures = cudaMemcpy3DPeer(&p);
-	if (STARPU_UNLIKELY(cures))
-	        STARPU_CUDA_REPORT_ERROR(cures);
-#endif
-
-#if 0
 	struct cudaMemcpy3DPeerParms p;
 	memset(&p, 0, sizeof(p));
 
 	p.srcDevice = src_dev;
 	p.dstDevice = dst_dev;
-	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx * elemsize, src_matrix->ny);
-	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->nx * elemsize, dst_matrix->ny);
+	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx, src_matrix->ny);
+	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->nx, dst_matrix->ny);
 	p.extent = make_cudaExtent(src_matrix->nx * elemsize, src_matrix->ny, 1);
 
-#if 1
-	fprintf(stderr,"%u %u\n", p.srcDevice, p.dstDevice);
-	fprintf(stderr,"%p %p\n", p.srcArray, p.dstArray);
-	fprintf(stderr,"%p %lu %lu %lu\n", p.srcPtr.ptr, p.srcPtr.pitch, p.srcPtr.xsize, p.srcPtr.ysize);
-	fprintf(stderr,"%p %lu %lu %lu\n", p.dstPtr.ptr, p.dstPtr.pitch, p.dstPtr.xsize, p.dstPtr.ysize);
-	fprintf(stderr,"%lu %lu %lu\n", p.srcPos.x, p.srcPos.y, p.srcPos.z);
-	fprintf(stderr,"%lu %lu %lu\n", p.dstPos.x, p.dstPos.y, p.dstPos.z);
-	fprintf(stderr,"%lu %lu %lu\n", p.extent.width, p.extent.height, p.extent.depth);
-#endif
-
-	cures = cudaMemcpy3DPeerAsync(&p, stream);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-	cudaStreamSynchronize(stream);
-
 	if (is_async)
 	{
 		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
@@ -519,30 +430,13 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-#else
-	/* XXX FIXME !!*/
-	STARPU_ASSERT(src_matrix->nx == src_matrix->ld);
-	STARPU_ASSERT(dst_matrix->nx == dst_matrix->ld);
-
-	if (is_async)
-	{
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpyPeerAsync((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize, stream);
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		if (!cures)
-			return -EAGAIN;
-	}
-
-	cures = cudaMemcpyPeer((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-#endif
-
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
 	return 0;
-}
+#else
+	STARPU_ABORT_MSG("CUDA memcpy peer not available, but core triggered one ?!");
 #endif
+}
 
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
@@ -559,11 +453,7 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRI
 	if (src_node == dst_node)
 		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, 0, 0);
 	else
-	{
-		/* XXX not implemented */
-		STARPU_ABORT();
-		return 0;
-	}
+		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node, 0, 0);
 }
 
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
@@ -576,7 +466,6 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 1, stream);
 }
 
-#ifdef NO_STRIDE
 static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
 {
 	if (src_node == dst_node)
@@ -584,7 +473,6 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU
 	else
 		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node, 1, stream);
 }
-#endif
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL

+ 2 - 1
src/datawizard/memory_nodes.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -132,6 +132,7 @@ void _starpu_memory_node_get_name(unsigned node, char *name, int size)
 		prefix = "SCC_shared";
 		break;
 	case STARPU_UNUSED:
+	default:
 		prefix = "unknown";
 		STARPU_ASSERT(0);
 	}

+ 11 - 7
src/debug/traces/starpu_fxt.c

@@ -115,7 +115,9 @@ static double last_codelet_start[STARPU_NMAXWORKERS];
 static char last_codelet_symbol[STARPU_NMAXWORKERS][4*sizeof(unsigned long)];
 static int last_codelet_parameter[STARPU_NMAXWORKERS];
 #define MAX_PARAMETERS 8
+#ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
 static char last_codelet_parameter_description[STARPU_NMAXWORKERS][MAX_PARAMETERS][FXT_MAX_PARAMS*sizeof(unsigned long)];
+#endif
 
 /* If more than a period of time has elapsed, we flush the profiling info,
  * otherwise they are accumulated everytime there is a new relevant event. */
@@ -321,6 +323,7 @@ static void thread_pop_state(double time, const char *prefix, long unsigned int
 #endif
 }
 
+#ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
 static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, const char *parameters, unsigned long footprint, unsigned long long tag)
 {
 #ifdef STARPU_HAVE_POTI
@@ -332,6 +335,7 @@ static void worker_set_detailed_state(double time, const char *prefix, long unsi
 	fprintf(out_paje_file, "20	%.9f	%sw%lu	WS	%s	%lu	%s	%08lx	%016llx\n", time, prefix, workerid, name, size, parameters, footprint, tag);
 #endif
 }
+#endif
 
 static void mpicommthread_set_state(double time, const char *prefix, const char *name)
 {
@@ -2223,7 +2227,7 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 	{
 		unsigned inputfile;
 
-		uint64_t offsets[64];
+		uint64_t offsets[options->ninputfiles];
 
 		/*
 		 * Find the trace offsets:
@@ -2236,11 +2240,11 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 		 *	- psi_k(x) = x - offset_k
 		 */
 
-		int unique_keys[64];
-		int rank_k[64];
-		uint64_t start_k[64];
-		uint64_t sync_k[64];
-		unsigned sync_k_exists[64];
+		int unique_keys[options->ninputfiles];
+		int rank_k[options->ninputfiles];
+		uint64_t start_k[options->ninputfiles];
+		uint64_t sync_k[options->ninputfiles];
+		unsigned sync_k_exists[options->ninputfiles];
 		uint64_t M = 0;
 
 		unsigned found_one_sync_point = 0;
@@ -2311,7 +2315,7 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 #endif
 
 			char file_prefix[32];
-			snprintf(file_prefix, 32, "%d_", filerank);
+			snprintf(file_prefix, sizeof(file_prefix), "%d_", filerank);
 
 			options->file_prefix = file_prefix;
 			options->file_offset = offsets[inputfile];

+ 158 - 96
src/drivers/cuda/driver_cuda.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -49,7 +49,7 @@ static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
 static cudaStream_t in_peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
 static cudaStream_t out_peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
 static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
-static cudaEvent_t task_events[STARPU_NMAXWORKERS];
+static cudaEvent_t task_events[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE];
 #endif /* STARPU_USE_CUDA */
 
 void
@@ -221,7 +221,7 @@ static void init_context(struct _starpu_worker_set *worker_set, unsigned devid)
 {
 	cudaError_t cures;
 	int workerid;
-	unsigned i;
+	unsigned i, j;
 
 	/* TODO: cudaSetDeviceFlag(cudaDeviceMapHost) */
 
@@ -276,7 +276,8 @@ static void init_context(struct _starpu_worker_set *worker_set, unsigned devid)
 	{
 		workerid = worker_set->workers[i].workerid;
 
-		cures = cudaEventCreateWithFlags(&task_events[workerid], cudaEventDisableTiming);
+		for (j = 0; j < STARPU_MAX_PIPELINE; j++)
+			cures = cudaEventCreateWithFlags(&task_events[workerid][j], cudaEventDisableTiming);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 
@@ -307,7 +308,7 @@ static void init_context(struct _starpu_worker_set *worker_set, unsigned devid)
 static void deinit_context(struct _starpu_worker_set *worker_set)
 {
 	cudaError_t cures;
-	unsigned i;
+	unsigned i, j;
 	int workerid = worker_set->workers[0].workerid;
 	int devid = starpu_worker_get_devid(workerid);
 
@@ -316,7 +317,8 @@ static void deinit_context(struct _starpu_worker_set *worker_set)
 		workerid = worker_set->workers[i].workerid;
 		devid = starpu_worker_get_devid(workerid);
 
-		cudaEventDestroy(task_events[workerid]);
+		for (j = 0; j < STARPU_MAX_PIPELINE; j++)
+			cudaEventDestroy(task_events[workerid][j]);
 		cudaStreamDestroy(streams[workerid]);
 	}
 
@@ -372,7 +374,7 @@ void _starpu_init_cuda(void)
 	STARPU_ASSERT(ncudagpus <= STARPU_MAXCUDADEVS);
 }
 
-static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
+static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worker)
 {
 	int ret;
 
@@ -396,11 +398,15 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(args, j, &args->perf_arch, &j->cl_start, 0, profiling);
+	if (worker->ntasks == 1)
+	{
+		/* We are alone in the pipeline, the kernel will start now, record it */
+		_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
+	}
 
 #if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 	/* We make sure we do manipulate the proper device */
-	starpu_cuda_set_device(args->devid);
+	starpu_cuda_set_device(worker->devid);
 #endif
 
 	starpu_cuda_func_t func = _starpu_task_get_cuda_nth_implementation(cl, j->nimpl);
@@ -410,7 +416,7 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 	{
 		_STARPU_TRACE_START_EXECUTING();
 #ifdef STARPU_SIMGRID
-		_starpu_simgrid_execute_job(j, &args->perf_arch, NAN);
+		_starpu_simgrid_execute_job(j, &worker->perf_arch, NAN);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
@@ -420,35 +426,90 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 	return 0;
 }
 
-static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
+static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worker)
 {
 	struct timespec codelet_end;
 
 	int profiling = starpu_profiling_status_get();
 
 	_starpu_set_current_task(NULL);
-	args->current_task = NULL;
+	worker->current_tasks[worker->first_task] = NULL;
+	worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
+	worker->ntasks--;
 
-	_starpu_driver_end_job(args, j, &args->perf_arch, &codelet_end, 0, profiling);
+	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0, profiling);
 
-	_starpu_driver_update_job_feedback(j, args, &args->perf_arch, &j->cl_start, &codelet_end, profiling);
+	_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);
 
 	_starpu_push_task_output(j);
 
 	_starpu_handle_job_termination(j);
 }
 
+/* Execute a job, up to completion for synchronous jobs */
+static void execute_job_on_cuda(struct starpu_task *task, struct _starpu_worker *worker)
+{
+	int workerid = worker->workerid;
+	int res;
+
+	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
+
+	res = start_job_on_cuda(j, worker);
+
+	if (res)
+	{
+		switch (res)
+		{
+			case -EAGAIN:
+				_STARPU_DISP("ouch, CUDA could not actually run task %p, putting it back...\n", task);
+				_starpu_push_task_to_workers(task);
+				STARPU_ABORT();
+			default:
+				STARPU_ABORT();
+		}
+	}
+
+#ifndef STARPU_SIMGRID
+	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+	{
+		/* Record event to synchronize with task termination later */
+		cudaEventRecord(task_events[workerid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE], starpu_cuda_get_local_stream());
+#ifdef STARPU_USE_FXT
+		int k;
+		for (k = 0; k < (int) worker->set->nworkers; k++)
+			if (worker->set->workers[k].ntasks == worker->set->workers[k].pipeline_length)
+				break;
+		if (k == (int) worker->set->nworkers)
+			/* Everybody busy */
+			_STARPU_TRACE_START_EXECUTING()
+#endif
+	}
+	else
+#else
+#ifdef STARPU_DEVEL
+#warning No CUDA asynchronous execution with simgrid yet.
+#endif
+#endif
+	/* Synchronous execution */
+	{
+#if defined(STARPU_DEBUG) && !defined(STARPU_SIMGRID)
+		STARPU_ASSERT_MSG(cudaStreamQuery(starpu_cuda_get_local_stream()) == cudaSuccess, "CUDA codelets have to wait for termination of their kernels on the starpu_cuda_get_local_stream() stream");
+#endif
+		finish_job_on_cuda(j, worker);
+	}
+}
+
 /* XXX Should this be merged with _starpu_init_cuda ? */
 int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 {
-	struct _starpu_worker *args = &worker_set->workers[0];
-	unsigned devid = args->devid;
+	struct _starpu_worker *worker0 = &worker_set->workers[0];
+	unsigned devid = worker0->devid;
 	unsigned i;
 
-	_starpu_worker_start(args, _STARPU_FUT_CUDA_KEY);
+	_starpu_worker_start(worker0, _STARPU_FUT_CUDA_KEY);
 
 #ifdef STARPU_USE_FXT
-	unsigned memnode = args->memory_node;
+	unsigned memnode = worker0->memory_node;
 	for (i = 1; i < worker_set->nworkers; i++)
 	{
 		struct _starpu_worker *worker = &worker_set->workers[i];
@@ -458,17 +519,20 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
 #ifndef STARPU_SIMGRID
 	init_context(worker_set, devid);
+
+	if (worker_set->nworkers > 1 && props[devid].concurrentKernels == 0)
+		_STARPU_DISP("Warning: STARPU_NWORKER_PER_CUDA is %u, but the device does not support concurrent kernel execution!\n", worker_set->nworkers);
 #endif
 
 	_starpu_cuda_limit_gpu_mem_if_needed(devid);
-	_starpu_memory_manager_set_global_memory_size(args->memory_node, _starpu_cuda_get_global_mem_size(devid));
+	_starpu_memory_manager_set_global_memory_size(worker0->memory_node, _starpu_cuda_get_global_mem_size(devid));
 
-	_starpu_malloc_init(args->memory_node);
+	_starpu_malloc_init(worker0->memory_node);
 
 	/* one more time to avoid hacks from third party lib :) */
-	_starpu_bind_thread_on_cpu(args->config, args->bindid);
+	_starpu_bind_thread_on_cpu(worker0->config, worker0->bindid);
 
-	args->status = STATUS_UNKNOWN;
+	worker0->status = STATUS_UNKNOWN;
 
 	float size = (float) global_mem[devid] / (1<<30);
 #ifdef STARPU_SIMGRID
@@ -479,29 +543,31 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 	strncpy(devname, props[devid].name, 128);
 #endif
 
+	for (i = 0; i < worker_set->nworkers; i++)
+	{
+		struct _starpu_worker *worker = &worker_set->workers[i];
 #if defined(STARPU_HAVE_BUSID) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_HAVE_DOMAINID) && !defined(STARPU_SIMGRID)
-	if (props[devid].pciDomainID)
-		snprintf(args->name, sizeof(args->name), "CUDA %u (%s %.1f GiB %04x:%02x:%02x.0)", devid, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
-	else
+		if (props[devid].pciDomainID)
+			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %04x:%02x:%02x.0)", devid, i, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
+		else
 #endif
-		snprintf(args->name, sizeof(args->name), "CUDA %u (%s %.1f GiB %02x:%02x.0)", devid, devname, size, props[devid].pciBusID, props[devid].pciDeviceID);
+			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %02x:%02x.0)", devid, i, devname, size, props[devid].pciBusID, props[devid].pciDeviceID);
 #else
-	snprintf(args->name, sizeof(args->name), "CUDA %u (%s %.1f GiB)", devid, devname, size);
+		snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB)", devid, i, devname, size);
 #endif
-	snprintf(args->short_name, sizeof(args->short_name), "CUDA %u", devid);
-	_STARPU_DEBUG("cuda (%s) dev id %u thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
+		snprintf(worker->short_name, sizeof(worker->short_name), "CUDA %u.%u", devid, i);
+		_STARPU_DEBUG("cuda (%s) dev id %u worker %u thread is ready to run on CPU %d !\n", devname, devid, i, worker->bindid);
 
-	for (i = 0; i < worker_set->nworkers; i++)
-	{
+		worker->pipeline_length = starpu_get_env_number_default("STARPU_CUDA_PIPELINE", 2);
 		_STARPU_TRACE_WORKER_INIT_END(worker_set->workers[i].workerid);
 	}
 
 	/* tell the main thread that this one is ready */
-	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
-	args->worker_is_initialized = 1;
-	STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&worker0->mutex);
+	worker0->worker_is_initialized = 1;
+	STARPU_PTHREAD_COND_SIGNAL(&worker0->ready_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker0->mutex);
 
 	/* tell the main thread that this one is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
@@ -527,19 +593,20 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 	idle = 0;
 	for (i = 0; i < (int) worker_set->nworkers; i++)
 	{
-		struct _starpu_worker *args = &worker_set->workers[i];
-		int workerid = args->workerid;
-
-		task = args->current_task;
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		int workerid = worker->workerid;
 
-		if (!task)
+		if (!worker->ntasks)
 		{
 			idle++;
+			/* Even nothing to test */
 			continue;
 		}
 
+		task = worker->current_tasks[worker->first_task];
+
 		/* On-going asynchronous task, check for its termination first */
-		cudaError_t cures = cudaEventQuery(task_events[workerid]);
+		cudaError_t cures = cudaEventQuery(task_events[workerid][worker->first_task]);
 
 		if (cures != cudaSuccess)
 		{
@@ -548,19 +615,46 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		else
 		{
 			/* Asynchronous task completed! */
-			_starpu_set_local_worker_key(args);
-			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), args);
-			idle++;
+			_starpu_set_local_worker_key(worker);
+			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
+			/* See next task if any */
+			if (worker->ntasks)
+			{
+				task = worker->current_tasks[worker->first_task];
+				j = _starpu_get_job_associated_to_task(task);
+				if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+				{
+					/* An asynchronous task, it was already
+					 * queued, it's now running, record its start time.  */
+					_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, starpu_profiling_status_get());
+				}
+				else
+				{
+					/* A synchronous task, we have finished
+					 * flushing the pipeline, we can now at
+					 * last execute it.  */
+
+					_STARPU_TRACE_END_PROGRESS(memnode);
+					_STARPU_TRACE_EVENT("sync_task");
+					execute_job_on_cuda(task, worker);
+					_STARPU_TRACE_EVENT("end_sync_task");
+					_STARPU_TRACE_START_PROGRESS(memnode);
+					worker->pipeline_stuck = 0;
+				}
+			}
 #ifdef STARPU_USE_FXT
 			int k;
 			for (k = 0; k < (int) worker_set->nworkers; k++)
-				if (worker_set->workers[k].current_task)
+				if (worker_set->workers[k].ntasks)
 					break;
 			if (k == (int) worker_set->nworkers)
 				/* Everybody busy */
 				_STARPU_TRACE_END_EXECUTING()
 #endif
 		}
+
+		if (worker->ntasks < worker->pipeline_length)
+			idle++;
 	}
 
 	if (!idle)
@@ -582,14 +676,12 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
 	for (i = 0; i < (int) worker_set->nworkers; i++)
 	{
-		struct _starpu_worker *args = &worker_set->workers[i];
-		int workerid = args->workerid;
+		struct _starpu_worker *worker = &worker_set->workers[i];
 
 		task = tasks[i];
 		if (!task)
 			continue;
 
-		_starpu_set_local_worker_key(args);
 
 		j = _starpu_get_job_associated_to_task(task);
 
@@ -597,54 +689,24 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		if (!_STARPU_CUDA_MAY_PERFORM(j))
 		{
 			/* this is neither a cuda or a cublas task */
+			worker->ntasks--;
 			_starpu_push_task_to_workers(task);
 			continue;
 		}
 
-		_STARPU_TRACE_END_PROGRESS(memnode);
-		res = start_job_on_cuda(j, args);
-
-		if (res)
+		if (worker->ntasks > 1 && !(task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC))
 		{
-			switch (res)
-			{
-				case -EAGAIN:
-					_STARPU_DISP("ouch, CUDA could not actually run task %p, putting it back...\n", task);
-					_starpu_push_task_to_workers(task);
-					STARPU_ABORT();
-				default:
-					STARPU_ABORT();
-			}
+			/* We have to execute a non-asynchronous task but we
+			 * still have tasks in the pipeline...  Record it to
+			 * prevent more tasks from coming, and do it later */
+			worker->pipeline_stuck = 1;
+			continue;
 		}
 
-#ifndef STARPU_SIMGRID
-		if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
-		{
-			/* Record event to synchronize with task termination later */
-			cudaEventRecord(task_events[workerid], starpu_cuda_get_local_stream());
-#ifdef STARPU_USE_FXT
-			int k;
-			for (k = 0; k < (int) worker_set->nworkers; k++)
-				if (worker_set->workers[k].current_task)
-					break;
-			if (k < (int) worker_set->nworkers)
-				/* Everybody busy */
-				_STARPU_TRACE_START_EXECUTING()
-#endif
-		}
-		else
-#else
-#ifdef STARPU_DEVEL
-#warning No CUDA asynchronous execution with simgrid yet.
-#endif
-#endif
-		/* Synchronous execution */
-		{
-#if defined(STARPU_DEBUG) && !defined(STARPU_SIMGRID)
-			STARPU_ASSERT_MSG(cudaStreamQuery(starpu_cuda_get_local_stream()) == cudaSuccess, "CUDA codelets have to wait for termination of their kernels on the starpu_cuda_get_local_stream() stream");
-#endif
-			finish_job_on_cuda(j, args);
-		}
+		_starpu_set_local_worker_key(worker);
+
+		_STARPU_TRACE_END_PROGRESS(memnode);
+		execute_job_on_cuda(task, worker);
 		_STARPU_TRACE_START_PROGRESS(memnode);
 	}
 
@@ -653,8 +715,8 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
 int _starpu_cuda_driver_deinit(struct _starpu_worker_set *arg)
 {
-	struct _starpu_worker *args = &arg->workers[0];
-	unsigned memnode = args->memory_node;
+	struct _starpu_worker *worker = &arg->workers[0];
+	unsigned memnode = worker->memory_node;
 	_STARPU_TRACE_WORKER_DEINIT_START;
 
 	_starpu_handle_all_pending_node_data_requests(memnode);
@@ -675,16 +737,16 @@ int _starpu_cuda_driver_deinit(struct _starpu_worker_set *arg)
 	return 0;
 }
 
-void *_starpu_cuda_worker(void *arg)
+void *_starpu_cuda_worker(void *_arg)
 {
-	struct _starpu_worker_set* args = arg;
+	struct _starpu_worker_set* worker = _arg;
 
-	_starpu_cuda_driver_init(args);
+	_starpu_cuda_driver_init(worker);
 	_STARPU_TRACE_START_PROGRESS(memnode);
 	while (_starpu_machine_is_running())
-		_starpu_cuda_driver_run_once(args);
+		_starpu_cuda_driver_run_once(worker);
 	_STARPU_TRACE_END_PROGRESS(memnode);
-	_starpu_cuda_driver_deinit(args);
+	_starpu_cuda_driver_deinit(worker);
 
 	return NULL;
 }

+ 48 - 35
src/drivers/driver_common/driver_common.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -33,13 +33,13 @@
 #define BACKOFF_MAX 32  /* TODO : use parameter to define them */
 #define BACKOFF_MIN 1
 
-void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_start, int rank, int profiling)
+void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_start, int rank, int profiling)
 {
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
 	struct starpu_profiling_task_info *profiling_info;
 	int starpu_top=_starpu_top_status_get();
-	int workerid = args->workerid;
+	int workerid = worker->workerid;
 	unsigned calibrate_model = 0;
 
 	if (cl->model && cl->model->benchmarking)
@@ -51,7 +51,7 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	if (j->task_size == 1)
 		_starpu_sched_pre_exec_hook(task);
 
-	args->status = STATUS_EXECUTING;
+	worker->status = STATUS_EXECUTING;
 	task->status = STARPU_TASK_RUNNING;
 
 	if (rank == 0)
@@ -76,13 +76,13 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
 }
 
-void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
+void _starpu_driver_end_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
 {
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
 	struct starpu_profiling_task_info *profiling_info = task->profiling_info;
 	int starpu_top=_starpu_top_status_get();
-	int workerid = args->workerid;
+	int workerid = worker->workerid;
 	unsigned calibrate_model = 0;
 
 	_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
@@ -102,16 +102,16 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 	if (starpu_top)
 		_starpu_top_task_ended(task,workerid,codelet_end);
 
-	args->status = STATUS_UNKNOWN;
+	worker->status = STATUS_UNKNOWN;
 }
-void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
+void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker,
 					struct starpu_perfmodel_arch* perf_arch,
 					struct timespec *codelet_start, struct timespec *codelet_end, int profiling)
 {
 	struct starpu_profiling_task_info *profiling_info = j->task->profiling_info;
 	struct timespec measured_ts;
 	double measured;
-	int workerid = worker_args->workerid;
+	int workerid = worker->workerid;
 	struct starpu_codelet *cl = j->task->cl;
 	int calibrate_model = 0;
 	int updated = 0;
@@ -141,7 +141,7 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 		}
 
 		if (calibrate_model)
-			_starpu_update_perfmodel_history(j, j->task->cl->model,  perf_arch, worker_args->devid, measured,j->nimpl);
+			_starpu_update_perfmodel_history(j, j->task->cl->model,  perf_arch, worker->devid, measured,j->nimpl);
 
 
 	}
@@ -151,7 +151,7 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 
 	if (profiling_info && profiling_info->power_consumed && cl->power_model && cl->power_model->benchmarking)
 	{
-		_starpu_update_perfmodel_history(j, j->task->cl->power_model, perf_arch, worker_args->devid, profiling_info->power_consumed,j->nimpl);
+		_starpu_update_perfmodel_history(j, j->task->cl->power_model, perf_arch, worker->devid, profiling_info->power_consumed,j->nimpl);
 	}
 }
 
@@ -198,12 +198,12 @@ static void _starpu_worker_set_status_wakeup(int workerid)
 }
 
 
-static void _starpu_exponential_backoff(struct _starpu_worker *args)
+static void _starpu_exponential_backoff(struct _starpu_worker *worker)
 {
-	int delay = args->spinning_backoff;
+	int delay = worker->spinning_backoff;
 	
-	if (args->spinning_backoff < BACKOFF_MAX)
-		args->spinning_backoff<<=1; 
+	if (worker->spinning_backoff < BACKOFF_MAX)
+		worker->spinning_backoff<<=1; 
 	
 	while(delay--)
 		STARPU_UYIELD();
@@ -212,17 +212,18 @@ static void _starpu_exponential_backoff(struct _starpu_worker *args)
 
 
 /* Workers may block when there is no work to do at all. */
-struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int workerid, unsigned memnode)
+struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int workerid, unsigned memnode)
 {
-	STARPU_PTHREAD_MUTEX_LOCK(&args->sched_mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
 	struct starpu_task *task;
 	unsigned needed = 1;
+
 	_starpu_worker_set_status_scheduling(workerid);
 	while(needed)
 	{
 		struct _starpu_sched_ctx *sched_ctx = NULL;
 		struct _starpu_sched_ctx_list *l = NULL;
-		for (l = args->sched_ctx_list; l; l = l->next)
+		for (l = worker->sched_ctx_list; l; l = l->next)
 		{
 			sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
 			if(sched_ctx && sched_ctx->id > 0 && sched_ctx->id < STARPU_NMAX_SCHED_CTXS)
@@ -233,13 +234,13 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 					/* don't let the worker sleep with the sched_mutex taken */
 					/* we need it until here bc of the list of ctxs of the workers
 					   that can change in another thread */
-					STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
+					STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 					needed = 0;
 					_starpu_sched_ctx_signal_worker_blocked(sched_ctx->id, workerid);
 					STARPU_PTHREAD_COND_WAIT(&sched_ctx->parallel_sect_cond[workerid], &sched_ctx->parallel_sect_mutex[workerid]);
 					_starpu_sched_ctx_signal_worker_woke_up(sched_ctx->id, workerid);
 					sched_ctx->parallel_sect[workerid] = 0;
-					STARPU_PTHREAD_MUTEX_LOCK(&args->sched_mutex);
+					STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
 				}
 				STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->parallel_sect_mutex[workerid]);
 			}
@@ -247,19 +248,19 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 				break;
 		}
 		/* don't worry if the value is not correct (no lock) it will do it next time */
-		if(args->tmp_sched_ctx != -1)
+		if(worker->tmp_sched_ctx != -1)
 		{
-			sched_ctx = _starpu_get_sched_ctx_struct(args->tmp_sched_ctx);
+			sched_ctx = _starpu_get_sched_ctx_struct(worker->tmp_sched_ctx);
 			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->parallel_sect_mutex[workerid]);
 			if(sched_ctx->parallel_sect[workerid])
 			{
 //				needed = 0;
-				STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 				_starpu_sched_ctx_signal_worker_blocked(sched_ctx->id, workerid);
 				STARPU_PTHREAD_COND_WAIT(&sched_ctx->parallel_sect_cond[workerid], &sched_ctx->parallel_sect_mutex[workerid]);
 				_starpu_sched_ctx_signal_worker_woke_up(sched_ctx->id, workerid);
 				sched_ctx->parallel_sect[workerid] = 0;
-				STARPU_PTHREAD_MUTEX_LOCK(&args->sched_mutex);
+				STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
 			}
 			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->parallel_sect_mutex[workerid]);
 		}
@@ -267,7 +268,10 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 		needed = !needed;
 	}
 
-	task = _starpu_pop_task(args);
+	if (worker->pipeline_length && (worker->ntasks == worker->pipeline_length || worker->pipeline_stuck))
+		task = NULL;
+	else
+		task = _starpu_pop_task(worker);
 
 	if (task == NULL)
 	{
@@ -278,17 +282,17 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
 		_starpu_worker_set_status_sleeping(workerid);
 
-		if (_starpu_worker_can_block(memnode) && !_starpu_sched_ctx_last_worker_awake(args))
+		if (_starpu_worker_can_block(memnode) && !_starpu_sched_ctx_last_worker_awake(worker))
 		{
-			STARPU_PTHREAD_COND_WAIT(&args->sched_cond, &args->sched_mutex);
-			STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
+			STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 		}
 		else
 		{
-			STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);			
+			STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 			if (_starpu_machine_is_running())
 			{
-				_starpu_exponential_backoff(args);
+				_starpu_exponential_backoff(worker);
 #ifdef STARPU_SIMGRID
 				static int warned;
 				if (!warned)
@@ -307,9 +311,9 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 	_starpu_worker_set_status_scheduling_done(workerid);
 
 	_starpu_worker_set_status_wakeup(workerid);
-	args->spinning_backoff = BACKOFF_MIN;
+	worker->spinning_backoff = BACKOFF_MIN;
 
-	STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 
 
 #ifdef HAVE_AYUDAME_H
@@ -333,8 +337,11 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 	/*for each worker*/
 	for (i = 0; i < nworkers; i++)
 	{
-		/*if the worker is already executinf a task then */
-		if(workers[i].current_task)
+		/*if the worker is already executing a task then */
+		if((workers[i].pipeline_length == 0 && workers[i].current_task)
+			|| (workers[i].pipeline_length != 0 &&
+				(workers[i].ntasks == workers[i].pipeline_length
+				 || workers[i].pipeline_stuck)))
 		{
 			tasks[i] = NULL;
 		}
@@ -354,7 +361,13 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 				count ++;
 				j = _starpu_get_job_associated_to_task(tasks[i]);
 				is_parallel_task = (j->task_size > 1);
-				workers[i].current_task = j->task;
+				if (workers[i].pipeline_length)
+				{
+					workers[i].current_tasks[(workers[i].first_task + workers[i].ntasks)%STARPU_MAX_PIPELINE] = tasks[i];
+					workers[i].ntasks++;
+				}
+				else
+					workers[i].current_task = j->task;
 				/* Get the rank in case it is a parallel task */
 				if (is_parallel_task)
 				{

+ 140 - 90
src/drivers/opencl/driver_opencl.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -50,7 +50,7 @@ static cl_command_queue in_transfer_queues[STARPU_MAXOPENCLDEVS];
 static cl_command_queue out_transfer_queues[STARPU_MAXOPENCLDEVS];
 static cl_command_queue peer_transfer_queues[STARPU_MAXOPENCLDEVS];
 static cl_command_queue alloc_queues[STARPU_MAXOPENCLDEVS];
-static cl_event task_events[STARPU_MAXOPENCLDEVS];
+static cl_event task_events[STARPU_MAXOPENCLDEVS][STARPU_MAX_PIPELINE];
 #endif
 
 void
@@ -562,28 +562,29 @@ void _starpu_opencl_init(void)
 #ifndef STARPU_SIMGRID
 static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname);
 #endif
-static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *args);
-static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *args);
+static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *worker);
+static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *worker);
+static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_worker *worker);
 
-int _starpu_opencl_driver_init(struct _starpu_worker *args)
+int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 {
-	int devid = args->devid;
+	int devid = worker->devid;
 
-	_starpu_worker_start(args, _STARPU_FUT_OPENCL_KEY);
+	_starpu_worker_start(worker, _STARPU_FUT_OPENCL_KEY);
 
 #ifndef STARPU_SIMGRID
 	_starpu_opencl_init_context(devid);
 #endif
 
 	/* one more time to avoid hacks from third party lib :) */
-	_starpu_bind_thread_on_cpu(args->config, args->bindid);
+	_starpu_bind_thread_on_cpu(worker->config, worker->bindid);
 
 	_starpu_opencl_limit_gpu_mem_if_needed(devid);
-	_starpu_memory_manager_set_global_memory_size(args->memory_node, _starpu_opencl_get_global_mem_size(devid));
+	_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_opencl_get_global_mem_size(devid));
 
-	_starpu_malloc_init(args->memory_node);
+	_starpu_malloc_init(worker->memory_node);
 
-	args->status = STATUS_UNKNOWN;
+	worker->status = STATUS_UNKNOWN;
 	float size = (float) global_mem[devid] / (1<<30);
 
 #ifdef STARPU_SIMGRID
@@ -593,42 +594,44 @@ int _starpu_opencl_driver_init(struct _starpu_worker *args)
 	char devname[128];
 	_starpu_opencl_get_device_name(devid, devname, 128);
 #endif
-	snprintf(args->name, sizeof(args->name), "OpenCL %u (%s %.1f GiB)", devid, devname, size);
-	snprintf(args->short_name, sizeof(args->short_name), "OpenCL %u", devid);
+	snprintf(worker->name, sizeof(worker->name), "OpenCL %u (%s %.1f GiB)", devid, devname, size);
+	snprintf(worker->short_name, sizeof(worker->short_name), "OpenCL %u", devid);
 
-	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
+	worker->pipeline_length = starpu_get_env_number_default("STARPU_OPENCL_PIPELINE", 2);
 
-	_STARPU_TRACE_WORKER_INIT_END(args->workerid);
+	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, worker->bindid);
+
+	_STARPU_TRACE_WORKER_INIT_END(worker->workerid);
 
 	/* tell the main thread that this one is ready */
-	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
-	args->worker_is_initialized = 1;
-	STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&worker->mutex);
+	worker->worker_is_initialized = 1;
+	STARPU_PTHREAD_COND_SIGNAL(&worker->ready_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->mutex);
 
 	return 0;
 }
 
-int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
+int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 {
-	int workerid = args->workerid;
-	unsigned memnode = args->memory_node;
+	int workerid = worker->workerid;
+	unsigned memnode = worker->memory_node;
 
 	struct _starpu_job *j;
 	struct starpu_task *task;
-	int res;
 
 #ifndef STARPU_SIMGRID
-	task = starpu_task_get_current();
-
-	if (task)
+	if (worker->ntasks)
 	{
 		cl_int status;
 		size_t size;
 		int err;
+
 		/* On-going asynchronous task, check for its termination first */
 
-		err = clGetEventInfo(task_events[args->devid], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, &size);
+		task = worker->current_tasks[worker->first_task];
+
+		err = clGetEventInfo(task_events[worker->devid][worker->first_task], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, &size);
 		STARPU_ASSERT(size == sizeof(cl_int));
 		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -640,17 +643,39 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 			return 0;
 		}
 
+		task_events[worker->devid][worker->first_task] = 0;
+
 		/* Asynchronous task completed! */
+		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), worker);
+		/* See next task if any */
+		if (worker->ntasks)
+		{
+			task = worker->current_tasks[worker->first_task];
+			j = _starpu_get_job_associated_to_task(task);
+			if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
+			{
+				/* An asynchronous task, it was already queued,
+				 * it's now running, record its start time.  */
+				_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, starpu_profiling_status_get());
+			}
+			else
+			{
+				/* A synchronous task, we have finished flushing the pipeline, we can now at last execute it.  */
+				_STARPU_TRACE_END_PROGRESS(memnode);
+				_STARPU_TRACE_EVENT("sync_task");
+				_starpu_opencl_execute_job(task, worker);
+				_STARPU_TRACE_EVENT("end_sync_task");
+				_STARPU_TRACE_START_PROGRESS(memnode);
+				worker->pipeline_stuck = 0;
+			}
+		}
 		_STARPU_TRACE_END_EXECUTING();
-		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), args);
 	}
 #endif /* STARPU_SIMGRID */
 
 	__starpu_datawizard_progress(memnode, 1, 1);
 
-	_STARPU_TRACE_END_PROGRESS(memnode);
-
-	task = _starpu_get_worker_task(args, workerid, memnode);
+	task = _starpu_get_worker_task(worker, workerid, memnode);
 
 	if (task == NULL)
 		return 0;
@@ -665,62 +690,30 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		return 0;
 	}
 
-	res = _starpu_opencl_start_job(j, args);
+	worker->current_tasks[(worker->first_task  + worker->ntasks)%STARPU_MAX_PIPELINE] = task;
+	worker->ntasks++;
 
-	if (res)
+	if (worker->ntasks > 1 && !(task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC))
 	{
-		switch (res)
-		{
-			case -EAGAIN:
-				_STARPU_DISP("ouch, OpenCL could not actually run task %p, putting it back...\n", task);
-				_starpu_push_task_to_workers(task);
-				STARPU_ABORT();
-				return 0;
-			default:
-				STARPU_ABORT();
-		}
+		/* We have to execute a non-asynchronous task but we
+		 * still have tasks in the pipeline...  Record it to
+		 * prevent more tasks from coming, and do it later */
+		worker->pipeline_stuck = 1;
+		return 0;
 	}
 
-#ifndef STARPU_SIMGRID
-	if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
-	{
-		/* Record event to synchronize with task termination later */
-		int err;
-		cl_command_queue queue;
-		starpu_opencl_get_queue(args->devid, &queue);
-		/* the function clEnqueueMarker is deprecated from
-		 * OpenCL version 1.2. We would like to use the new
-		 * function clEnqueueMarkerWithWaitList. We could do
-		 * it by checking its availability through our own
-		 * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
-		 * and the OpenCL macro CL_VERSION_1_2. However these
-		 * 2 macros detect the function availability in the
-		 * ICD and not in the device implementation.
-		 */
-		err = clEnqueueMarker(queue, &task_events[args->devid]);
-		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
-		_STARPU_TRACE_START_EXECUTING();
-	}
-	else
-#else
-#ifdef STARPU_DEVEL
-#warning No OpenCL asynchronous execution with simgrid yet.
-#endif
-#endif
-	/* Synchronous execution */
-	{
-		_starpu_opencl_stop_job(j, args);
-	}
+	_STARPU_TRACE_END_PROGRESS(memnode);
+	_starpu_opencl_execute_job(task, worker);
 	_STARPU_TRACE_START_PROGRESS(memnode);
 
 	return 0;
 }
 
-int _starpu_opencl_driver_deinit(struct _starpu_worker *args)
+int _starpu_opencl_driver_deinit(struct _starpu_worker *worker)
 {
 	_STARPU_TRACE_WORKER_DEINIT_START;
 
-	unsigned memnode = args->memory_node;
+	unsigned memnode = worker->memory_node;
 
 	_starpu_handle_all_pending_node_data_requests(memnode);
 
@@ -732,7 +725,7 @@ int _starpu_opencl_driver_deinit(struct _starpu_worker *args)
 	_starpu_malloc_shutdown(memnode);
 
 #ifndef STARPU_SIMGRID
-	unsigned devid   = args->devid;
+	unsigned devid   = worker->devid;
         _starpu_opencl_deinit_context(devid);
 #endif
 
@@ -741,15 +734,15 @@ int _starpu_opencl_driver_deinit(struct _starpu_worker *args)
 	return 0;
 }
 
-void *_starpu_opencl_worker(void *arg)
+void *_starpu_opencl_worker(void *_arg)
 {
-	struct _starpu_worker* args = arg;
+	struct _starpu_worker* worker = _arg;
 
-	_starpu_opencl_driver_init(args);
+	_starpu_opencl_driver_init(worker);
 	_STARPU_TRACE_START_PROGRESS(memnode);
 	while (_starpu_machine_is_running())
-		_starpu_opencl_driver_run_once(args);
-	_starpu_opencl_driver_deinit(args);
+		_starpu_opencl_driver_run_once(worker);
+	_starpu_opencl_driver_deinit(worker);
 	_STARPU_TRACE_END_PROGRESS(memnode);
 
 	return NULL;
@@ -802,7 +795,7 @@ cl_device_type _starpu_opencl_get_device_type(int devid)
 }
 #endif /* STARPU_USE_OPENCL */
 
-static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *args)
+static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *worker)
 {
 	int ret;
 
@@ -816,7 +809,6 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 	STARPU_ASSERT(cl);
 
 	_starpu_set_current_task(j->task);
-	args->current_task = j->task;
 
 	ret = _starpu_fetch_task_input(j);
 	if (ret != 0)
@@ -827,7 +819,11 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(args, j, &args->perf_arch, &j->cl_start, 0, profiling);
+	if (worker->ntasks == 1)
+	{
+		/* We are alone in the pipeline, the kernel will start now, record it */
+		_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
+	}
 
 	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
 	STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");
@@ -850,7 +846,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 		STARPU_ASSERT_MSG(profiling_info->used_cycles, "Application kernel must call starpu_opencl_collect_stats to collect simulated time");
 		length = ((double) profiling_info->used_cycles)/MSG_get_host_speed(MSG_host_self());
 	  #endif
-		_starpu_simgrid_execute_job(j, &args->perf_arch, length);
+		_starpu_simgrid_execute_job(j, &worker->perf_arch, length);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
@@ -859,18 +855,19 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 	return 0;
 }
 
-static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *args)
+static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *worker)
 {
 	struct timespec codelet_end;
 	int profiling = starpu_profiling_status_get();
 
 	_starpu_set_current_task(NULL);
-	args->current_task = NULL;
+	worker->current_tasks[worker->first_task] = NULL;
+	worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
+	worker->ntasks--;
 
-	_starpu_driver_end_job(args, j, &args->perf_arch, &codelet_end, 0, profiling);
+	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0, profiling);
 
-	_starpu_driver_update_job_feedback(j, args, &args->perf_arch,
-					   &j->cl_start, &codelet_end, profiling);
+	_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);
 
 	_starpu_push_task_output(j);
 
@@ -878,6 +875,59 @@ static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker
 
 }
 
+static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_worker *worker)
+{
+	int res;
+
+	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
+
+	res = _starpu_opencl_start_job(j, worker);
+
+	if (res)
+	{
+		switch (res)
+		{
+			case -EAGAIN:
+				_STARPU_DISP("ouch, OpenCL could not actually run task %p, putting it back...\n", task);
+				_starpu_push_task_to_workers(task);
+				STARPU_ABORT();
+			default:
+				STARPU_ABORT();
+		}
+	}
+
+#ifndef STARPU_SIMGRID
+	if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
+	{
+		/* Record event to synchronize with task termination later */
+		int err;
+		cl_command_queue queue;
+		starpu_opencl_get_queue(worker->devid, &queue);
+		/* the function clEnqueueMarker is deprecated from
+		 * OpenCL version 1.2. We would like to use the new
+		 * function clEnqueueMarkerWithWaitList. We could do
+		 * it by checking its availability through our own
+		 * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
+		 * and the OpenCL macro CL_VERSION_1_2. However these
+		 * 2 macros detect the function availability in the
+		 * ICD and not in the device implementation.
+		 */
+		err = clEnqueueMarker(queue, &task_events[worker->devid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE]);
+		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
+		_STARPU_TRACE_START_EXECUTING();
+	}
+	else
+#else
+#ifdef STARPU_DEVEL
+#warning No OpenCL asynchronous execution with simgrid yet.
+#endif
+#endif
+	/* Synchronous execution */
+	{
+		_starpu_opencl_stop_job(j, worker);
+	}
+}
+
 #ifdef STARPU_USE_OPENCL
 int _starpu_run_opencl(struct _starpu_worker *workerarg)
 {

+ 2 - 2
src/profiling/bound.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -227,7 +227,7 @@ static void new_task(struct _starpu_job *j)
 	t->tag_id = j->task->tag_id;
 	t->use_tag = j->task->use_tag;
 	t->cl = j->task->cl;
-	t->footprint = _starpu_compute_buffers_footprint(NULL, STARPU_CPU_WORKER, 0, j);
+	t->footprint = _starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, STARPU_CPU_WORKER, 0, j);
 	t->priority = j->task->priority;
 	t->deps = NULL;
 	t->depsn = 0;

+ 7 - 2
src/top/starpu_top_connection.c

@@ -19,9 +19,14 @@
 
 #include <starpu_config.h>
 
-#ifdef STARPU_HAVE_WINDOWS
+#ifdef __MINGW__
 #  include <w32api.h>
+#  define WINVER WindowsXP
+#endif
+
+#ifdef STARPU_HAVE_WINDOWS
 #  include <ws2tcpip.h>
+#  include <io.h>
 #else
 #  include <sys/socket.h>
 #  include <netinet/in.h>
@@ -122,7 +127,7 @@ void _starpu_top_communications_threads_launcher(void)
    	}
   	int sock=socket(ans->ai_family, ans->ai_socktype, ans->ai_protocol);
 	int optval = 1;
-	setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval));
+	setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void*) &optval, sizeof(optval));
 
 	if (bind(sock, ans->ai_addr, ans->ai_addrlen) < 0)
 	{

+ 14 - 18
src/worker_collection/worker_tree.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2013  Université de Bordeaux 1
- * Copyright (C) 2012-2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012-2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011-2013  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -29,7 +29,7 @@ static unsigned tree_has_next(struct starpu_worker_collection *workers, struct s
 
 	struct starpu_tree *tree = (struct starpu_tree*)workers->workerids;
 	struct starpu_tree *neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->present);
-	
+
 	if(!neighbour)
 	{
 		starpu_tree_reset_visited(tree, it->visited);
@@ -58,7 +58,7 @@ static unsigned tree_has_next(struct starpu_worker_collection *workers, struct s
 static int tree_get_next(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
 {
 	int ret = -1;
-	
+
 	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
 	struct starpu_tree *neighbour = NULL;
 	if(it->possible_value)
@@ -68,10 +68,10 @@ static int tree_get_next(struct starpu_worker_collection *workers, struct starpu
 	}
 	else
 		neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->present);
-	
+
 	STARPU_ASSERT_MSG(neighbour, "no element anymore");
-	
-	
+
+
 	int workerids[STARPU_NMAXWORKERS];
 	int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
 	int w;
@@ -97,7 +97,7 @@ static unsigned tree_has_next_master(struct starpu_worker_collection *workers, s
 
 	struct starpu_tree *tree = (struct starpu_tree*)workers->workerids;
 	struct starpu_tree *neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->is_master);
-	
+
 	if(!neighbour)
 	{
 		starpu_tree_reset_visited(tree, it->visited);
@@ -126,7 +126,7 @@ static unsigned tree_has_next_master(struct starpu_worker_collection *workers, s
 static int tree_get_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
 {
 	int ret = -1;
-	
+
 	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
 	struct starpu_tree *neighbour = NULL;
 	if(it->possible_value)
@@ -136,10 +136,10 @@ static int tree_get_next_master(struct starpu_worker_collection *workers, struct
 	}
 	else
 		neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->is_master);
-	
+
 	STARPU_ASSERT_MSG(neighbour, "no element anymore");
-	
-	
+
+
 	int workerids[STARPU_NMAXWORKERS];
 	int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
 	int w;
@@ -160,23 +160,19 @@ static int tree_get_next_master(struct starpu_worker_collection *workers, struct
 
 static int tree_add(struct starpu_worker_collection *workers, int worker)
 {
-	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
-
 	if(!workers->present[worker])
 	{
 		workers->present[worker] = 1;
 		workers->nworkers++;
 		return worker;
 	}
-	else 
+	else
 		return -1;
 }
 
 
 static int tree_remove(struct starpu_worker_collection *workers, int worker)
 {
-	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
-
 	if(workers->present[worker])
 	{
 		workers->present[worker] = 0;
@@ -184,7 +180,7 @@ static int tree_remove(struct starpu_worker_collection *workers, int worker)
 		workers->nworkers--;
 		return worker;
 	}
-	else 
+	else
 		return -1;
 }
 
@@ -200,7 +196,7 @@ static void tree_init(struct starpu_worker_collection *workers)
 		workers->present[i] = 0;
 		workers->is_master[i] = 0;
 	}
-	
+
 	return;
 }
 

+ 6 - 0
tests/main/driver_api/run_driver.c

@@ -80,6 +80,8 @@ test_cpu(void)
 	if (ret == -ENODEV || starpu_cpu_worker_get_count() == 0)
 	{
 		FPRINTF(stderr, "WARNING: No CPU worker found\n");
+		if (ret == 0)
+			starpu_shutdown();
 		return STARPU_TEST_SKIPPED;
 	}
 
@@ -138,6 +140,8 @@ test_cuda(void)
 	if (ret == -ENODEV || starpu_cuda_worker_get_count() == 0)
 	{
 		FPRINTF(stderr, "WARNING: No CUDA worker found\n");
+		if (ret == 0)
+			starpu_shutdown();
 		return STARPU_TEST_SKIPPED;
 	}
 
@@ -222,6 +226,8 @@ test_opencl(void)
 	if (ret == -ENODEV || starpu_opencl_worker_get_count() == 0)
 	{
 		FPRINTF(stderr, "WARNING: No OpenCL workers found\n");
+		if (ret == 0)
+			starpu_shutdown();
 		return STARPU_TEST_SKIPPED;
 	}
 

+ 41 - 0
tests/main/subgraph_repeat_regenerate_tag.c

@@ -51,6 +51,9 @@ static unsigned niter = 16384;
 static struct starpu_task taskA, taskB, taskC, taskD;
 
 static unsigned loop_cnt = 0;
+static unsigned loop_cnt_A = 0;
+static unsigned loop_cnt_B = 0;
+static unsigned loop_cnt_C = 0;
 static unsigned *check_cnt;
 static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
 static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
@@ -78,6 +81,39 @@ static struct starpu_codelet dummy_codelet =
 	.nbuffers = 1
 };
 
+static void callback_task_A(void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+	loop_cnt_A++;
+
+	if (loop_cnt_A == niter)
+	{
+		/* We are done */
+		taskA.regenerate = 0;
+	}
+}
+
+static void callback_task_B(void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+	loop_cnt_B++;
+
+	if (loop_cnt_B == niter)
+	{
+		/* We are done */
+		taskB.regenerate = 0;
+	}
+}
+
+static void callback_task_C(void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+	loop_cnt_C++;
+
+	if (loop_cnt_C == niter)
+	{
+		/* We are done */
+		taskC.regenerate = 0;
+	}
+}
+
 static void callback_task_D(void *arg STARPU_ATTRIBUTE_UNUSED)
 {
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
@@ -127,6 +163,7 @@ int main(int argc, char **argv)
 	taskA.regenerate = 1; /* this task will be explicitely resubmitted if needed */
 	taskA.use_tag = 1;
 	taskA.tag_id = TAG_A;
+	taskA.callback_func = callback_task_A;
 	taskA.handles[0] = check_data;
 
 	starpu_task_init(&taskB);
@@ -136,6 +173,7 @@ int main(int argc, char **argv)
 	taskB.regenerate = 1;
 	taskB.use_tag = 1;
 	taskB.tag_id = TAG_B;
+	taskB.callback_func = callback_task_B;
 	taskB.handles[0] = check_data;
 
 	starpu_task_init(&taskC);
@@ -145,6 +183,7 @@ int main(int argc, char **argv)
 	taskC.regenerate = 1;
 	taskC.use_tag = 1;
 	taskC.tag_id = TAG_C;
+	taskC.callback_func = callback_task_C;
 	taskC.handles[0] = check_data;
 
 	starpu_task_init(&taskD);
@@ -184,6 +223,8 @@ int main(int argc, char **argv)
 
 	starpu_free(check_cnt);
 
+	starpu_data_unregister(check_data);
+
 	starpu_shutdown();
 
 	/* Cleanup the statically allocated tasks after shutdown, as StarPU is still working on it after the callback */

+ 28 - 0
tests/main/subgraph_repeat_tag.c

@@ -44,6 +44,8 @@ static unsigned niter = 16384;
 static struct starpu_task taskA, taskB, taskC, taskD;
 
 static unsigned loop_cnt = 0;
+static unsigned loop_cnt_B = 0;
+static unsigned loop_cnt_C = 0;
 static unsigned *check_cnt;
 static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
 static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
@@ -71,6 +73,28 @@ static struct starpu_codelet dummy_codelet =
 	.nbuffers = 1
 };
 
+static void callback_task_B(void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+	loop_cnt_B++;
+
+	if (loop_cnt_B == niter)
+	{
+		/* We are done */
+		taskB.regenerate = 0;
+	}
+}
+
+static void callback_task_C(void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+	loop_cnt_C++;
+
+	if (loop_cnt_C == niter)
+	{
+		/* We are done */
+		taskC.regenerate = 0;
+	}
+}
+
 static void callback_task_D(void *arg STARPU_ATTRIBUTE_UNUSED)
 {
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
@@ -126,6 +150,7 @@ int main(int argc, char **argv)
 	taskB.cl_arg = &taskB;
 	taskB.cl_arg_size = sizeof(&taskB);
 	taskB.regenerate = 1;
+	taskB.callback_func = callback_task_B;
 	taskB.handles[0] = check_data;
 
 	starpu_task_init(&taskC);
@@ -133,6 +158,7 @@ int main(int argc, char **argv)
 	taskC.cl_arg = &taskC;
 	taskC.cl_arg_size = sizeof(&taskC);
 	taskC.regenerate = 1;
+	taskC.callback_func = callback_task_C;
 	taskC.handles[0] = check_data;
 
 	starpu_task_init(&taskD);
@@ -168,6 +194,8 @@ int main(int argc, char **argv)
 
 	starpu_free(check_cnt);
 
+	starpu_data_unregister(check_data);
+
 	starpu_shutdown();
 
 	/* Cleanup the statically allocated tasks after shutdown, as StarPU is still working on it after the callback */

+ 2 - 2
tests/microbenchs/matrix_as_vector.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -186,7 +186,7 @@ int check_size_on_device(uint32_t where, char *device_name)
 	matrix_codelet.nbuffers = 1;
 	if (where == STARPU_CPU) matrix_codelet.cpu_funcs[0] = matrix_cpu_func;
 	if (where == STARPU_CUDA) matrix_codelet.cuda_funcs[0] = matrix_cuda_func;
-	if (where == STARPU_CUDA) vector_codelet.cuda_flags[0] = STARPU_CUDA_ASYNC;
+	if (where == STARPU_CUDA) matrix_codelet.cuda_flags[0] = STARPU_CUDA_ASYNC;
 //	if (where == STARPU_OPENCL) matrix_codelet.opencl_funcs[0] = matrix_opencl_func;
 
 	for(nx=NX_MIN ; nx<=NX_MAX ; nx*=2)

+ 34 - 8
tests/overlap/gpu_concurrency.c

@@ -24,27 +24,48 @@
 #include <common/thread.h>
 
 #define NITERS 1000000
-#define NTASKS 128
+#define NTASKS 64
+#define SYNC 16
 
 #ifdef STARPU_USE_CUDA
 extern void long_kernel_cuda(unsigned long niters);
-void codelet_long_kernel(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+
+void codelet_long_kernel_async(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	long_kernel_cuda(NITERS);
+}
+
+void codelet_long_kernel_sync(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	long_kernel_cuda(NITERS);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
-static struct starpu_perfmodel model =
+static struct starpu_perfmodel model_async =
 {
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "long_kernel",
+	.symbol = "long_kernel_async",
 };
 
-static struct starpu_codelet cl =
+static struct starpu_perfmodel model_sync =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "long_kernel_sync",
+};
+
+static struct starpu_codelet cl_async =
 {
-	.cuda_funcs = {codelet_long_kernel, NULL},
+	.cuda_funcs = {codelet_long_kernel_async, NULL},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 0,
-	.model =  &model
+	.model =  &model_async,
+};
+
+static struct starpu_codelet cl =
+{
+	.cuda_funcs = {codelet_long_kernel_sync, NULL},
+	.nbuffers = 0,
+	.model =  &model_sync,
 };
 #endif
 
@@ -53,6 +74,7 @@ int main(int argc, char **argv)
 #ifndef STARPU_USE_CUDA
 	return STARPU_TEST_SKIPPED;
 #else
+	setenv("STARPU_NWORKER_PER_CUDA", "4", 1);
 	int ret = starpu_initialize(NULL, &argc, &argv);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
@@ -66,7 +88,11 @@ int main(int argc, char **argv)
 	for (iter = 0; iter < NTASKS; iter++)
 	{
 		struct starpu_task *task = starpu_task_create();
-		task->cl = &cl;
+
+		if (!(iter % SYNC))
+			task->cl = &cl;
+		else
+			task->cl = &cl_async;
 
 		ret = starpu_task_submit(task);
 		if (ret == -ENODEV) goto enodev;

+ 41 - 25
tools/gdbinit

@@ -1,7 +1,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010-2013  Université de Bordeaux 1
+# Copyright (C) 2010-2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -30,7 +30,7 @@ define starpu-print-job
     printf "\tsubmitted:\t\t\t<%d>\n", $job->submitted
     printf "\tterminated:\t\t\t<%d>\n", $job->terminated
     printf "\tjob_id:\t\t\t\t<%d>\n", $job->job_id
-    if $job->task
+    if $job->task && $job->task->name
         printf "\tname:\t\t\t\t<%s>\n", $job->task->name
     end
   end
@@ -71,7 +71,9 @@ define starpu-print-task
   end
 
   printf "StarPU Task (%p)\n", $task
-  printf "\tname:\t\t\t\t<%s>\n", $task->name
+  if $task->name
+    printf "\tname:\t\t\t\t<%s>\n", $task->name
+  end
   printf "\tcodelet:\t\t\t<%p>\n", $task->cl
   printf "\tcallback:\t\t\t<%p>\n", $task->callback_func
   printf "\tsynchronous:\t\t\t<%d>\n", $task->synchronous
@@ -90,6 +92,32 @@ define starpu-print-task
   end
 end
 
+define starpu-print-task-and-successor
+  set language c
+  set $t = (struct starpu_task *) ($arg0)
+  starpu-print-task $t
+  set $j = (struct _starpu_job *) $t->starpu_private
+  set $nsuccs = $j->job_successors.nsuccs
+  set $i = 0
+  while $i < $nsuccs
+    set $cg = $j->job_successors.succ[$i]
+    if ($cg->cg_type == 1)
+      # STARPU_CG_APPS
+      printf "waited for by application"
+    end
+    if ($cg->cg_type == 2)
+      # STARPU_CG_TAG
+      printf "will produce tag %x\n", $cg->succ.tag
+    end
+    if ($cg->cg_type == 4)
+      # STARPU_CG_TASK
+      printf "dep of task %p\n", $cg->succ.job
+      starpu-print-task $cg->succ.job->task
+    end
+    set $i = $i + 1
+  end
+end
+
 document starpu-print-task
 Prints a StarPU task
 end
@@ -150,30 +178,18 @@ define starpu-tasks
   printf "Tasks being run:\n"
   set $n = 0
   while $n < config.topology.nworkers
+    printf "worker %d %s:\n", $n, config.workers[$n].short_name
+    if config.workers[$n].pipeline_length > 0
+      set $m = 0
+      while $m < config.workers[$n].ntasks
+        set $t = config.workers[$n].current_tasks[(config.workers[$n].first_task + $m) % (sizeof(config.workers[$n].current_tasks)/sizeof(config.workers[$n].current_tasks[0]))]
+        starpu-print-task-and-successor $t
+        set $m = $m + 1
+      end
+    end
     set $task = config.workers[$n].current_task
     if ($task)
-      printf "worker %d:\n", $n
-      starpu-print-task $task
-      set $j = (struct _starpu_job *) $task->starpu_private
-      set $nsuccs = $j->job_successors.nsuccs
-      set $i = 0
-      while $i < $nsuccs
-        set $cg = $j->job_successors.succ[$i]
-	if ($cg->cg_type == 1)
-	  # STARPU_CG_APPS
-	  printf "waited for by application"
-	end
-	if ($cg->cg_type == 2)
-	  # STARPU_CG_TAG
-	  printf "will produce tag %x\n", $cg->succ.tag
-	end
-	if ($cg->cg_type == 4)
-	  # STARPU_CG_TASK
-	  printf "dep of task %p\n", $cg->succ.job
-	  starpu-print-task $cg->succ.job->task
-	end
-        set $i = $i + 1
-      end
+      starpu-print-task-and-successor $task
     end
     set $n = $n + 1
   end

+ 1 - 1
tools/starpu_calibrate_bus.c

@@ -17,7 +17,7 @@
 #include <config.h>
 #include <starpu.h>
 #include <stdio.h>
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 #include <windows.h>
 #endif
 

+ 4 - 4
tools/starpu_fxt_stats.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -37,7 +37,7 @@ static uint64_t transfers[16][16];
 
 #define PROGNAME "starpu_fxt_stat"
 
-static void usage(char **argv)
+static void usage()
 {
 	fprintf(stderr, "Parse the log generated by FxT\n\n");
 	fprintf(stderr, "Usage: %s [ options ]\n", PROGNAME);
@@ -73,7 +73,7 @@ static void parse_args(int argc, char **argv, char **fin, char **fout)
 
 		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
 		{
-			usage(argv);
+			usage();
 			exit(EXIT_SUCCESS);
 		}
 
@@ -87,7 +87,7 @@ static void parse_args(int argc, char **argv, char **fin, char **fout)
 	if (!*fin)
 	{
 		fprintf(stderr, "Incorrect usage, aborting\n");
-                usage(argv);
+                usage();
 		exit(77);
 	}
 }

+ 18 - 12
tools/starpu_fxt_tool.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2014  Universite de Bordeaux 1
- * Copyright (C) 2012-2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012-2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,7 @@
 
 #define PROGNAME "starpu_fxt_tool"
 
-static void usage(char **argv)
+static void usage()
 {
 	fprintf(stderr, "Generate a trace in the Paje format\n\n");
 	fprintf(stderr, "Usage: %s [ options ]\n", PROGNAME);
@@ -53,40 +53,46 @@ static void parse_args(int argc, char **argv)
 	unsigned reading_input_filenames = 0;
 
 	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-c") == 0) {
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-c") == 0)
+		{
 			options.per_task_colour = 1;
 			reading_input_filenames = 0;
 			continue;
 		}
 
-		if (strcmp(argv[i], "-o") == 0) {
+		if (strcmp(argv[i], "-o") == 0)
+		{
 			options.out_paje_path = argv[++i];
 			reading_input_filenames = 0;
 			continue;
 		}
 
-		if (strcmp(argv[i], "-i") == 0) {
+		if (strcmp(argv[i], "-i") == 0)
+		{
 			options.filenames[options.ninputfiles++] = argv[++i];
 			reading_input_filenames = 1;
 			continue;
 		}
 
-		if (strcmp(argv[i], "-no-counter") == 0) {
+		if (strcmp(argv[i], "-no-counter") == 0)
+		{
 			options.no_counter = 1;
 			reading_input_filenames = 0;
 			continue;
 		}
 
-		if (strcmp(argv[i], "-no-bus") == 0) {
+		if (strcmp(argv[i], "-no-bus") == 0)
+		{
 			options.no_bus = 1;
 			reading_input_filenames = 0;
 			continue;
 		}
 
-		if (strcmp(argv[i], "-h") == 0
-		 || strcmp(argv[i], "--help") == 0) {
-			usage(argv);
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
+		{
+			usage();
 			exit(EXIT_SUCCESS);
 		}
 
@@ -109,7 +115,7 @@ static void parse_args(int argc, char **argv)
 	if (!options.ninputfiles)
 	{
 		fprintf(stderr, "Incorrect usage, aborting\n");
-                usage(argv);
+                usage();
 		exit(77);
 	}
 }

+ 2 - 2
tools/starpu_perfmodel_display.c

@@ -24,7 +24,7 @@
 
 #include <starpu.h>
 
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 #include <windows.h>
 #endif
 
@@ -141,7 +141,7 @@ static void parse_args(int argc, char **argv)
 
 int main(int argc, char **argv)
 {
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 	WSADATA wsadata;
 	WSAStartup(MAKEWORD(1,0), &wsadata);
 #endif

+ 17 - 5
tools/starpu_perfmodel_plot.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2014  Université de Bordeaux 1
- * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -31,7 +31,7 @@
 #include <core/perfmodel/perfmodel.h> // we need to browse the list associated to history-based models
 #include <core/workers.h>
 
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 #include <windows.h>
 #endif
 
@@ -170,6 +170,18 @@ static void parse_args(int argc, char **argv)
 
 }
 
+static char *replace_char(char *str, char old, char new)
+{
+	char *p = strdup(str);
+	char *ptr = p;
+	while (*ptr)
+	{
+		if (*ptr == old) *ptr = new;
+		ptr ++;
+	}
+	return p;
+}
+
 static void print_comma(FILE *gnuplot_file, int *first)
 {
 	if (*first)
@@ -308,7 +320,7 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 						if (arch_model->list)
 						{
 							print_comma(gnuplot_file, first);
-							fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Average %s\"", avg_file_name, col, col+1, arch_name);
+							fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Average %s\"", avg_file_name, col, col+1, replace_char(arch_name, '_', '-'));
 							col += 2;
 						}
 					}
@@ -503,7 +515,7 @@ static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel
 	fprintf(gnuplot_file, "\n");
 	fprintf(gnuplot_file, "set term postscript eps enhanced color\n");
 	fprintf(gnuplot_file, "set output \"starpu_%s.eps\"\n", symbol);
-	fprintf(gnuplot_file, "set title \"Model for codelet %s\"\n", symbol);
+	fprintf(gnuplot_file, "set title \"Model for codelet %s\"\n", replace_char(symbol, '_', '-'));
 	fprintf(gnuplot_file, "set xlabel \"Total data size\"\n");
 	if (gflops)
 		fprintf(gnuplot_file, "set ylabel \"GFlops\"\n");
@@ -637,7 +649,7 @@ int main(int argc, char **argv)
 	int ret;
 	struct starpu_perfmodel model = {};
 
-#ifdef __MINGW32__
+#if defined(_WIN32) && !defined(__CYGWIN__)
 	WSADATA wsadata;
 	WSAStartup(MAKEWORD(1,0), &wsadata);
 #endif