|
@@ -836,6 +836,7 @@ void scal_opencl_func(void *buffers[], void *_args)
|
|
|
@i{ int id, devid, err;}
|
|
|
@i{ cl_kernel kernel;}
|
|
|
@i{ cl_command_queue queue;}
|
|
|
+@i{ cl_event event;}
|
|
|
|
|
|
/* length of the vector */
|
|
|
unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
|
|
@@ -857,11 +858,13 @@ void scal_opencl_func(void *buffers[], void *_args)
|
|
|
@i{ @{}
|
|
|
@i{ size_t global=1;}
|
|
|
@i{ size_t local=1;}
|
|
|
-@i{ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);}
|
|
|
+@i{ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);}
|
|
|
@i{ if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
|
|
|
@i{ @}}
|
|
|
|
|
|
@i{ clFinish(queue);}
|
|
|
+@i{ starpu_opencl_collect_stats(event);}
|
|
|
+@i{ clReleaseEvent(event);}
|
|
|
|
|
|
@i{ starpu_opencl_release_kernel(kernel);}
|
|
|
@}
|
|
@@ -1262,10 +1265,43 @@ More advanced examples include:
|
|
|
@node Performance options
|
|
|
@chapter Performance options worth knowing
|
|
|
|
|
|
-TODO: explain why execution should be tried with
|
|
|
-@code{STARPU_PREFETCH=1 STARPU_SCHED=dmda}, when to use
|
|
|
-@code{STARPU_CALIBRATE=2} to force re-calibration, and how to play with
|
|
|
-@code{STARPU_BETA=2} or more.
|
|
|
+TODO: improve!
|
|
|
+
|
|
|
+By default, StarPU uses a simple greedy scheduler. To improve performance,
|
|
|
+you should change the scheduler thanks to the @code{STARPU_SCHED} environment
|
|
|
+variable. For instance, @code{export STARPU_SCHED=dmda}. Use @code{help}
|
|
|
+to get the list of available schedulers.
|
|
|
+
|
|
|
+By default, StarPU does not enable data prefetching, because CUDA does
|
|
|
+not announce when too many data transfers were scheduled and can thus block
|
|
|
+unexpectedly... To enable data prefetching, use
|
|
|
+@code{export STARPU_PREFETCH=1}.
|
|
|
+
|
|
|
+StarPU will automatically calibrate codelets which have never been calibrated
|
|
|
+yet. To force continuing calibration, use @code{export STARPU_CALIBRATE=1}.
|
|
|
+To drop existing calibration information completely and re-calibrate from
|
|
|
+start, use @code{export STARPU_CALIBRATE=2}.
|
|
|
+
|
|
|
+Distributing tasks to balance the load induces data transfer penalty. StarPU
|
|
|
+thus needs to find a balance between both. The target function that StarPU
|
|
|
+tries to optimise is @code{alpha * T_execution + beta * T_data_transfer}, where
|
|
|
+@code{T_execution} is the estimated execution time of the codelet (usually
|
|
|
+accurate), and @code{T_data_transfer} is the estimated data transfer time. The
|
|
|
+latter is however estimated based on bus calibration before execution start,
|
|
|
+i.e. with an idle machine. When StarPU manages several GPUs, such estimation
|
|
|
+is not accurate any more. Beta can then be used to correct this by hand. For
|
|
|
+instance, you can use @code{export STARPU_BETA=2} to double the transfer
|
|
|
+time estimation, e.g. because there are two GPUs in the machine. This is of
|
|
|
+course imprecise, but in practice a rough estimation already gives
|
|
|
+results as good as those a precise estimation would give.
|
|
|
+
|
|
|
+Measuring the actual data transfer time is however on our TODO-list to
|
|
|
+accurately estimate data transfer penalty without the need of a hand-tuned beta parameter.
|
|
|
+
|
|
|
+Profiling can be enabled by using @code{export STARPU_PROFILING=1} or by
|
|
|
+calling @code{starpu_profiling_status_set} from the source code.
|
|
|
+Statistics on the execution can then be obtained by using @code{export
|
|
|
+STARPU_BUS_STATS=1} and @code{export STARPU_WORKER_STATS=1}.
|
|
|
|
|
|
@c ---------------------------------------------------------------------
|
|
|
@c Performance feedback
|
|
@@ -3496,6 +3532,7 @@ This forces sampling the bus performance model again.
|
|
|
* starpu_timing_timespec_delay_us::
|
|
|
* starpu_timing_timespec_to_us::
|
|
|
* starpu_bus_profiling_helper_display_summary::
|
|
|
+* starpu_worker_profiling_helper_display_summary::
|
|
|
@end menu
|
|
|
|
|
|
@node starpu_profiling_status_set
|
|
@@ -3662,6 +3699,15 @@ TODO
|
|
|
@code{void starpu_bus_profiling_helper_display_summary(void);}
|
|
|
@end table
|
|
|
|
|
|
+@node starpu_worker_profiling_helper_display_summary
|
|
|
+@subsection @code{starpu_worker_profiling_helper_display_summary}
|
|
|
+@table @asis
|
|
|
+@item @emph{Description}:
|
|
|
+TODO
|
|
|
+@item @emph{Prototype}:
|
|
|
+@code{void starpu_worker_profiling_helper_display_summary(void);}
|
|
|
+@end table
|
|
|
+
|
|
|
|
|
|
|
|
|
@node CUDA extensions
|
|
@@ -3718,9 +3764,10 @@ This function synchronously deinitializes the CUBLAS library on every CUDA devic
|
|
|
@section OpenCL extensions
|
|
|
|
|
|
@menu
|
|
|
-* Enabling OpenCL:: Enabling OpenCL
|
|
|
+* Enabling OpenCL:: Enabling OpenCL
|
|
|
* Compiling OpenCL kernels:: Compiling OpenCL kernels
|
|
|
* Loading OpenCL kernels:: Loading OpenCL kernels
|
|
|
+* OpenCL statistics:: Collecting statistics from OpenCL
|
|
|
@end menu
|
|
|
|
|
|
@node Enabling OpenCL
|
|
@@ -3821,6 +3868,25 @@ TODO
|
|
|
@code{int starpu_opencl_release_kernel(cl_kernel kernel);}
|
|
|
@end table
|
|
|
|
|
|
+@node OpenCL statistics
|
|
|
+@subsection OpenCL statistics
|
|
|
+
|
|
|
+@menu
|
|
|
+* starpu_opencl_collect_stats:: Collect statistics on a kernel execution
|
|
|
+@end menu
|
|
|
+
|
|
|
+@node starpu_opencl_collect_stats
|
|
|
+@subsubsection @code{starpu_opencl_collect_stats} -- Collect statistics on a kernel execution
|
|
|
+@table @asis
|
|
|
+@item @emph{Description}:
|
|
|
+After termination of the kernels, the OpenCL codelet should call this function
|
|
|
+to pass it the event returned by @code{clEnqueueNDRangeKernel}, to let StarPU
|
|
|
+collect statistics about the kernel execution (used cycles, consumed power).
|
|
|
+@item @emph{Prototype}:
|
|
|
+@code{int starpu_opencl_collect_stats(cl_event event);}
|
|
|
+@end table
|
|
|
+
|
|
|
+
|
|
|
@node Cell extensions
|
|
|
@section Cell extensions
|
|
|
|