Browse Source

- Add starpu_opencl_collect_stats function to collect statistics from OpenCL
- Call it from opencl kernels.
- Add used_cycles, stall_cycles, and power_consumed fields in profiling information
- Accumulate them in the worker information
- Add STARPU_PROFILING which enables profiling automatically.
- Add starpu_worker_profiling_helper_display_summary, similar to starpu_bus_profiling_helper_display_summary
- Add STARPU_BUS_STATS and STARPU_WORKER_STATS which calls those automatically
- Allocate the task profiling info structure also when calibrating performance models (for the power consumption). Fix code which assumed profiling_info => profile activated.
- Document all that.

TODO: add a power model history

Samuel Thibault 14 years ago
parent
commit
24b42e1038

+ 72 - 6
doc/starpu.texi

@@ -836,6 +836,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 @i{    int id, devid, err;}
 @i{    cl_kernel kernel;}
 @i{    cl_command_queue queue;}
+@i{    cl_event event;}
 
     /* length of the vector */
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
@@ -857,11 +858,13 @@ void scal_opencl_func(void *buffers[], void *_args)
 @i{    @{}
 @i{        size_t global=1;}
 @i{        size_t local=1;}
-@i{        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);}
+@i{        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);}
 @i{        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
 @i{    @}}
 
 @i{    clFinish(queue);}
+@i{    starpu_opencl_collect_stats(event);}
+@i{    clReleaseEvent(event);}
 
 @i{    starpu_opencl_release_kernel(kernel);}
 @}
@@ -1262,10 +1265,43 @@ More advanced examples include:
 @node Performance options
 @chapter Performance options worth knowing
 
-TODO: explain why execution should be tried with
-@code{STARPU_PREFETCH=1 STARPU_SCHED=dmda}, when to use
-@code{STARPU_CALIBRATE=2} to force re-calibration, and how to play with
-@code{STARPU_BETA=2} or more.
+TODO: improve!
+
+By default, StarPU uses a simple greedy scheduler. To improve performance,
+you should change the scheduler thanks to the @code{STARPU_SCHED} environment
+variable. For instancel @code{export STARPU_SCHED=dmda} . Use @code{help}
+to get the list of available schedulers.
+
+By default, StarPU does not enable data prefetching, because CUDA does
+not announce when too many data transfers were scheduled and can thus block
+unexpectedly... To enable data prefetching, use @code{export STARPU_PREFETCH=1}
+.
+
+StarPU will automatically calibrate codelets which have never been calibrated
+yet. To force continuing calibration, use @code{export STARPU_CALIBRATE=1}
+. To drop existing calibration information completely and re-calibrate from
+start, use @code{export STARPU_CALIBRATE=2}.
+
+Distributing tasks to balance the load induces data transfer penalty. StarPU
+thus needs to find a balance between both. The target function that StarPU
+tries to optimise is @code{alpha * T_execution + beta * T_data_transfer}, where
+@code{T_execution} is the estimated execution time of the codelet (usually
+accurate), and @code{T_data_transfer} is the estimated data transfer time. The
+latter is however estimated based on bus calibration before execution start,
+i.e. with an idle machine. When StarPU manages several GPUs, such estimation
+is not accurate any more. Beta can then be used to correct this by hand. For
+instance, you can use @code{export STARPU_BETA=2} to double the transfer
+time estimation, e.g. because there are two GPUs in the machine. This is of
+course imprecise, but in practice, a rough estimation already gives the good
+results that a precise estimation would give.
+
+Measuring the actual data transfer time is however on our TODO-list to
+accurately estimate data transfer penalty without the need of a hand-tuned beta parameter.
+
+Profiling can be enabled by using @code{export STARPU_PROFILING=1} or by
+calling @code{starpu_profiling_status_set} from the source code.
+Statistics on the execution can then be obtained by using @code{export
+STARPU_BUS_STATS=1} and @code{export STARPU_WORKER_STATS=1} .
 
 @c ---------------------------------------------------------------------
 @c Performance feedback
@@ -3496,6 +3532,7 @@ This forces sampling the bus performance model again.
 * starpu_timing_timespec_delay_us::  
 * starpu_timing_timespec_to_us::  
 * starpu_bus_profiling_helper_display_summary::  
+* starpu_worker_profiling_helper_display_summary::  
 @end menu
 
 @node starpu_profiling_status_set
@@ -3662,6 +3699,15 @@ TODO
 @code{void starpu_bus_profiling_helper_display_summary(void);}
 @end table
 
+@node starpu_worker_profiling_helper_display_summary
+@subsection @code{starpu_worker_profiling_helper_display_summary}
+@table @asis
+@item @emph{Description}:
+TODO
+@item @emph{Prototype}:
+@code{void starpu_worker_profiling_helper_display_summary(void);}
+@end table
+
 
 
 @node CUDA extensions
@@ -3718,9 +3764,10 @@ This function synchronously deinitializes the CUBLAS library on every CUDA devic
 @section OpenCL extensions
 
 @menu
-* Enabling OpenCL::             Enabling OpenCL
+* Enabling OpenCL::            Enabling OpenCL
 * Compiling OpenCL kernels::   Compiling OpenCL kernels
 * Loading OpenCL kernels::     Loading OpenCL kernels
+* OpenCL statistics::          Collecting statistics from OpenCL
 @end menu
 
 @node Enabling OpenCL
@@ -3821,6 +3868,25 @@ TODO
 @code{int starpu_opencl_release_kernel(cl_kernel kernel);}
 @end table
 
+@node OpenCL statistics
+@subsection OpenCL statistics
+
+@menu
+* starpu_opencl_collect_stats::   Collect statistics on a kernel execution
+@end menu
+
+@node starpu_opencl_collect_stats
+@subsubsection @code{starpu_opencl_collect_stats} -- Collect statistics on a kernel execution
+@table @asis
+@item @emph{Description}:
+After termination of the kernels, the OpenCL codelet should call this function
+to pass it the even returned by @code{clEnqueueNDRangeKernel}, to let StarPU
+collect statistics about the kernel execution (used cycles, consumed power).
+@item @emph{Prototype}:
+@code{int starpu_opencl_collect_stats(cl_event event);}
+@end table
+
+
 @node Cell extensions
 @section Cell extensions
 

+ 4 - 1
doc/vector_scal_opencl.texi

@@ -9,6 +9,7 @@ void scal_opencl_func(void *buffers[], void *_args)
     int id, devid, err;
     cl_kernel kernel;
     cl_command_queue queue;
+    cl_event event;
 
     /* length of the vector */
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
@@ -40,11 +41,13 @@ void scal_opencl_func(void *buffers[], void *_args)
         if (local > global) local=global;
 
         err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0,
-                                     NULL, NULL);
+                                     NULL, &event);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
     @}
 
     clFinish(queue);
+    starpu_opencl_collect_stats(event);
+    clReleaseEvent(event);
 
     starpu_opencl_release_kernel(kernel);
 @}

+ 5 - 2
examples/basic_examples/block_opencl.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,6 +23,7 @@ void opencl_codelet(void *descr[], void *_args)
 {
 	cl_kernel kernel;
 	cl_command_queue queue;
+	cl_event event;
 	int id, devid, err, n;
 	float *block = (float *)STARPU_BLOCK_GET_PTR(descr[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
@@ -51,11 +52,13 @@ void opencl_codelet(void *descr[], void *_args)
 
 	{
                 size_t global=nx*ny*nz;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 
 	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
 
         starpu_opencl_release_kernel(kernel);
 }

+ 5 - 2
examples/basic_examples/variable_kernels_opencl.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,6 +23,7 @@ void opencl_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	cl_kernel kernel;
 	cl_command_queue queue;
+	cl_event event;
 	int id, devid, err;
 
         id = starpu_worker_get_id();
@@ -38,11 +39,13 @@ void opencl_codelet(void *descr[], void *_args)
 	{
 		size_t global=1;
 		size_t local=1;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 
 	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
 
 	starpu_opencl_release_kernel(kernel);
 }

+ 10 - 2
examples/basic_examples/vector_scal_c.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,6 +20,8 @@
  *  1- how to declare a piece of data to StarPU (starpu_vector_data_register)
  *  2- how to describe which data are accessed by a task (task->buffers[0])
  *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
+ *
+ * This is a variant of vector_scal.c which shows it can be integrated with fortran.
  */
 
 #include "f77.h"
@@ -33,6 +35,11 @@ extern void scal_cpu_func(void *buffers[], void *_args);
 extern void scal_cuda_func(void *buffers[], void *_args);
 //extern void scal_opencl_func(void *buffers[], void *_args);
 
+static struct starpu_perfmodel_t vector_scal_model = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "vector_scale_model"
+};
+
 static starpu_codelet cl = {
 	.where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
 	/* CPU implementation of the codelet */
@@ -45,7 +52,8 @@ static starpu_codelet cl = {
 	/* OpenCL implementation of the codelet */
 	//.opencl_func = scal_opencl_func,
 	//#endif
-	.nbuffers = 1
+	.nbuffers = 1,
+	.model = &vector_scal_model
 };
 
 //#ifdef STARPU_USE_OPENCL

+ 5 - 2
examples/basic_examples/vector_scal_opencl.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -30,6 +30,7 @@ void scal_opencl_func(void *buffers[], void *_args)
         cl_int err;
 	cl_kernel kernel;
 	cl_command_queue queue;
+	cl_event event;
 
 	/* length of the vector */
 	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
@@ -59,11 +60,13 @@ void scal_opencl_func(void *buffers[], void *_args)
                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global) local=global;
 
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 
 	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
 
 	starpu_opencl_release_kernel(kernel);
 }

+ 5 - 2
examples/filters/fblock_opencl.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,6 +24,7 @@ void opencl_func(void *buffers[], void *cl_arg)
 	int id, devid, err;
 	cl_kernel kernel;
 	cl_command_queue queue;
+	cl_event event;
 
         int *factor = cl_arg;
 	int *block = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
@@ -51,11 +52,13 @@ void opencl_func(void *buffers[], void *cl_arg)
 
 	{
 		size_t global=nx*ny*nz;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 
 	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
 
 	starpu_opencl_release_kernel(kernel);
 }

+ 5 - 2
examples/incrementer/incrementer_kernels_opencl.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,6 +24,7 @@ void opencl_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	cl_kernel kernel;
 	cl_command_queue queue;
+	cl_event event;
 	int id, devid, err;
 
         id = starpu_worker_get_id();
@@ -46,11 +47,13 @@ void opencl_codelet(void *descr[], void *_args)
                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global) local=global;
 
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 
 	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
 
 	starpu_opencl_release_kernel(kernel);
 }

+ 5 - 2
examples/mandelbrot/mandelbrot.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -235,6 +235,7 @@ static void compute_block_opencl(void *descr[], void *cl_arg)
 
 	cl_kernel kernel;
 	cl_command_queue queue;
+	cl_event event;
 
 	int id = starpu_worker_get_id();
 	int devid = starpu_worker_get_devid(id);
@@ -254,8 +255,10 @@ static void compute_block_opencl(void *descr[], void *cl_arg)
 	unsigned dim = 16;
 	size_t local[2] = {dim, 1};
 	size_t global[2] = {width, block_size};
-	clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
+	clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, &event);
 	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
 	starpu_opencl_release_kernel(kernel);
 }
 #endif

+ 8 - 5
examples/matvecmult/matvecmult.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,8 +29,9 @@ void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 	float *matrix = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
 	float *vector = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
 	float *mult = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
-        int nx = STARPU_MATRIX_GET_NX(descr[0]);
-        int ny = STARPU_MATRIX_GET_NY(descr[0]);
+	int nx = STARPU_MATRIX_GET_NX(descr[0]);
+	int ny = STARPU_MATRIX_GET_NY(descr[0]);
+	cl_event event;
 
         id = starpu_worker_get_id();
         devid = starpu_worker_get_devid(id);
@@ -47,13 +48,15 @@ void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
         if (err) STARPU_OPENCL_REPORT_ERROR(err);
 
 	{
-                size_t global=nx*ny;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+		size_t global=nx*ny;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 
 	clFinish(queue);
 
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
 	starpu_opencl_release_kernel(kernel);
 }
 #endif

+ 5 - 2
examples/spmv/dw_spmv.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -34,6 +34,7 @@ void spmv_kernel_opencl(void *descr[], void *args)
 {
 	cl_kernel kernel;
 	cl_command_queue queue;
+	cl_event event;
 	int id, devid, err, n;
 
 	uint32_t nnz = STARPU_CSR_GET_NNZ(descr[0]);
@@ -71,11 +72,13 @@ void spmv_kernel_opencl(void *descr[], void *args)
 
 	{
                 size_t global=1024;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 
 	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
 
         starpu_opencl_release_kernel(kernel);
 }

+ 3 - 1
include/starpu_opencl.h

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -191,6 +191,8 @@ int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs);
 int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, struct starpu_opencl_program *opencl_programs, const char *kernel_name, int devid);
 int starpu_opencl_release_kernel(cl_kernel kernel);
 
+int starpu_opencl_collect_stats(cl_event event);
+
 
 #ifdef __cplusplus
 }

+ 10 - 1
include/starpu_profiling.h

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -57,6 +57,10 @@ struct starpu_task_profiling_info {
 
 	/* TODO add expected length, expected start/end ? */
 	int workerid;
+
+	uint64_t used_cycles;
+	uint64_t stall_cycles;
+	double power_consumed;
 };
 
 /* The timing is provided since the previous call to starpu_worker_get_profiling_info */
@@ -66,6 +70,10 @@ struct starpu_worker_profiling_info {
 	struct timespec executing_time;
 	struct timespec sleeping_time;
 	int executed_tasks;
+
+	uint64_t used_cycles;
+	uint64_t stall_cycles;
+	double power_consumed;
 };
 
 struct starpu_bus_profiling_info {
@@ -157,6 +165,7 @@ double starpu_timing_timespec_delay_us(struct timespec *start, struct timespec *
 double starpu_timing_timespec_to_us(struct timespec *ts);
 
 void starpu_bus_profiling_helper_display_summary(void);
+void starpu_worker_profiling_helper_display_summary(void);
 
 #ifdef __cplusplus
 }

+ 3 - 1
include/starpu_task.h

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -132,6 +132,8 @@ struct starpu_task {
 
 	unsigned status;
 
+	/* This gets filled when profiling is enabled by using
+	 * starpu_profiling_status_set */
 	struct starpu_task_profiling_info *profiling_info;
 
 	/* Predicted duration of the task. This field is only valid if the

+ 2 - 2
src/Makefile.am

@@ -1,6 +1,6 @@
 #
 # StarPU
-# Copyright (C) Université Bordeaux 1, CNRS 2008-2009 (see AUTHORS file)
+# Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -185,7 +185,7 @@ libstarpu_la_SOURCES = 						\
 	debug/structures_size.c					\
 	profiling/profiling.c					\
 	profiling/bound.c					\
-	profiling/bus_profiling_helpers.c
+	profiling/profiling_helpers.c
 
 if STARPU_USE_CPU
 libstarpu_la_SOURCES += drivers/cpu/driver_cpu.c

+ 4 - 3
src/core/task.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -237,14 +237,15 @@ int starpu_task_submit(struct starpu_task *task)
 	/* If profiling is activated, we allocate a structure to store the
 	 * appropriate info. */
 	struct starpu_task_profiling_info *info;
-	info = _starpu_allocate_profiling_info_if_needed();
+	int profiling = starpu_profiling_status_get();
+	info = _starpu_allocate_profiling_info_if_needed(task);
 	task->profiling_info = info;
 
 	/* The task is considered as block until we are sure there remains not
 	 * dependency. */
 	task->status = STARPU_TASK_BLOCKED;
 	
-	if (info)
+	if (profiling)
 		starpu_clock_gettime(&info->submit_time);
 
 	/* internally, StarPU manipulates a starpu_job_t which is a wrapper around a

+ 7 - 1
src/core/workers.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -442,6 +442,7 @@ static void _starpu_kill_all_workers(struct starpu_machine_config_s *config)
 
 void starpu_shutdown(void)
 {
+	const char *stats;
 	PTHREAD_MUTEX_LOCK(&init_mutex);
 	init_count--;
 	if (init_count)
@@ -460,6 +461,11 @@ void starpu_shutdown(void)
 #ifdef STARPU_DATA_STATS
 	_starpu_display_comm_amounts();
 #endif
+	if ((stats = getenv("STARPU_BUS_STATS")) && atoi(stats))
+		starpu_bus_profiling_helper_display_summary();
+
+	if ((stats = getenv("STARPU_WORKER_STATS")) && atoi(stats))
+		starpu_worker_profiling_helper_display_summary();
 
 	_starpu_deinitialize_registered_performance_models();
 

+ 4 - 3
src/drivers/cpu/driver_cpu.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -57,12 +57,13 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 	STARPU_TRACE_START_CODELET_BODY(j);
 
 	struct starpu_task_profiling_info *profiling_info;
+	int profiling = starpu_profiling_status_get();
 
 	if (rank == 0)
 	{
 		profiling_info = task->profiling_info;
 	
-		if (profiling_info || calibrate_model)
+		if ((profiling && profiling_info) || calibrate_model)
 		{
 			starpu_clock_gettime(&codelet_start);
 			_starpu_worker_register_executing_start_date(workerid, &codelet_start);
@@ -84,7 +85,7 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 	{
 		cl->per_worker_stats[workerid]++;
 		
-		if (profiling_info || calibrate_model)
+		if ((profiling && profiling_info) || calibrate_model)
 			starpu_clock_gettime(&codelet_end);
 	
 		STARPU_TRACE_END_CODELET_BODY(j);

+ 4 - 3
src/drivers/cuda/driver_cuda.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -176,9 +176,10 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 	STARPU_TRACE_START_CODELET_BODY(j);
 
 	struct starpu_task_profiling_info *profiling_info;
+	int profiling = starpu_profiling_status_get();
 	profiling_info = task->profiling_info;
 
-	if (profiling_info || calibrate_model)
+	if ((profiling && profiling_info) || calibrate_model)
 	{
 		starpu_clock_gettime(&codelet_start);
 		_starpu_worker_register_executing_start_date(workerid, &codelet_start);
@@ -193,7 +194,7 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 
 	cl->per_worker_stats[workerid]++;
 
-	if (profiling_info || calibrate_model)
+	if ((profiling && profiling_info) || calibrate_model)
 		starpu_clock_gettime(&codelet_end);
 
 	STARPU_TRACE_END_CODELET_BODY(j);	

+ 23 - 9
src/drivers/driver_common/driver_common.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,25 +29,39 @@ void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *
 {
 	struct timespec measured_ts;
 	double measured;
+	int workerid = worker_args->workerid;
 
 	if (profiling_info || calibrate_model)
 	{
 		starpu_timespec_sub(codelet_end, codelet_start, &measured_ts);
 		measured = starpu_timing_timespec_to_us(&measured_ts);
 
-		if (profiling_info)
+		if (starpu_profiling_status_get())
 		{
-			memcpy(&profiling_info->start_time, codelet_start, sizeof(struct timespec));
-			memcpy(&profiling_info->end_time, codelet_end, sizeof(struct timespec));
+			if (profiling_info)
+			{
+				memcpy(&profiling_info->start_time, codelet_start, sizeof(struct timespec));
+				memcpy(&profiling_info->end_time, codelet_end, sizeof(struct timespec));
 
-			int workerid = worker_args->workerid;
-			profiling_info->workerid = workerid;
-			
-			_starpu_worker_update_profiling_info_executing(workerid, &measured_ts, 1);
+				profiling_info->workerid = workerid;
+				
+				_starpu_worker_update_profiling_info_executing(workerid, &measured_ts, 1,
+					profiling_info->used_cycles,
+					profiling_info->stall_cycles,
+					profiling_info->power_consumed);
+			}
+		} else {
+			_starpu_worker_update_profiling_info_executing(workerid, 0, 1, 0, 0, 0);
 		}
 
-		if (calibrate_model)
+		if (calibrate_model) {
+			if (profiling_info && profiling_info->power_consumed) {
+				/* TODO: update power model history */
+			}
 			_starpu_update_perfmodel_history(j, perf_arch, worker_args->devid, measured);
+		}
+	} else {
+		_starpu_worker_update_profiling_info_executing(workerid, 0, 1, 0, 0, 0);
 	}
 }
 

+ 4 - 3
src/drivers/opencl/driver_opencl.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -514,9 +514,10 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 	STARPU_TRACE_START_CODELET_BODY(j);
 
 	struct starpu_task_profiling_info *profiling_info;
+	int profiling = starpu_profiling_status_get();
 	profiling_info = task->profiling_info;
 
-	if (profiling_info || calibrate_model)
+	if ((profiling && profiling_info) || calibrate_model)
 	{
 		starpu_clock_gettime(&codelet_start);
 		_starpu_worker_register_executing_start_date(workerid, &codelet_start);
@@ -531,7 +532,7 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 
 	cl->per_worker_stats[workerid]++;
 
-	if (profiling_info || calibrate_model)
+	if ((profiling && profiling_info) || calibrate_model)
 		starpu_clock_gettime(&codelet_end);
 
 	STARPU_TRACE_END_CODELET_BODY(j);

+ 46 - 1
src/drivers/opencl/driver_opencl_utils.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,6 +23,7 @@
 #include <sys/types.h>
 
 #include <starpu_opencl.h>
+#include <starpu_profiling.h>
 #include <core/workers.h>
 #include "driver_opencl_utils.h"
 #include "driver_opencl.h"
@@ -195,3 +196,47 @@ cl_int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs
         }
         return CL_SUCCESS;
 }
+
+int starpu_opencl_collect_stats(cl_event event)
+{
+	struct starpu_task *task = starpu_get_current_task();
+	struct starpu_task_profiling_info *info = task->profiling_info;
+
+#ifdef CL_PROFILING_CLOCK_CYCLE_COUNT
+	if (starpu_profiling_status_get() && info) {
+		cl_int err;
+		unsigned int clock_cycle_count;
+		size_t size;
+		err = clGetEventProfilingInfo(event, CL_PROFILING_CLOCK_CYCLE_COUNT, sizeof(clock_cycle_count), &clock_cycle_count, &size);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+		STARPU_ASSERT(size == sizeof(clock_cycle_count));
+		info->used_cycles += clock_cycle_count;
+	}
+#endif
+#ifdef CL_PROFILING_STALL_CYCLE_COUNT
+	if (starpu_profiling_status_get() && info) {
+		cl_int err;
+		unsigned int stall_cycle_count;
+		size_t size;
+		err = clGetEventProfilingInfo(event, CL_PROFILING_STALL_CYCLE_COUNT, sizeof(stall_cycle_count), &stall_cycle_count, &size);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+		STARPU_ASSERT(size == sizeof(stall_cycle_count));
+
+		info->stall_cycles += stall_cycle_count;
+	}
+#endif
+#ifdef CL_PROFILING_POWER_CONSUMED
+	if (info && (starpu_profiling_status_get() || (task->cl && task->cl->model && task->cl->model->benchmarking))) {
+		cl_int err;
+		double power_consumed;
+		size_t size;
+		err = clGetEventProfilingInfo(event, CL_PROFILING_POWER_CONSUMED, sizeof(power_consumed), &power_consumed, &size);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+		STARPU_ASSERT(size == sizeof(power_consumed));
+
+		info->power_consumed += power_consumed;
+	}
+#endif
+
+	return 0;
+}

+ 21 - 6
src/profiling/profiling.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -94,11 +94,14 @@ int starpu_profiling_status_get(void)
 void _starpu_profiling_init(void)
 {
 	int worker;
+	const char *env;
 	for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
 	{
 		PTHREAD_MUTEX_INIT(&worker_info_mutex[worker], NULL);
 		_starpu_worker_reset_profiling_info(worker);
 	}
+	if ((env = getenv("STARPU_PROFILING")) && atoi(env))
+		profiling = 1;
 }
 
 void starpu_profiling_terminate(void)
@@ -110,11 +113,12 @@ void starpu_profiling_terminate(void)
  *	Task profiling
  */
 
-struct starpu_task_profiling_info *_starpu_allocate_profiling_info_if_needed(void)
+struct starpu_task_profiling_info *_starpu_allocate_profiling_info_if_needed(struct starpu_task *task)
 {
 	struct starpu_task_profiling_info *info = NULL;
 
-	if (profiling)
+	/* If we are benchmarking, we need room for the power consumption */
+	if (profiling || (task->cl && task->cl->model && task->cl->model->benchmarking))
 	{
 		info = calloc(1, sizeof(struct starpu_task_profiling_info));
 		STARPU_ASSERT(info);
@@ -139,6 +143,10 @@ static void _do_starpu_worker_reset_profiling_info(int workerid)
 	starpu_timespec_clear(&worker_info[workerid].sleeping_time);
 
 	worker_info[workerid].executed_tasks = 0;
+
+	worker_info[workerid].used_cycles = 0;
+	worker_info[workerid].stall_cycles = 0;
+	worker_info[workerid].power_consumed = 0;
 	
 	/* We detect if the worker is already sleeping or doing some
 	 * computation */
@@ -221,7 +229,7 @@ void _starpu_worker_update_profiling_info_sleeping(int workerid, struct timespec
 }
 
 
-void _starpu_worker_update_profiling_info_executing(int workerid, struct timespec *executing_time, int executed_tasks)
+void _starpu_worker_update_profiling_info_executing(int workerid, struct timespec *executing_time, int executed_tasks, uint64_t used_cycles, uint64_t stall_cycles, double power_consumed)
 {
 	if (profiling)
 	{
@@ -229,16 +237,23 @@ void _starpu_worker_update_profiling_info_executing(int workerid, struct timespe
 
 		starpu_timespec_accumulate(&worker_info[workerid].executing_time, executing_time);
 
+		worker_info[workerid].used_cycles += used_cycles;
+		worker_info[workerid].stall_cycles += stall_cycles;
+		worker_info[workerid].power_consumed += power_consumed;
 		worker_info[workerid].executed_tasks += executed_tasks;
 	
 		PTHREAD_MUTEX_UNLOCK(&worker_info_mutex[workerid]);
-	}
+	} else /* Not thread safe, shouldn't be too much a problem */
+		worker_info[workerid].executed_tasks += executed_tasks;
 }
 
 int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profiling_info *info)
 {
 	if (!profiling)
-		return -EINVAL;
+	{
+		/* Not thread safe, shouldn't be too much a problem */
+		info->executed_tasks = worker_info[workerid].executed_tasks;
+	}
 
 	PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
 

+ 3 - 3
src/profiling/profiling.h

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,14 +24,14 @@
 
 /* Create a task profiling info structure (with the proper time stamps) in case
  * profiling is enabled. */
-struct starpu_task_profiling_info *_starpu_allocate_profiling_info_if_needed(void);
+struct starpu_task_profiling_info *_starpu_allocate_profiling_info_if_needed(struct starpu_task *task);
 
 /* Clear all the profiling info related to the worker. */
 void _starpu_worker_reset_profiling_info(int workerid);
 
 /* Update the per-worker profiling info after a task (or more) was executed.
  * This tells StarPU how much time was spent doing computation. */
-void _starpu_worker_update_profiling_info_executing(int workerid, struct timespec *executing_time, int executed_tasks);
+void _starpu_worker_update_profiling_info_executing(int workerid, struct timespec *executing_time, int executed_tasks, uint64_t used_cycles, uint64_t stall_cycles, double consumed_power);
 
 /* Update the per-worker profiling info when StarPU wakes up: this indicates
  * how much time was spent sleeping. */

+ 38 - 2
src/profiling/bus_profiling_helpers.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +22,8 @@ void starpu_bus_profiling_helper_display_summary(void)
 {
 	int long long sum_transferred = 0;
 
-	fprintf(stderr, "Data transfer statistics:\n");
+	fprintf(stderr, "\nData transfer statistics:\n");
+	fprintf(stderr,   "*************************\n");
 
 	int busid;
 	int bus_cnt = starpu_bus_get_count();
@@ -47,3 +48,38 @@ void starpu_bus_profiling_helper_display_summary(void)
 
 	fprintf(stderr, "Total transfers: %.2lf MB\n", (1.0*sum_transferred)/(1024*1024));
 }
+
+void starpu_worker_profiling_helper_display_summary(void)
+{
+	double sum_consumed = 0.;
+	int profiling = starpu_profiling_status_get();
+	fprintf(stderr, "\nWorker statistics:\n");
+	fprintf(stderr,   "******************\n");
+
+	int workerid;
+	int worker_cnt = starpu_worker_get_count();
+	for (workerid = 0; workerid < worker_cnt; workerid++)
+	{
+		struct starpu_worker_profiling_info info;
+		starpu_worker_get_profiling_info(workerid, &info);
+		char name[32];
+
+		starpu_worker_get_name(workerid, name, sizeof(name));
+
+		if (profiling) {
+			double total_time = starpu_timing_timespec_to_us(&info.total_time) / 1000.;
+			double executing_time = starpu_timing_timespec_to_us(&info.executing_time) / 1000.;
+			double sleeping_time = starpu_timing_timespec_to_us(&info.sleeping_time) / 1000.;
+
+			fprintf(stderr, "%-32s\n", name);
+			fprintf(stderr, "\t%d task(s)\n\ttotal: %.2lf ms executing: %.2lf ms sleeping: %.2lf\n\t%lu Mcy %lu Mcy stall\n\t%lf J consumed\n", info.executed_tasks, total_time, executing_time, sleeping_time, info.used_cycles/1000000, info.stall_cycles/1000000, info.power_consumed);
+		} else {
+			fprintf(stderr, "\t%-32s\tapproximately %d task(s)\n", name, info.executed_tasks);
+		}
+
+		sum_consumed += info.power_consumed;
+	}
+
+	if (profiling)
+		fprintf(stderr, "Total consumption: %.2lf J\n", sum_consumed);
+}

+ 9 - 4
tests/datawizard/sync_and_notify_data_opencl.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
+ * Copyright (C) Université Bordeaux 1, CNRS 2008-2011 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,6 +25,7 @@ void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args)
         unsigned *val = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	cl_kernel kernel;
 	cl_command_queue queue;
+	cl_event event;
 	int id, devid, err;
 
 	id = starpu_worker_get_id();
@@ -40,12 +41,13 @@ void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args)
 	{
 		size_t global=100;
 		size_t local=100;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 
 	clFinish(queue);
-
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
 	starpu_opencl_release_kernel(kernel);
 }
 
@@ -54,6 +56,7 @@ void opencl_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args)
 	unsigned *val = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	cl_kernel kernel;
 	cl_command_queue queue;
+	cl_event event;
 	int id, devid, err;
 
 	id = starpu_worker_get_id();
@@ -69,11 +72,13 @@ void opencl_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args)
 	{
 		size_t global=100;
 		size_t local=100;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 
 	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
 
 	starpu_opencl_release_kernel(kernel);
 }