@@ -517,6 +517,7 @@ so:
* Hello World:: Submitting Tasks
* Scaling a Vector:: Manipulating Data
* Vector Scaling on an Hybrid CPU/GPU Machine:: Handling Heterogeneous Architectures
+* Using multiple implementations of a codelet::
* Task and Worker Profiling::
* Partitioning Data:: Partitioning Data
* Performance model example::
@@ -1141,6 +1142,50 @@ see @ref{Enabling OpenCL}):
0.000000 3.000000 6.000000 9.000000 12.000000
@end smallexample
+@node Using multiple implementations of a codelet
+@section Using multiple implementations of a codelet
+One may want to write multiple implementations of a codelet for a single type of
+device and let StarPU choose which one to run. As an example, we will show how
+to use SSE to scale a vector. The codelet can be written as follows:
+
+@cartouche
+@smallexample
+#include <xmmintrin.h>
+
+void scal_sse_func(void *buffers[], void *cl_arg)
+@{
+ float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
+ unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
+ unsigned int n_iterations = n/4;
+ if (n % 4 != 0)
+ n_iterations++;
+
+ __m128 *VECTOR = (__m128*) vector;
+ __m128 factor __attribute__((aligned(16)));
+ factor = _mm_set1_ps(*(float *) cl_arg);
+
+ unsigned int i;
+ for (i = 0; i < n_iterations; i++)
+ VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
+@}
+@end smallexample
+@end cartouche
+
+The @code{cpu_func} field of the @code{starpu_codelet} structure has to be set
+to the special value @code{STARPU_MULTIPLE_CPU_IMPLEMENTATIONS}, and the
+implementations themselves are listed in the @code{cpu_funcs} field. Note that
+@code{STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS} and
+@code{STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS} are also available.
+
+@cartouche
+@smallexample
+starpu_codelet cl = @{
+ .where = STARPU_CPU,
+ .cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS,
+ .cpu_funcs = @{ scal_cpu_func, scal_sse_func @},
+ .nbuffers = 1
+@};
+@end smallexample
+@end cartouche
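+
+Submitting a task that uses such a codelet is no different from submitting a
+task with a single implementation: StarPU chooses at runtime which of the
+listed functions to run. As a minimal sketch, assuming the
+@code{vector_handle} and @code{factor} variables from the vector scaling
+example (@pxref{Scaling a Vector}), submission could look as follows:
+
+@cartouche
+@smallexample
+struct starpu_task *task = starpu_task_create();
+task->cl = &cl;  /* StarPU picks scal_cpu_func or scal_sse_func */
+task->buffers[0].handle = vector_handle;
+task->buffers[0].mode = STARPU_RW;
+task->cl_arg = &factor;
+task->cl_arg_size = sizeof(factor);
+starpu_task_submit(task);
+@end smallexample
+@end cartouche
+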
@node Task and Worker Profiling
@section Task and Worker Profiling
@@ -2616,6 +2661,7 @@ Enable flags for the @code{gcov} coverage tool.
* --with-opencl-lib-dir::
* --enable-gordon::
* --with-gordon-dir::
+* --enable-maximplementations::
@end menu
@node --enable-maxcpus
@@ -2739,6 +2785,14 @@ Enable the use of the Gordon runtime for Cell SPUs.
Specify the location of the Gordon SDK.
@end table
+@node --enable-maximplementations
+@subsubsection @code{--enable-maximplementations=<number>}
+@table @asis
+@item @emph{Description}:
+Defines the maximum number of implementations that can be provided for a
+single kind of device. It is then available as the
+@code{STARPU_MAXIMPLEMENTATIONS} macro.
+@end table
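+
+For instance, the following call (the value @code{4} is only an illustration)
+allows up to four implementations per kind of device:
+
+@smallexample
+$ ./configure --enable-maximplementations=4
+@end smallexample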
+
@node Advanced configuration
@subsection Advanced configuration