15 年之前 · 3bfb66c3f8
--- a/doc/starpu.texi
+++ b/doc/starpu.texi
@@ -1963,7 +1963,7 @@ The definition of the codelet can be written as follows:
 
				 
			
 
				 @cartouche
			
 
				 @smallexample
			
 
				-void scal_func(void *buffers[], void *cl_arg)
			
 
				+void scal_cpu_func(void *buffers[], void *cl_arg)
			
 
				 @{
			
 
				     unsigned i;
			
 
				     float *factor = cl_arg;
			
@@ -1981,13 +1981,13 @@ void scal_func(void *buffers[], void *cl_arg)
 
				 
			
 
				 starpu_codelet cl = @{
			
 
				     .where = STARPU_CPU,
			
 
				-    .cpu_func = scal_func,
			
 
				+    .cpu_func = scal_cpu_func,
			
 
				     .nbuffers = 1
			
 
				 @};
			
 
				 @end smallexample
			
 
				 @end cartouche
			
 
				 
			
 
				-The second argument of the @code{scal_func} function contains a pointer to the
			
 
				+The second argument of the @code{scal_cpu_func} function contains a pointer to the
			
 
				 parameters of the codelet (given in @code{task->cl_arg}), so that we read the
			
 
				 constant factor from this pointer. The first argument is an array that gives
			
 
				 a description of all the buffers passed in the @code{task->buffers}@ array. The
			
@@ -2052,9 +2052,9 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 
				     float *val = (float *)STARPU_GET_VECTOR_PTR(vector);
			
 
				 
			
 
				     /* TODO: use more blocks and threads in blocks */
			
 
				-    vector_mult_cuda<<<1,1>>>(val, n, *factor);
			
 
				+@i{    vector_mult_cuda<<<1,1>>>(val, n, *factor);}
			
 
				 
			
 
				-    cudaThreadSynchronize();
			
 
				+@i{    cudaThreadSynchronize();}
			
 
				 @}
			
 
				 @end smallexample
			
 
				 @end cartouche
			
@@ -2080,46 +2080,46 @@ __kernel void vector_mult_opencl(__global float* val, int nx, float factor)
 
				 @cartouche
			
 
				 @smallexample
			
 
				 #include <starpu.h>
			
 
				-#include <starpu_opencl.h>
			
 
				+@i{#include <starpu_opencl.h>}
			
 
				 
			
 
				-extern struct starpu_opencl_codelet codelet;
			
 
				+@i{extern struct starpu_opencl_codelet codelet;}
			
 
				 
			
 
				 void scal_opencl_func(void *buffers[], void *_args)
			
 
				 @{
			
 
				     float *factor = (float *)_args;
			
 
				     struct starpu_vector_interface_s *vector = (struct starpu_vector_interface_s *) buffers[0];
			
 
				-    int id, devid, err;
			
 
				-    cl_kernel kernel;
			
 
				-    cl_command_queue queue;
			
 
				+@i{    int id, devid, err;}
			
 
				+@i{    cl_kernel kernel;}
			
 
				+@i{    cl_command_queue queue;}
			
 
				 
			
 
				     /* length of the vector */
			
 
				     unsigned n = STARPU_GET_VECTOR_NX(vector);
			
 
				     /* local copy of the vector pointer */
			
 
				     float *val = (float *)STARPU_GET_VECTOR_PTR(vector);
			
 
				 
			
 
				-    id = starpu_worker_get_id();
			
 
				-    devid = starpu_worker_get_devid(id);
			
 
				+@i{    id = starpu_worker_get_id();}
			
 
				+@i{    devid = starpu_worker_get_devid(id);}
			
 
				 
			
 
				-    err = starpu_opencl_load_kernel(&kernel, &queue, &codelet,
			
 
				-                    "vector_mult_opencl", devid);   /* @b{Name of the codelet defined above} */
			
 
				-    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+@i{    err = starpu_opencl_load_kernel(&kernel, &queue, &codelet,}
			
 
				+@i{                    "vector_mult_opencl", devid);   /* @b{Name of the codelet defined above} */}
			
 
				+@i{    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
			
 
				 
			
 
				-    err = 0;
			
 
				-    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
			
 
				-    err = clSetKernelArg(kernel, 1, sizeof(int), &n);
			
 
				-    err |= clSetKernelArg(kernel, 2, sizeof(float), (void*)factor);
			
 
				-    if (err) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+@i{    err = 0;}
			
 
				+@i{    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);}
			
 
				+@i{    err = clSetKernelArg(kernel, 1, sizeof(int), &n);}
			
 
				+@i{    err |= clSetKernelArg(kernel, 2, sizeof(float), (void*)factor);}
			
 
				+@i{    if (err) STARPU_OPENCL_REPORT_ERROR(err);}
			
 
				 
			
 
				-    @{
			
 
				-        size_t global=1;
			
 
				-        size_t local=1;
			
 
				-        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			
 
				-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-    @}
			
 
				+@i{    @{}
			
 
				+@i{        size_t global=1;}
			
 
				+@i{        size_t local=1;}
			
 
				+@i{        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);}
			
 
				+@i{        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
			
 
				+@i{    @}}
			
 
				 
			
 
				-    clFinish(queue);
			
 
				+@i{    clFinish(queue);}
			
 
				 
			
 
				-    starpu_opencl_release(kernel);
			
 
				+@i{    starpu_opencl_release_kernel(kernel);}
			
 
				 @}
			
 
				 @end smallexample
			
 
				 @end cartouche
			
@@ -2128,29 +2128,28 @@ void scal_opencl_func(void *buffers[], void *_args)
 
				 @node Definition of the Main Code
			
 
				 @subsection Definition of the Main Code
			
 
				 
			
 
				-
			
 
				 The CPU implementation is the same as in the previous section.
			
 
				 
			
 
				 Here is the source of the main application. You can notice the value of the
			
 
				 field @code{where} for the codelet. We specify
			
 
				-@code{STARPU_CPU|STARPU_CUDA} to indicate to StarPU that the codelet
			
 
				-can be executed either on a CPU or on a CUDA device.
			
 
				+@code{STARPU_CPU|STARPU_CUDA|STARPU_OPENCL} to indicate to StarPU that the codelet
			
 
				+can be executed on a CPU, on a CUDA device, or on an OpenCL device.
			
 
				 
			
 
				 @cartouche
			
 
				 @smallexample
			
 
				 #include <starpu.h>
			
 
				 
			
 
				-#define NX 5
			
 
				+#define NX 2048
			
 
				 
			
 
				 extern void scal_cuda_func(void *buffers[], void *_args);
			
 
				-extern void scal_func(void *buffers[], void *_args);
			
 
				+extern void scal_cpu_func(void *buffers[], void *_args);
			
 
				 
			
 
				 /* @b{Definition of the codelet} */
			
 
				 static starpu_codelet cl = @{
			
 
				     .where = STARPU_CPU|STARPU_CUDA; /* @b{It can be executed on a CPU} */
			
 
				                                      /* @b{or on a CUDA device} */
			
 
				     .cuda_func = scal_cuda_func;
			
 
				-    .cpu_func = scal_func;
			
 
				+    .cpu_func = scal_cpu_func;
			
 
				     .nbuffers = 1;
			
 
				 @}
			
 
				 
			
@@ -2289,6 +2288,7 @@ or by disabling CUDA devices:
 
				 
			
 
				 @menu
			
 
				 * Main application::            
			
 
				+* CPU Codelet::                 
			
 
				 * CUDA Codelet::                
			
 
				 * OpenCL Codelet::              
			
 
				 @end menu
			
@@ -2300,6 +2300,13 @@ or by disabling CUDA devices:
 
				 @include vector_scal_c.texi
			
 
				 @end smallexample
			
 
				 
			
 
				+@node CPU Codelet
			
 
				+@section CPU Codelet
			
 
				+
			
 
				+@smallexample
			
 
				+@include vector_scal_cpu.texi
			
 
				+@end smallexample
			
 
				+
			
 
				 @node CUDA Codelet
			
 
				 @section CUDA Codelet
			
 
				 
			
--- a/doc/vector_scal_c.texi
+++ b/doc/vector_scal_c.texi
@@ -30,38 +30,7 @@
 
				 
			
 
				 #define    NX    2048
			
 
				 
			
 
				-/* This kernel takes a buffer and scales it by a constant factor */
			
 
				-static void scal_cpu_func(void *buffers[], void *cl_arg)
			
 
				-@{
			
 
				-    unsigned i;
			
 
				-    float *factor = cl_arg;
			
 
				-
			
 
				-    /* 
			
 
				-     * The "buffers" array matches the task->buffers array: for instance
			
 
				-     * task->buffers[0].handle is a handle that corresponds to a data with
			
 
				-     * vector "interface", so that the first entry of the array in the
			
 
				-     * codelet  is a pointer to a structure describing such a vector (ie.
			
 
				-     * struct starpu_vector_interface_s *). Here, we therefore manipulate
			
 
				-     * the buffers[0] element as a vector: nx gives the number of elements
			
 
				-     * in the array, ptr gives the location of the array (that was possibly
			
 
				-     * migrated/replicated), and elemsize gives the size of each elements.
			
 
				-     */
			
 
				-
			
 
				-    starpu_vector_interface_t *vector = buffers[0];
			
 
				-
			
 
				-    /* length of the vector */
			
 
				-    unsigned n = STARPU_GET_VECTOR_NX(vector);
			
 
				-
			
 
				-    /* get a pointer to the local copy of the vector : note that we have to
			
 
				-     * cast it in (float *) since a vector could contain any type of
			
 
				-     * elements so that the .ptr field is actually a uintptr_t */
			
 
				-    float *val = (float *)STARPU_GET_VECTOR_PTR(vector);
			
 
				-
			
 
				-    /* scale the vector */
			
 
				-    for (i = 0; i < n; i++)
			
 
				-        val[i] *= *factor;
			
 
				-@}
			
 
				-
			
 
				+extern void scal_cpu_func(void *buffers[], void *_args);
			
 
				 extern void scal_cuda_func(void *buffers[], void *_args);
			
 
				 extern void scal_opencl_func(void *buffers[], void *_args);
			
 
				 
			
--- a/doc/vector_scal_cpu.texi
+++ b/doc/vector_scal_cpu.texi
@@ -0,0 +1,53 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * This example complements vector_scal.c: here we implement a CPU version.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+/* This kernel takes a buffer and scales it by a constant factor */
			
 
				+void scal_cpu_func(void *buffers[], void *cl_arg)
			
 
				+@{
			
 
				+    unsigned i;
			
 
				+    float *factor = cl_arg;
			
 
				+
			
 
				+    /* 
			
 
				+     * The "buffers" array matches the task->buffers array: for instance
			
 
				+     * task->buffers[0].handle is a handle that corresponds to a data with
			
 
				+     * vector "interface", so that the first entry of the array in the
			
 
				+     * codelet  is a pointer to a structure describing such a vector (ie.
			
 
				+     * struct starpu_vector_interface_s *). Here, we therefore manipulate
			
 
				+     * the buffers[0] element as a vector: nx gives the number of elements
			
 
				+     * in the array, ptr gives the location of the array (that was possibly
			
 
				+     * migrated/replicated), and elemsize gives the size of each element.
			
 
				+     */
			
 
				+
			
 
				+    starpu_vector_interface_t *vector = buffers[0];
			
 
				+
			
 
				+    /* length of the vector */
			
 
				+    unsigned n = STARPU_GET_VECTOR_NX(vector);
			
 
				+
			
 
				+    /* get a pointer to the local copy of the vector : note that we have to
			
 
				+     * cast it to (float *) since a vector could contain any type of
			
 
				+     * elements so that the .ptr field is actually a uintptr_t */
			
 
				+    float *val = (float *)STARPU_GET_VECTOR_PTR(vector);
			
 
				+
			
 
				+    /* scale the vector */
			
 
				+    for (i = 0; i < n; i++)
			
 
				+        val[i] *= *factor;
			
 
				+@}