12 years ago · 7dce77ea0e
--- a/doc/doxygen/Makefile.am
+++ b/doc/doxygen/Makefile.am
@@ -40,8 +40,14 @@ chapters =	\
 
				 	chapters/environment_variables.doxy \
			
 
				 	chapters/configure_options.doxy \
			
 
				 	chapters/fdl-1.3.doxy \
			
 
				+	chapters/scaling-vector-example.doxy \
			
 
				 	chapters/hello_pragma2.c \
			
 
				 	chapters/hello_pragma.c \
			
 
				+	chapters/vector_scal_c.c \
			
 
				+	chapters/vector_scal_cpu.c \
			
 
				+	chapters/vector_scal_cuda.cu \
			
 
				+	chapters/vector_scal_opencl.c \
			
 
				+	chapters/vector_scal_opencl_codelet.cl \
			
 
				 	chapters/api/codelet_and_tasks.doxy \
			
 
				 	chapters/api/cuda_extensions.doxy \
			
 
				 	chapters/api/data_interfaces.doxy \
			
--- a/doc/doxygen/chapters/scaling-vector-example.doxy
+++ b/doc/doxygen/chapters/scaling-vector-example.doxy
@@ -0,0 +1,34 @@
 
				+/*
			
 
				+ * This file is part of the StarPU Handbook.
			
 
				+ * Copyright (C) 2009--2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
			
 
				+ * See the file version.doxy for copying conditions.
			
 
				+ */
			
 
				+
			
 
				+/*! \page FullSourceCodeVectorScal Full source code for the ’Scaling a Vector’ example
			
 
				+
			
 
				+\section MainApplication Main Application
			
 
				+
			
 
				+\include vector_scal_c.c
			
 
				+
			
 
				+\section CPUKernel CPU Kernel
			
 
				+
			
 
				+\include vector_scal_cpu.c
			
 
				+
			
 
				+\section CUDAKernel CUDA Kernel
			
 
				+
			
 
				+\include vector_scal_cuda.cu
			
 
				+
			
 
				+\section OpenCLKernel OpenCL Kernel
			
 
				+
			
 
				+\subsection InvokingtheKernel Invoking the Kernel
			
 
				+
			
 
				+\include vector_scal_opencl.c
			
 
				+
			
 
				+\subsection SourceoftheKernel Source of the Kernel
			
 
				+
			
 
				+\include vector_scal_opencl_codelet.cl
			
 
				+
			
 
				+*/
			
 
				+
			
--- a/doc/doxygen/chapters/vector_scal_c.c
+++ b/doc/doxygen/chapters/vector_scal_c.c
@@ -0,0 +1,127 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * This example demonstrates how to use StarPU to scale an array by a factor.
			
 
				+ * It shows how to manipulate data with StarPU's data management library.
			
 
				+ *  1- how to declare a piece of data to StarPU (starpu_vector_data_register)
			
 
				+ *  2- how to describe which data are accessed by a task (task->handles[0])
			
 
				+ *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
			
 
				+ */
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+#define    NX    2048
			
 
				+
			
 
				+extern void scal_cpu_func(void *buffers[], void *_args);
			
 
				+extern void scal_sse_func(void *buffers[], void *_args);
			
 
				+extern void scal_cuda_func(void *buffers[], void *_args);
			
 
				+extern void scal_opencl_func(void *buffers[], void *_args);
			
 
				+
			
 
				+static struct starpu_codelet cl = {
			
 
				+    .where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
			
 
				+    /* CPU implementation of the codelet */
			
 
				+    .cpu_funcs = { scal_cpu_func, scal_sse_func, NULL },
			
 
				+    .cpu_funcs_name = { "scal_cpu_func", "scal_sse_func", NULL },
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+    /* CUDA implementation of the codelet */
			
 
				+    .cuda_funcs = { scal_cuda_func, NULL },
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+    /* OpenCL implementation of the codelet */
			
 
				+    .opencl_funcs = { scal_opencl_func, NULL },
			
 
				+#endif
			
 
				+    .nbuffers = 1,
			
 
				+    .modes = { STARPU_RW }
			
 
				+};
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+struct starpu_opencl_program programs;
			
 
				+#endif
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+    /* We consider a vector of floats that is initialized just like any other C
			
 
				+      * data */
			
 
				+    float vector[NX];
			
 
				+    unsigned i;
			
 
				+    for (i = 0; i < NX; i++)
			
 
				+        vector[i] = 1.0f;
			
 
				+
			
 
				+    fprintf(stderr, "BEFORE: First element was %f\n", vector[0]);
			
 
				+
			
 
				+    /* Initialize StarPU with default configuration */
			
 
				+    starpu_init(NULL);
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        starpu_opencl_load_opencl_from_file(
			
 
				+               "examples/basic_examples/vector_scal_opencl_kernel.cl", &programs, NULL);
			
 
				+#endif
			
 
				+
			
 
				+    /* Tell StarPU to associate the "vector" vector with the "vector_handle"
			
 
				+     * identifier. When a task needs to access a piece of data, it should
			
 
				+     * refer to the handle that is associated to it.
			
 
				+     * In the case of the "vector" data interface:
			
 
				+     *  - the first argument of the registration method is a pointer to the
			
 
				+     *    handle that should describe the data
			
 
				+     *  - the second argument is the memory node where the data (ie. "vector")
			
 
				+     *    resides initially: 0 stands for an address in main memory, as
			
 
				+     *    opposed to an address on a GPU for instance.
			
 
				+     *  - the third argument is the address of the vector in RAM
			
 
				+     *  - the fourth argument is the number of elements in the vector
			
 
				+     *  - the fifth argument is the size of each element.
			
 
				+     */
			
 
				+    starpu_data_handle_t vector_handle;
			
 
				+    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
			
 
				+                                NX, sizeof(vector[0]));
			
 
				+
			
 
				+    float factor = 3.14;
			
 
				+
			
 
				+    /* create a synchronous task: any call to starpu_task_submit will block
			
 
				+      * until it is terminated */
			
 
				+    struct starpu_task *task = starpu_task_create();
			
 
				+    task->synchronous = 1;
			
 
				+
			
 
				+    task->cl = &cl;
			
 
				+
			
 
				+    /* the codelet manipulates one buffer in RW mode */
			
 
				+    task->handles[0] = vector_handle;
			
 
				+
			
 
				+    /* an argument is passed to the codelet, beware that this is a
			
 
				+     * READ-ONLY buffer and that the codelet may be given a pointer to a
			
 
				+     * COPY of the argument */
			
 
				+    task->cl_arg = &factor;
			
 
				+    task->cl_arg_size = sizeof(factor);
			
 
				+
			
 
				+    /* execute the task on any eligible computational resource */
			
 
				+    starpu_task_submit(task);
			
 
				+
			
 
				+    /* StarPU does not need to manipulate the array anymore so we can stop
			
 
				+      * monitoring it */
			
 
				+    starpu_data_unregister(vector_handle);
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+    starpu_opencl_unload_opencl(&programs);
			
 
				+#endif
			
 
				+
			
 
				+    /* terminate StarPU, no task can be submitted after */
			
 
				+    starpu_shutdown();
			
 
				+
			
 
				+    fprintf(stderr, "AFTER First element is %f\n", vector[0]);
			
 
				+
			
 
				+    return 0;
			
 
				+}
			
 
				+
			
--- a/doc/doxygen/chapters/vector_scal_cpu.c
+++ b/doc/doxygen/chapters/vector_scal_cpu.c
@@ -0,0 +1,75 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <xmmintrin.h>
			
 
				+
			
 
				+/* This kernel takes a buffer and scales it by a constant factor */
			
 
				+void scal_cpu_func(void *buffers[], void *cl_arg)
			
 
				+{
			
 
				+    unsigned i;
			
 
				+    float *factor = cl_arg;
			
 
				+
			
 
				+    /*
			
 
				+     * The "buffers" array matches the task->handles array: for instance
			
 
				+     * task->handles[0] is a handle that corresponds to a data with
			
 
				+     * vector "interface", so that the first entry of the array in the
			
 
				+     * codelet is a pointer to a structure describing such a vector (ie.
			
 
				+     * struct starpu_vector_interface *). Here, we therefore manipulate
			
 
				+     * the buffers[0] element as a vector: nx gives the number of elements
			
 
				+     * in the array, ptr gives the location of the array (that was possibly
			
 
				+     * migrated/replicated), and elemsize gives the size of each element.
			
 
				+     */
			
 
				+    struct starpu_vector_interface *vector = buffers[0];
			
 
				+
			
 
				+    /* length of the vector */
			
 
				+    unsigned n = STARPU_VECTOR_GET_NX(vector);
			
 
				+
			
 
				+    /* get a pointer to the local copy of the vector: note that we have to
			
 
				+     * cast it to (float *) since a vector could contain any type of
			
 
				+     * elements so that the .ptr field is actually a uintptr_t */
			
 
				+    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
			
 
				+
			
 
				+    /* scale the vector */
			
 
				+    for (i = 0; i < n; i++)
			
 
				+        val[i] *= *factor;
			
 
				+}
			
 
				+
			
 
				+void scal_sse_func(void *buffers[], void *cl_arg)
			
 
				+{
			
 
				+    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
			
 
				+    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				+    unsigned int n_iterations = n/4;
			
 
				+
			
 
				+    __m128 *VECTOR = (__m128*) vector;
			
 
				+    __m128 FACTOR __attribute__((aligned(16)));
			
 
				+    float factor = *(float *) cl_arg;
			
 
				+    FACTOR = _mm_set1_ps(factor);
			
 
				+
			
 
				+    unsigned int i;
			
 
				+    for (i = 0; i < n_iterations; i++)
			
 
				+        VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);
			
 
				+
			
 
				+    unsigned int remainder = n%4;
			
 
				+    if (remainder != 0)
			
 
				+    {
			
 
				+        unsigned int start = 4 * n_iterations;
			
 
				+        for (i = start; i < start+remainder; ++i)
			
 
				+        {
			
 
				+            vector[i] = factor * vector[i];
			
 
				+        }
			
 
				+    }
			
 
				+}
			
--- a/doc/doxygen/chapters/vector_scal_cuda.cu
+++ b/doc/doxygen/chapters/vector_scal_cuda.cu
@@ -0,0 +1,44 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+static __global__ void vector_mult_cuda(unsigned n, float *val,
			
 
				+                                        float factor)
			
 
				+{
			
 
				+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
			
 
				+        if (i < n)
			
 
				+               val[i] *= factor;
			
 
				+}
			
 
				+
			
 
				+extern "C" void scal_cuda_func(void *buffers[], void *_args)
			
 
				+{
			
 
				+        float *factor = (float *)_args;
			
 
				+
			
 
				+        /* length of the vector */
			
 
				+        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				+        /* local copy of the vector pointer */
			
 
				+        float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
			
 
				+        unsigned threads_per_block = 64;
			
 
				+        unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				+
			
 
				+        vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>
			
 
				+	                (n, val, *factor);
			
 
				+
			
 
				+        cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				+}
			
 
				+
			
--- a/doc/doxygen/chapters/vector_scal_cuda.texi
+++ b/doc/doxygen/chapters/vector_scal_cuda.texi
@@ -0,0 +1,26 @@
 
				+#include <starpu.h>
			
 
				+
			
 
				+static __global__ void vector_mult_cuda(unsigned n, float *val,
			
 
				+                                        float factor)
			
 
				+{
			
 
				+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
			
 
				+        if (i < n)
			
 
				+               val[i] *= factor;
			
 
				+}
			
 
				+
			
 
				+extern "C" void scal_cuda_func(void *buffers[], void *_args)
			
 
				+{
			
 
				+        float *factor = (float *)_args;
			
 
				+
			
 
				+        /* length of the vector */
			
 
				+        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				+        /* local copy of the vector pointer */
			
 
				+        float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
			
 
				+        unsigned threads_per_block = 64;
			
 
				+        unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				+
			
 
				+        vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>
			
 
				+	                (n, val, *factor);
			
 
				+
			
 
				+        cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				+}
			
--- a/doc/doxygen/chapters/vector_scal_opencl.c
+++ b/doc/doxygen/chapters/vector_scal_opencl.c
@@ -0,0 +1,70 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010  Institut National de Recherche en Informatique et Automatique
			
 
				+ * Copyright (C) 2011  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+extern struct starpu_opencl_program programs;
			
 
				+
			
 
				+void scal_opencl_func(void *buffers[], void *_args)
			
 
				+{
			
 
				+    float *factor = _args;
			
 
				+    int id, devid, err;
			
 
				+    cl_kernel kernel;
			
 
				+    cl_command_queue queue;
			
 
				+    cl_event event;
			
 
				+
			
 
				+    /* length of the vector */
			
 
				+    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				+    /* OpenCL copy of the vector pointer */
			
 
				+    cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
			
 
				+
			
 
				+    id = starpu_worker_get_id();
			
 
				+    devid = starpu_worker_get_devid(id);
			
 
				+
			
 
				+    err = starpu_opencl_load_kernel(&kernel, &queue, &programs, "vector_mult_opencl",
			
 
				+                                    devid);
			
 
				+    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+    err = clSetKernelArg(kernel, 0, sizeof(n), &n);
			
 
				+    err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
			
 
				+    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
			
 
				+    if (err) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+    {
			
 
				+        size_t global=n;
			
 
				+        size_t local;
			
 
				+        size_t s;
			
 
				+        cl_device_id device;
			
 
				+
			
 
				+        starpu_opencl_get_device(devid, &device);
			
 
				+        err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
			
 
				+                                        sizeof(local), &local, &s);
			
 
				+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+        if (local > global) local=global;
			
 
				+
			
 
				+        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0,
			
 
				+                                     NULL, &event);
			
 
				+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+    }
			
 
				+
			
 
				+    clFinish(queue);
			
 
				+    starpu_opencl_collect_stats(event);
			
 
				+    clReleaseEvent(event);
			
 
				+
			
 
				+    starpu_opencl_release_kernel(kernel);
			
 
				+}
			
--- a/doc/doxygen/chapters/vector_scal_opencl_codelet.cl
+++ b/doc/doxygen/chapters/vector_scal_opencl_codelet.cl
@@ -0,0 +1,23 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+__kernel void vector_mult_opencl(int nx, __global float* val, float factor)
			
 
				+{
			
 
				+        const int i = get_global_id(0);
			
 
				+        if (i < nx) {
			
 
				+                val[i] *= factor;
			
 
				+        }
			
 
				+}
			
--- a/doc/doxygen/refman.tex
+++ b/doc/doxygen/refman.tex
@@ -223,14 +223,14 @@ Documentation License”.
 
				 
			
 
				 \part{Appendix}
			
 
				 
			
 
				+\chapter{Full Source Code for the ’Scaling a Vector’ Example}
			
 
				+\label{FullSourceCodeVectorScal}
			
 
				+\hypertarget{FullSourceCodeVectorScal}{}
			
 
				+\input{FullSourceCodeVectorScal}
			
 
				+
			
 
				 \chapter{GNU Free Documentation License}
			
 
				 \label{GNUFreeDocumentationLicense}
			
 
				 \hypertarget{GNUFreeDocumentationLicense}{}
			
 
				 \input{GNUFreeDocumentationLicense}
			
 
				 
			
 
				 \end{document}
			
 
				-
			
 
				-\chapter{}
			
 
				-\label{}
			
 
				-\hypertarget{}{}
			
 
				-\input{}