12 years ago · 5688c8bee4
--- a/doc/tutorial/Makefile
+++ b/doc/tutorial/Makefile
@@ -24,7 +24,7 @@ HAS_OPENCL	=	$(shell pkg-config --libs starpu-1.1 |grep -i opencl)
 
				 %.o: %.cu
			
 
				 	nvcc $(CFLAGS) $< -c
			
 
				 
			
 
				-TARGETS = hello_world vector_scal hello_world_plugin
			
 
				+TARGETS = hello_world vector_scal hello_world_plugin vector_scal_plugin
			
 
				 
			
 
				 all: $(TARGETS)
			
 
				 
			
@@ -45,5 +45,19 @@ vector_scal: $(VECTOR_SCAL_PREREQUISITES)
 
				 hello_world_plugin: hello_world_plugin.c
			
 
				 	$(CC) $(CFLAGS) -fplugin=`pkg-config starpu-1.1 --variable=gccplugin` $(LDFLAGS) $^ -o $@
			
 
				 
			
 
				+VECTOR_SCAL_PLUGIN_PREREQUISITES	=	vector_scal_plugin.o
			
 
				+ifneq ($(strip $(HAS_CUDA)),)
			
 
				+VECTOR_SCAL_PLUGIN_PREREQUISITES	+=	vector_scal_plugin_cuda.o
			
 
				+VECTOR_SCAL_PLUGIN_COMPILER		=	$(NVCC)
			
 
				+else
			
 
				+VECTOR_SCAL_PLUGIN_COMPILER		=	$(CC)
			
 
				+endif
			
 
				+
			
 
				+vector_scal_plugin.o: vector_scal_plugin.c
			
 
				+	$(CC) -c $(CFLAGS) -fplugin=`pkg-config starpu-1.1 --variable=gccplugin` $^ -o $@
			
 
				+
			
 
				+vector_scal_plugin: $(VECTOR_SCAL_PLUGIN_PREREQUISITES)
			
 
				+	$(CC) -fplugin=`pkg-config starpu-1.1 --variable=gccplugin` $(LDFLAGS) $(VECTOR_SCAL_PLUGIN_PREREQUISITES) -o $@
			
 
				+
			
 
				 clean:
			
 
				 	rm -f $(TARGETS) *.o
			
--- a/doc/tutorial/README
+++ b/doc/tutorial/README
@@ -47,3 +47,7 @@ Instructions on how to compile and run StarPU examples
 
				 % make hello_world_plugin
			
 
				 % ./hello_world_plugin
			
 
				 
			
 
				+% make vector_scal_plugin
			
 
				+% STARPU_NCPU=0 STARPU_NCUDA=0 ./vector_scal_plugin
			
 
				+% STARPU_NCPU=0 STARPU_NOPENCL=0 ./vector_scal_plugin
			
 
				+% STARPU_NOPENCL=0 STARPU_NCUDA=0 ./vector_scal_plugin
			
--- a/doc/tutorial/vector_scal_plugin.c
+++ b/doc/tutorial/vector_scal_plugin.c
@@ -0,0 +1,194 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2012 Institut National de Recherche en Informatique et Automatique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/* This example showcases features of the StarPU GCC plug-in.  It defines a
			
 
				+   "vector scaling" task with multiple CPU implementations, an OpenCL
			
 
				+   implementation, and a CUDA implementation.
			
 
				+
			
 
				+   Compiling it without `-fplugin=starpu.so' yields valid sequential code.  */
			
 
				+
			
 
				+#include <math.h>
			
 
				+#include <stdbool.h>
			
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+
			
 
				+
			
 
				+/* Declare and define the standard CPU implementation.  */
			
 
				+
			
 
				+static void vector_scal (unsigned int size, float vector[size], float factor)
			
 
				+  __attribute__ ((task));
			
 
				+
			
 
				+/* The CPU implementation.  */
			
 
				+static void
			
 
				+vector_scal (unsigned int size, float vector[size], float factor)
			
 
				+{
			
 
				+  unsigned int i;
			
 
				+  for (i = 0; i < size; i++)
			
 
				+    vector[i] *= factor;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+#if defined STARPU_GCC_PLUGIN && defined __SSE__
			
 
				+/* The SSE-capable CPU implementation.  */
			
 
				+
			
 
				+#include <xmmintrin.h>
			
 
				+
			
 
				+static void vector_scal_sse (unsigned int size, float vector[size], float factor)
			
 
				+  __attribute__ ((task_implementation ("cpu", vector_scal)));
			
 
				+
			
 
				+static void
			
 
				+vector_scal_sse (unsigned int size, float vector[size], float factor)
			
 
				+{
			
 
				+  unsigned int n_iterations = size / 4;
			
 
				+
			
 
				+  __m128 *VECTOR = (__m128 *) vector;
			
 
				+  __m128 _FACTOR __attribute__ ((aligned (16)));
			
 
				+  _FACTOR = _mm_set1_ps (factor);
			
 
				+
			
 
				+  unsigned int i;
			
 
				+  for (i = 0; i < n_iterations; i++)
			
 
				+    VECTOR[i] = _mm_mul_ps (_FACTOR, VECTOR[i]);
			
 
				+
			
 
				+  unsigned int remainder = size % 4;
			
 
				+  if (remainder != 0)
			
 
				+    {
			
 
				+      unsigned int start = 4 * n_iterations;
			
 
				+      for (i = start; i < start + remainder; ++i)
			
 
				+	vector[i] = factor * vector[i];
			
 
				+    }
			
 
				+}
			
 
				+#endif /* __SSE__ */
			
 
				+
			
 
				+
			
 
				+/* Declaration and definition of the OpenCL implementation.  */
			
 
				+
			
 
				+#if defined STARPU_GCC_PLUGIN && defined STARPU_USE_OPENCL
			
 
				+
			
 
				+#include <starpu_opencl.h>
			
 
				+
			
 
				+/* The OpenCL programs, loaded from `main'.  */
			
 
				+static struct starpu_opencl_program cl_programs;
			
 
				+
			
 
				+static void vector_scal_opencl (unsigned int size, float vector[size], float factor)
			
 
				+  __attribute__ ((task_implementation ("opencl", vector_scal)));
			
 
				+
			
 
				+static void
			
 
				+vector_scal_opencl (unsigned int size, float vector[size], float factor)
			
 
				+{
			
 
				+  int id, devid, err;
			
 
				+  cl_kernel kernel;
			
 
				+  cl_command_queue queue;
			
 
				+  cl_event event;
			
 
				+
			
 
				+  cl_mem val = (cl_mem) vector;
			
 
				+
			
 
				+  id = starpu_worker_get_id ();
			
 
				+  devid = starpu_worker_get_devid (id);
			
 
				+
			
 
				+  /* Prepare to invoke the kernel.  In the future, this will be largely
			
 
				+     automated.  */
			
 
				+  err = starpu_opencl_load_kernel (&kernel, &queue, &cl_programs,
			
 
				+				   "vector_mult_opencl", devid);
			
 
				+  if (err != CL_SUCCESS)
			
 
				+    STARPU_OPENCL_REPORT_ERROR (err);
			
 
				+
			
 
				+  err = clSetKernelArg (kernel, 0, sizeof (size), &size);
			
 
				+  err |= clSetKernelArg (kernel, 1, sizeof (val), &val);
			
 
				+  err |= clSetKernelArg (kernel, 2, sizeof (factor), &factor);
			
 
				+  if (err)
			
 
				+    STARPU_OPENCL_REPORT_ERROR (err);
			
 
				+
			
 
				+  size_t global = size, local = 1;
			
 
				+  err = clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &global, &local, 0,
			
 
				+				NULL, &event);
			
 
				+  if (err != CL_SUCCESS)
			
 
				+    STARPU_OPENCL_REPORT_ERROR (err);
			
 
				+
			
 
				+  clFinish (queue);
			
 
				+  starpu_opencl_collect_stats (event);
			
 
				+  clReleaseEvent (event);
			
 
				+
			
 
				+  starpu_opencl_release_kernel (kernel);
			
 
				+}
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+
			
 
				+/* Declaration of the CUDA implementation.  The definition itself is in the
			
 
				+   `.cu' file itself.  */
			
 
				+
			
 
				+extern void vector_scal_cuda (unsigned int size, float vector[size], float factor)
			
 
				+  __attribute__ ((task_implementation ("cuda", vector_scal)));
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				+
			
 
				+#define EPSILON 1e-3
			
 
				+static bool
			
 
				+check (unsigned int size, float vector[size], float factor)
			
 
				+{
			
 
				+  unsigned int i;
			
 
				+
			
 
				+  for (i = 0; i < size; i++)
			
 
				+    {
			
 
				+      if (fabs(vector[i] - i * factor) > i*factor*EPSILON)
			
 
				+        {
			
 
				+          fprintf(stderr, "%.2f != %.2f\n", vector[i], i*factor);
			
 
				+          return false;
			
 
				+        }
			
 
				+    }
			
 
				+  fprintf(stderr, "computation is correct\n");
			
 
				+  return true;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+int
			
 
				+main (void)
			
 
				+{
			
 
				+  bool valid;
			
 
				+
			
 
				+#pragma starpu initialize
			
 
				+
			
 
				+#if defined STARPU_GCC_PLUGIN && defined STARPU_USE_OPENCL
			
 
				+  starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl",
			
 
				+				       &cl_programs, "");
			
 
				+#endif
			
 
				+
			
 
				+#define NX     0x100000
			
 
				+#define FACTOR 3.14
			
 
				+
			
 
				+  {
			
 
				+    float vector[NX] __attribute__ ((heap_allocated, registered));
			
 
				+
			
 
				+    unsigned int i;
			
 
				+    for (i = 0; i < NX; i++)
			
 
				+      vector[i] = (float) i;
			
 
				+
			
 
				+    vector_scal (NX, vector, FACTOR);
			
 
				+
			
 
				+#pragma starpu wait
			
 
				+#pragma starpu acquire vector
			
 
				+    valid = check (NX, vector, FACTOR);
			
 
				+#pragma starpu release vector
			
 
				+
			
 
				+  } /* VECTOR is automatically freed here.  */
			
 
				+
			
 
				+#pragma starpu shutdown
			
 
				+
			
 
				+  return valid ? EXIT_SUCCESS : EXIT_FAILURE;
			
 
				+}
			
--- a/doc/tutorial/vector_scal_plugin_cuda.cu
+++ b/doc/tutorial/vector_scal_plugin_cuda.cu
@@ -0,0 +1,44 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2012 Institut National de Recherche en Informatique et Automatique
			
 
				+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/* CUDA implementation of the `vector_scal' task.  */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				+#include <stdlib.h>
			
 
				+
			
 
				+static __global__ void
			
 
				+vector_mult_cuda (unsigned int n, float *val, float factor)
			
 
				+{
			
 
				+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				+
			
 
				+  if (i < n)
			
 
				+    val[i] *= factor;
			
 
				+}
			
 
				+
			
 
				+extern "C" void
			
 
				+vector_scal_cuda (unsigned int size, float vector[], float factor)
			
 
				+{
			
 
				+  unsigned threads_per_block = 64;
			
 
				+  unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
			
 
				+
			
 
				+  vector_mult_cuda <<< nblocks, threads_per_block, 0,
			
 
				+       starpu_cuda_get_local_stream () >>> (size, vector, factor);
			
 
				+
			
 
				+  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
			
 
				+}