Browse Source

doc/tutorial: add gcc plugin version of vector_scal example

Nathalie Furmento 12 years ago
parent
commit
5688c8bee4

+ 15 - 1
doc/tutorial/Makefile

@@ -24,7 +24,7 @@ HAS_OPENCL	=	$(shell pkg-config --libs starpu-1.1 |grep -i opencl)
 %.o: %.cu
 	nvcc $(CFLAGS) $< -c
 
-TARGETS = hello_world vector_scal hello_world_plugin
+TARGETS = hello_world vector_scal hello_world_plugin vector_scal_plugin
 
 all: $(TARGETS)
 
@@ -45,5 +45,19 @@ vector_scal: $(VECTOR_SCAL_PREREQUISITES)
 hello_world_plugin: hello_world_plugin.c
 	$(CC) $(CFLAGS) -fplugin=`pkg-config starpu-1.1 --variable=gccplugin` $(LDFLAGS) $^ -o $@
 
+VECTOR_SCAL_PLUGIN_PREREQUISITES	=	vector_scal_plugin.o
+ifneq ($(strip $(HAS_CUDA)),)
+VECTOR_SCAL_PLUGIN_PREREQUISITES	+=	vector_scal_plugin_cuda.o
+VECTOR_SCAL_PLUGIN_COMPILER		=	$(NVCC)
+else
+VECTOR_SCAL_PLUGIN_COMPILER		=	$(CC)
+endif
+
+vector_scal_plugin.o: vector_scal_plugin.c
+	$(CC) -c $(CFLAGS) -fplugin=`pkg-config starpu-1.1 --variable=gccplugin` $^ -o $@
+
+vector_scal_plugin: $(VECTOR_SCAL_PLUGIN_PREREQUISITES)
+	$(CC) -fplugin=`pkg-config starpu-1.1 --variable=gccplugin` $(LDFLAGS) $(VECTOR_SCAL_PLUGIN_PREREQUISITES) -o $@
+
 clean:
 	rm -f $(TARGETS) *.o

+ 4 - 0
doc/tutorial/README

@@ -47,3 +47,7 @@ Instructions on how to compile and run StarPU examples
 % make hello_world_plugin
 % ./hello_world_plugin
 
+% make vector_scal_plugin
+% STARPU_NCPU=0 STARPU_NCUDA=0 ./vector_scal_plugin
+% STARPU_NCPU=0 STARPU_NOPENCL=0 ./vector_scal_plugin
+% STARPU_NOPENCL=0 STARPU_NCUDA=0 ./vector_scal_plugin

+ 194 - 0
doc/tutorial/vector_scal_plugin.c

@@ -0,0 +1,194 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* This example showcases features of the StarPU GCC plug-in.  It defines a
+   "vector scaling" task with multiple CPU implementations, an OpenCL
+   implementation, and a CUDA implementation.
+
+   Compiling it without `-fplugin=starpu.so' yields valid sequential code.  */
+
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+
+/* Declare and define the standard CPU implementation.  */
+
+static void vector_scal (unsigned int size, float vector[size], float factor)
+  __attribute__ ((task));
+
+/* The CPU implementation.  */
+static void
+vector_scal (unsigned int size, float vector[size], float factor)
+{
+  unsigned int i;
+  for (i = 0; i < size; i++)
+    vector[i] *= factor;
+}
+
+
+#if defined STARPU_GCC_PLUGIN && defined __SSE__
+/* The SSE-capable CPU implementation.  */
+
+#include <xmmintrin.h>
+
+static void vector_scal_sse (unsigned int size, float vector[size], float factor)
+  __attribute__ ((task_implementation ("cpu", vector_scal)));
+
+static void
+vector_scal_sse (unsigned int size, float vector[size], float factor)
+{
+  unsigned int n_iterations = size / 4;
+
+  __m128 *VECTOR = (__m128 *) vector;
+  __m128 _FACTOR __attribute__ ((aligned (16)));
+  _FACTOR = _mm_set1_ps (factor);
+
+  unsigned int i;
+  for (i = 0; i < n_iterations; i++)
+    VECTOR[i] = _mm_mul_ps (_FACTOR, VECTOR[i]);
+
+  unsigned int remainder = size % 4;
+  if (remainder != 0)
+    {
+      unsigned int start = 4 * n_iterations;
+      for (i = start; i < start + remainder; ++i)
+	vector[i] = factor * vector[i];
+    }
+}
+#endif /* __SSE__ */
+
+
+/* Declaration and definition of the OpenCL implementation.  */
+
+#if defined STARPU_GCC_PLUGIN && defined STARPU_USE_OPENCL
+
+#include <starpu_opencl.h>
+
+/* The OpenCL programs, loaded from `main'.  */
+static struct starpu_opencl_program cl_programs;
+
+static void vector_scal_opencl (unsigned int size, float vector[size], float factor)
+  __attribute__ ((task_implementation ("opencl", vector_scal)));
+
+static void
+vector_scal_opencl (unsigned int size, float vector[size], float factor)
+{
+  int id, devid, err;
+  cl_kernel kernel;
+  cl_command_queue queue;
+  cl_event event;
+
+  cl_mem val = (cl_mem) vector;
+
+  id = starpu_worker_get_id ();
+  devid = starpu_worker_get_devid (id);
+
+  /* Prepare to invoke the kernel.  In the future, this will be largely
+     automated.  */
+  err = starpu_opencl_load_kernel (&kernel, &queue, &cl_programs,
+				   "vector_mult_opencl", devid);
+  if (err != CL_SUCCESS)
+    STARPU_OPENCL_REPORT_ERROR (err);
+
+  err = clSetKernelArg (kernel, 0, sizeof (size), &size);
+  err |= clSetKernelArg (kernel, 1, sizeof (val), &val);
+  err |= clSetKernelArg (kernel, 2, sizeof (factor), &factor);
+  if (err)
+    STARPU_OPENCL_REPORT_ERROR (err);
+
+  size_t global = size, local = 1;
+  err = clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &global, &local, 0,
+				NULL, &event);
+  if (err != CL_SUCCESS)
+    STARPU_OPENCL_REPORT_ERROR (err);
+
+  clFinish (queue);
+  starpu_opencl_collect_stats (event);
+  clReleaseEvent (event);
+
+  starpu_opencl_release_kernel (kernel);
+}
+
+#endif
+
+
+#ifdef STARPU_USE_CUDA
+
+/* Declaration of the CUDA implementation.  The definition itself is in the
+   `.cu' file itself.  */
+
+extern void vector_scal_cuda (unsigned int size, float vector[size], float factor)
+  __attribute__ ((task_implementation ("cuda", vector_scal)));
+
+#endif
+
+
+#define EPSILON 1e-3
+static bool
+check (unsigned int size, float vector[size], float factor)
+{
+  unsigned int i;
+
+  for (i = 0; i < size; i++)
+    {
+      if (fabs(vector[i] - i * factor) > i*factor*EPSILON)
+        {
+          fprintf(stderr, "%.2f != %.2f\n", vector[i], i*factor);
+          return false;
+        }
+    }
+  fprintf(stderr, "computation is correct\n");
+  return true;
+}
+
+
+int
+main (void)
+{
+  bool valid;
+
+#pragma starpu initialize
+
+#if defined STARPU_GCC_PLUGIN && defined STARPU_USE_OPENCL
+  starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl",
+				       &cl_programs, "");
+#endif
+
+#define NX     0x100000
+#define FACTOR 3.14
+
+  {
+    float vector[NX] __attribute__ ((heap_allocated, registered));
+
+    unsigned int i;
+    for (i = 0; i < NX; i++)
+      vector[i] = (float) i;
+
+    vector_scal (NX, vector, FACTOR);
+
+#pragma starpu wait
+#pragma starpu acquire vector
+    valid = check (NX, vector, FACTOR);
+#pragma starpu release vector
+
+  } /* VECTOR is automatically freed here.  */
+
+#pragma starpu shutdown
+
+  return valid ? EXIT_SUCCESS : EXIT_FAILURE;
+}

+ 44 - 0
doc/tutorial/vector_scal_plugin_cuda.cu

@@ -0,0 +1,44 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 Institut National de Recherche en Informatique et Automatique
+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* CUDA implementation of the `vector_scal' task.  */
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include <stdlib.h>
+
+static __global__ void
+vector_mult_cuda (unsigned int n, float *val, float factor)
+{
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (i < n)
+    val[i] *= factor;
+}
+
+extern "C" void
+vector_scal_cuda (unsigned int size, float vector[], float factor)
+{
+  unsigned threads_per_block = 64;
+  unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
+
+  vector_mult_cuda <<< nblocks, threads_per_block, 0,
+       starpu_cuda_get_local_stream () >>> (size, vector, factor);
+
+  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
+}