/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2013,2015,2017                           CNRS
 * Copyright (C) 2013                                     Université de Bordeaux
 * Copyright (C) 2012                                     Inria
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

/* This example showcases features of the StarPU GCC plug-in.  It defines a
   "vector scaling" task with multiple CPU implementations, an OpenCL
   implementation, and a CUDA implementation.

   Compiling it without `-fplugin=starpu.so' yields valid sequential code.  */

#include <math.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>


/* Declare and define the standard CPU implementation.  */

static void vector_scal (unsigned int size, float vector[size], float factor)
  __attribute__ ((task));

/* The CPU implementation.  */
static void
vector_scal (unsigned int size, float vector[size], float factor)
{
  unsigned int i;
  for (i = 0; i < size; i++)
    vector[i] *= factor;
}


#if defined STARPU_GCC_PLUGIN && defined __SSE__
/* The SSE-capable CPU implementation.  */

#include <xmmintrin.h>

static void vector_scal_sse (unsigned int size, float vector[size], float factor)
  __attribute__ ((task_implementation ("cpu", vector_scal)));

static void
vector_scal_sse (unsigned int size, float vector[size], float factor)
{
  unsigned int n_iterations = size / 4;

  __m128 *VECTOR = (__m128 *) vector;
  __m128 _FACTOR STARPU_ATTRIBUTE_ALIGNED(16);
  _FACTOR = _mm_set1_ps (factor);

  unsigned int i;
  for (i = 0; i < n_iterations; i++)
    VECTOR[i] = _mm_mul_ps (_FACTOR, VECTOR[i]);

  unsigned int remainder = size % 4;
  if (remainder != 0)
    {
      unsigned int start = 4 * n_iterations;
      for (i = start; i < start + remainder; ++i)
	vector[i] = factor * vector[i];
    }
}
#endif /* __SSE__ */


/* Declaration and definition of the OpenCL implementation.  */

#if defined STARPU_GCC_PLUGIN && defined STARPU_USE_OPENCL

#include <starpu_opencl.h>

/* The OpenCL programs, loaded from `main'.  */
static struct starpu_opencl_program cl_programs;

static void vector_scal_opencl (unsigned int size, float vector[size], float factor)
  __attribute__ ((task_implementation ("opencl", vector_scal)));

static void
vector_scal_opencl (unsigned int size, float vector[size], float factor)
{
  int id, devid, err;
  cl_kernel kernel;
  cl_command_queue queue;
  cl_event event;

  cl_mem val = (cl_mem) vector;

  id = starpu_worker_get_id ();
  devid = starpu_worker_get_devid (id);

  /* Prepare to invoke the kernel.  In the future, this will be largely
     automated.  */
  err = starpu_opencl_load_kernel (&kernel, &queue, &cl_programs,
				   "vector_mult_opencl", devid);
  if (err != CL_SUCCESS)
    STARPU_OPENCL_REPORT_ERROR (err);

  err = clSetKernelArg (kernel, 0, sizeof (size), &size);
  err |= clSetKernelArg (kernel, 1, sizeof (val), &val);
  err |= clSetKernelArg (kernel, 2, sizeof (factor), &factor);
  if (err)
    STARPU_OPENCL_REPORT_ERROR (err);

  size_t global = size, local = 1;
  err = clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &global, &local, 0,
				NULL, &event);
  if (err != CL_SUCCESS)
    STARPU_OPENCL_REPORT_ERROR (err);

  clFinish (queue);
  starpu_opencl_collect_stats (event);
  clReleaseEvent (event);

  starpu_opencl_release_kernel (kernel);
}

#endif


#ifdef STARPU_USE_CUDA

/* Declaration of the CUDA implementation.  The definition itself is in the
   `.cu' file itself.  */

extern void vector_scal_cuda (unsigned int size, float vector[size], float factor)
  __attribute__ ((task_implementation ("cuda", vector_scal)));

#endif


#define EPSILON 1e-3
static bool
check (unsigned int size, float vector[size], float factor)
{
  unsigned int i;

  for (i = 0; i < size; i++)
    {
      if (fabs(vector[i] - i * factor) > i*factor*EPSILON)
        {
          fprintf(stderr, "%.2f != %.2f\n", vector[i], i*factor);
          return false;
        }
    }
  fprintf(stderr, "computation is correct\n");
  return true;
}


int
main (void)
{
  bool valid;

#pragma starpu initialize

#if defined STARPU_GCC_PLUGIN && defined STARPU_USE_OPENCL
  starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl",
				       &cl_programs, "");
#endif

#define NX     0x100000
#define FACTOR 3.14

  {
    float vector[NX] __attribute__ ((heap_allocated, registered));

    unsigned int i;
    for (i = 0; i < NX; i++)
      vector[i] = (float) i;

    vector_scal (NX, vector, FACTOR);

#pragma starpu wait
#pragma starpu acquire vector
    valid = check (NX, vector, FACTOR);
#pragma starpu release vector

  } /* VECTOR is automatically freed here.  */

#pragma starpu shutdown

  return valid ? EXIT_SUCCESS : EXIT_FAILURE;
}