14 年之前 · ec83e69660
--- a/examples/pi/SobolQRNG/CforCUDA_SDK_license.txt
+++ b/examples/pi/SobolQRNG/CforCUDA_SDK_license.txt
--- a/examples/pi/SobolQRNG/sobol.h
+++ b/examples/pi/SobolQRNG/sobol.h
@@ -1,60 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-/*
			
 
				- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				- *
			
 
				- * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				- * proprietary rights in and to this software and related documentation and 
			
 
				- * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				- * of this software and related documentation without an express license 
			
 
				- * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				- * 
			
 
				- */
			
 
				- 
			
 
				- /*
			
 
				- * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				- * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				- * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				- *
			
 
				- * Sobol Quasi-random Number Generator example
			
 
				- *
			
 
				- * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				- * http://people.maths.ox.ac.uk/~gilesm/
			
 
				- *
			
 
				- * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				- * and Frances Kuo, University of New South Wales, Australia
			
 
				- * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				- *
			
 
				- * For theoretical background see:
			
 
				- *
			
 
				- * P. Bratley and B.L. Fox.
			
 
				- * Implementing Sobol's quasirandom sequence generator
			
 
				- * http://portal.acm.org/citation.cfm?id=42288
			
 
				- * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				- *
			
 
				- * S. Joe and F. Kuo.
			
 
				- * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				- * http://portal.acm.org/citation.cfm?id=641879
			
 
				- * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				- */
			
 
				-
			
 
				-#ifndef SOBOL_H
			
 
				-#define SOBOL_H
			
 
				-
			
 
				-// Number of direction vectors is fixed to 32
			
 
				-#define n_directions 32
			
 
				-
			
 
				-#endif
			
--- a/examples/pi/SobolQRNG/sobol_gold.c
+++ b/examples/pi/SobolQRNG/sobol_gold.c
@@ -1,141 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-/*
			
 
				- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				- *
			
 
				- * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				- * proprietary rights in and to this software and related documentation and 
			
 
				- * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				- * of this software and related documentation without an express license 
			
 
				- * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				- * 
			
 
				- */
			
 
				- 
			
 
				- /*
			
 
				- * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				- * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				- * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				- *
			
 
				- * Sobol Quasi-random Number Generator example
			
 
				- *
			
 
				- * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				- * http://people.maths.ox.ac.uk/~gilesm/
			
 
				- *
			
 
				- * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				- * and Frances Kuo, University of New South Wales, Australia
			
 
				- * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				- *
			
 
				- * For theoretical background see:
			
 
				- *
			
 
				- * P. Bratley and B.L. Fox.
			
 
				- * Implementing Sobol's quasirandom sequence generator
			
 
				- * http://portal.acm.org/citation.cfm?id=42288
			
 
				- * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				- *
			
 
				- * S. Joe and F. Kuo.
			
 
				- * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				- * http://portal.acm.org/citation.cfm?id=641879
			
 
				- * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				- */
			
 
				-
			
 
				-#include <stdio.h>
			
 
				-#include <stdlib.h>
			
 
				-#include <math.h>
			
 
				-#include <string.h>
			
 
				-
			
 
				-#include "sobol.h"
			
 
				-#include "sobol_gold.h"
			
 
				-#include "sobol_primitives.h"
			
 
				-
			
 
				-#define k_2powneg32 2.3283064E-10F
			
 
				-
			
 
				-// Create the direction numbers, based on the primitive polynomials.
			
 
				-void initSobolDirectionVectors(int n_dimensions, unsigned int *directions)
			
 
				-{
			
 
				-    unsigned int *v = directions;
			
 
				-
			
 
				-    int dim;
			
 
				-    for (dim = 0 ; dim < n_dimensions ; dim++)
			
 
				-    {
			
 
				-        // First dimension is a special case
			
 
				-        if (dim == 0)
			
 
				-        {
			
 
				-            int i;
			
 
				-            for (i = 0 ; i < n_directions ; i++)
			
 
				-            {
			
 
				-                // All m's are 1
			
 
				-                v[i] = 1 << (31 - i);
			
 
				-            }
			
 
				-        }
			
 
				-        else
			
 
				-        {
			
 
				-            int d = sobol_primitives[dim].degree;
			
 
				-            // The first direction numbers (up to the degree of the polynomial)
			
 
				-            // are simply v[i] = m[i] / 2^i (stored in Q0.32 format)
			
 
				-            int i;
			
 
				-            for (i = 0 ; i < d ; i++)
			
 
				-            {
			
 
				-                v[i] = sobol_primitives[dim].m[i] << (31 - i);
			
 
				-            }
			
 
				-            // The remaining direction numbers are computed as described in
			
 
				-            // the Bratley and Fox paper.
			
 
				-            // v[i] = a[1]v[i-1] ^ a[2]v[i-2] ^ ... ^ a[v-1]v[i-d+1] ^ v[i-d] ^ v[i-d]/2^d
			
 
				-            for (i = d ; i < n_directions ; i++)
			
 
				-            {
			
 
				-                // First do the v[i-d] ^ v[i-d]/2^d part
			
 
				-                v[i] = v[i - d] ^ (v[i - d] >> d);
			
 
				-                // Now do the a[1]v[i-1] ^ a[2]v[i-2] ^ ... part
			
 
				-                // Note that the coefficients a[] are zero or one and for compactness in
			
 
				-                // the input tables they are stored as bits of a single integer. To extract
			
 
				-                // the relevant bit we use right shift and mask with 1.
			
 
				-                // For example, for a 10 degree polynomial there are ten useful bits in a,
			
 
				-                // so to get a[2] we need to right shift 7 times (to get the 8th bit into
			
 
				-                // the LSB) and then mask with 1.
			
 
				-                int j;
			
 
				-                for (j = 1 ; j < d ; j++)
			
 
				-                {
			
 
				-                    v[i] ^= (((sobol_primitives[dim].a >> (d - 1 - j)) & 1) * v[i - j]);
			
 
				-                }
			
 
				-            }
			
 
				-        }
			
 
				-        v += n_directions;
			
 
				-    }
			
 
				-}
			
 
				-
			
 
				-// Reference model for generating Sobol numbers on the host
			
 
				-void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output)
			
 
				-{
			
 
				-    unsigned int *v = directions;
			
 
				-
			
 
				-    int d;
			
 
				-    for (d = 0 ; d < n_dimensions ; d++)
			
 
				-    {
			
 
				-        unsigned int X = 0;
			
 
				-        // x[0] is zero (in all dimensions)
			
 
				-        output[n_vectors * d] = 0.0;        
			
 
				-        int i;
			
 
				-        for (i = 1 ; i < n_vectors ; i++)
			
 
				-        {
			
 
				-            // x[i] = x[i-1] ^ v[c]
			
 
				-            //  where c is the index of the rightmost zero bit in i
			
 
				-            //  minus 1 (since C arrays count from zero)
			
 
				-            // In the Bratley and Fox paper this is equation (**)
			
 
				-            X ^= v[ffs(~(i - 1)) - 1];
			
 
				-            output[i + n_vectors * d] = (float)X * k_2powneg32;
			
 
				-        }
			
 
				-        v += n_directions;
			
 
				-    }
			
 
				-}
			
--- a/examples/pi/SobolQRNG/sobol_gold.h
+++ b/examples/pi/SobolQRNG/sobol_gold.h
@@ -1,61 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-/*
			
 
				- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				- *
			
 
				- * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				- * proprietary rights in and to this software and related documentation and 
			
 
				- * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				- * of this software and related documentation without an express license 
			
 
				- * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				- * 
			
 
				- */
			
 
				- 
			
 
				- /*
			
 
				- * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				- * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				- * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				- *
			
 
				- * Sobol Quasi-random Number Generator example
			
 
				- *
			
 
				- * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				- * http://people.maths.ox.ac.uk/~gilesm/
			
 
				- *
			
 
				- * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				- * and Frances Kuo, University of New South Wales, Australia
			
 
				- * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				- *
			
 
				- * For theoretical background see:
			
 
				- *
			
 
				- * P. Bratley and B.L. Fox.
			
 
				- * Implementing Sobol's quasirandom sequence generator
			
 
				- * http://portal.acm.org/citation.cfm?id=42288
			
 
				- * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				- *
			
 
				- * S. Joe and F. Kuo.
			
 
				- * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				- * http://portal.acm.org/citation.cfm?id=641879
			
 
				- * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				- *
			
 
				- */
			
 
				-
			
 
				-#ifndef SOBOL_GOLD_H
			
 
				-#define SOBOL_GOLD_H
			
 
				-
			
 
				-void initSobolDirectionVectors(int n_dimensions, unsigned int *directions);
			
 
				-void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output);
			
 
				-
			
 
				-#endif
			
--- a/examples/pi/SobolQRNG/sobol_gpu.cu
+++ b/examples/pi/SobolQRNG/sobol_gpu.cu
@@ -1,170 +0,0 @@
 
				-/*
			
 
				- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				- *
			
 
				- * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				- * proprietary rights in and to this software and related documentation and 
			
 
				- * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				- * of this software and related documentation without an express license 
			
 
				- * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				- * 
			
 
				- */
			
 
				- 
			
 
				- /*
			
 
				- * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				- * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				- * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				- *
			
 
				- * Sobol Quasi-random Number Generator example
			
 
				- *
			
 
				- * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				- * http://people.maths.ox.ac.uk/~gilesm/
			
 
				- *
			
 
				- * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				- * and Frances Kuo, University of New South Wales, Australia
			
 
				- * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				- *
			
 
				- * For theoretical background see:
			
 
				- *
			
 
				- * P. Bratley and B.L. Fox.
			
 
				- * Implementing Sobol's quasirandom sequence generator
			
 
				- * http://portal.acm.org/citation.cfm?id=42288
			
 
				- * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				- *
			
 
				- * S. Joe and F. Kuo.
			
 
				- * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				- * http://portal.acm.org/citation.cfm?id=641879
			
 
				- * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				- *
			
 
				- */
			
 
				-
			
 
				-#include "sobol.h"
			
 
				-#include "sobol_gpu.h"
			
 
				-#include <starpu.h>
			
 
				-#include <starpu_cuda.h>
			
 
				-
			
 
				-#define k_2powneg32 2.3283064E-10F
			
 
				-
			
 
				-__global__ void sobolGPU_kernel(unsigned n_vectors, unsigned n_dimensions, unsigned *d_directions, float *d_output)
			
 
				-{
			
 
				-    __shared__ unsigned int v[n_directions];
			
 
				-
			
 
				-    // Offset into the correct dimension as specified by the
			
 
				-    // block y coordinate
			
 
				-    d_directions = d_directions + n_directions * blockIdx.y;
			
 
				-    d_output = d_output +  n_vectors * blockIdx.y;
			
 
				-
			
 
				-    // Copy the direction numbers for this dimension into shared
			
 
				-    // memory - there are only 32 direction numbers so only the
			
 
				-    // first 32 (n_directions) threads need participate.
			
 
				-    if (threadIdx.x < n_directions)
			
 
				-    {
			
 
				-	    v[threadIdx.x] = d_directions[threadIdx.x];
			
 
				-    }
			
 
				-    __syncthreads();
			
 
				-
			
 
				-    // Set initial index (i.e. which vector this thread is
			
 
				-    // computing first) and stride (i.e. step to the next vector
			
 
				-    // for this thread)
			
 
				-    int i0     = threadIdx.x + blockIdx.x * blockDim.x;
			
 
				-    int stride = gridDim.x * blockDim.x;
			
 
				-
			
 
				-    // Get the gray code of the index
			
 
				-    // c.f. Numerical Recipes in C, chapter 20
			
 
				-    // http://www.nrbook.com/a/bookcpdf/c20-2.pdf
			
 
				-    unsigned int g = i0 ^ (i0 >> 1);
			
 
				-
			
 
				-    // Initialisation for first point x[i0]
			
 
				-    // In the Bratley and Fox paper this is equation (*), where
			
 
				-    // we are computing the value for x[n] without knowing the
			
 
				-    // value of x[n-1].
			
 
				-    unsigned int X = 0;
			
 
				-    unsigned int mask;
			
 
				-    for (unsigned int k = 0 ; k < __ffs(stride) - 1 ; k++)
			
 
				-    {
			
 
				-        // We want X ^= g_k * v[k], where g_k is one or zero.
			
 
				-        // We do this by setting a mask with all bits equal to
			
 
				-        // g_k. In reality we keep shifting g so that g_k is the
			
 
				-        // LSB of g. This way we avoid multiplication.
			
 
				-        mask = - (g & 1);
			
 
				-        X ^= mask & v[k];
			
 
				-        g = g >> 1;
			
 
				-    }
			
 
				-    if (i0 < n_vectors)
			
 
				-    {
			
 
				-        d_output[i0] = (float)X * k_2powneg32;
			
 
				-    }
			
 
				-
			
 
				-    // Now do rest of points, using the stride
			
 
				-    // Here we want to generate x[i] from x[i-stride] where we
			
 
				-    // don't have any of the x in between, therefore we have to
			
 
				-    // revisit the equation (**), this is easiest with an example
			
 
				-    // so assume stride is 16.
			
 
				-    // From x[n] to x[n+16] there will be:
			
 
				-    //   8 changes in the first bit
			
 
				-    //   4 changes in the second bit
			
 
				-    //   2 changes in the third bit
			
 
				-    //   1 change in the fourth
			
 
				-    //   1 change in one of the remaining bits
			
 
				-    //
			
 
				-    // What this means is that in the equation:
			
 
				-    //   x[n+1] = x[n] ^ v[p]
			
 
				-    //   x[n+2] = x[n+1] ^ v[q] = x[n] ^ v[p] ^ v[q]
			
 
				-    //   ...
			
 
				-    // We will apply xor with v[1] eight times, v[2] four times,
			
 
				-    // v[3] twice, v[4] once and one other direction number once.
			
 
				-    // Since two xors cancel out, we can skip even applications
			
 
				-    // and just apply xor with v[4] (i.e. log2(16)) and with
			
 
				-    // the current applicable direction number.
			
 
				-    // Note that all these indices count from 1, so we need to
			
 
				-    // subtract 1 from them all to account for C arrays counting
			
 
				-    // from zero.
			
 
				-    unsigned int v_log2stridem1 = v[__ffs(stride) - 2];
			
 
				-    unsigned int v_stridemask = stride - 1;
			
 
				-    for (unsigned int i = i0 + stride ; i < n_vectors ; i += stride)
			
 
				-    {
			
 
				-        // x[i] = x[i-stride] ^ v[b] ^ v[c]
			
 
				-        //  where b is log2(stride) minus 1 for C array indexing
			
 
				-        //  where c is the index of the rightmost zero bit in i,
			
 
				-        //  not including the bottom log2(stride) bits, minus 1
			
 
				-        //  for C array indexing
			
 
				-        // In the Bratley and Fox paper this is equation (**)
			
 
				-        X ^= v_log2stridem1 ^ v[__ffs(~((i - stride) | v_stridemask)) - 1];
			
 
				-        d_output[i] = (float)X * k_2powneg32;
			
 
				-    }
			
 
				-}
			
 
				-
			
 
				-extern "C"
			
 
				-void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output)
			
 
				-{
			
 
				-    const int threadsperblock = 64;
			
 
				-
			
 
				-    // Set up the execution configuration
			
 
				-    dim3 dimGrid;
			
 
				-    dim3 dimBlock;
			
 
				-
			
 
				-    // This implementation of the generator outputs all the draws for
			
 
				-    // one dimension in a contiguous region of memory, followed by the
			
 
				-    // next dimension and so on.
			
 
				-    // Therefore all threads within a block will be processing different
			
 
				-    // vectors from the same dimension. As a result we want the total
			
 
				-    // number of blocks to be a multiple of the number of dimensions.
			
 
				-    dimGrid.y = n_dimensions;
			
 
				-
			
 
				-    // If the number of dimensions is large then we will set the number
			
 
				-    // of blocks to equal the number of dimensions (i.e. dimGrid.x = 1)
			
 
				-    // but if the number of dimensions is small (e.g. less than 32) then
			
 
				-    // we'll partition the vectors across blocks (as well as threads).
			
 
				-    // We also need to cap the dimGrid.x where the number of vectors
			
 
				-    // is too small to be partitioned.
			
 
				-    dimGrid.x = 1 + 31 / n_dimensions;
			
 
				-    if (dimGrid.x > (unsigned int)(n_vectors / threadsperblock))
			
 
				-    {
			
 
				-        dimGrid.x = (n_vectors + threadsperblock - 1) / threadsperblock;
			
 
				-    }
			
 
				-    
			
 
				-    // Fix the number of threads
			
 
				-    dimBlock.x = threadsperblock;
			
 
				-
			
 
				-    // Execute GPU kernel
			
 
				-    sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
			
 
				-}
			
--- a/examples/pi/SobolQRNG/sobol_gpu.h
+++ b/examples/pi/SobolQRNG/sobol_gpu.h
@@ -1,61 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-/*
			
 
				- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				- *
			
 
				- * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				- * proprietary rights in and to this software and related documentation and 
			
 
				- * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				- * of this software and related documentation without an express license 
			
 
				- * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				- * 
			
 
				- */
			
 
				- 
			
 
				- /*
			
 
				- * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				- * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				- * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				- *
			
 
				- * Sobol Quasi-random Number Generator example
			
 
				- *
			
 
				- * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				- * http://people.maths.ox.ac.uk/~gilesm/
			
 
				- *
			
 
				- * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				- * and Frances Kuo, University of New South Wales, Australia
			
 
				- * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				- *
			
 
				- * For theoretical background see:
			
 
				- *
			
 
				- * P. Bratley and B.L. Fox.
			
 
				- * Implementing Sobol's quasirandom sequence generator
			
 
				- * http://portal.acm.org/citation.cfm?id=42288
			
 
				- * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				- *
			
 
				- * S. Joe and F. Kuo.
			
 
				- * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				- * http://portal.acm.org/citation.cfm?id=641879
			
 
				- * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				- *
			
 
				- */
			
 
				-
			
 
				-#ifndef SOBOL_GPU_H
			
 
				-#define SOBOL_GPU_H
			
 
				-
			
 
				-extern "C"
			
 
				-void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output);
			
 
				-
			
 
				-#endif
			
--- a/examples/pi/SobolQRNG/sobol_primitives.c
+++ b/examples/pi/SobolQRNG/sobol_primitives.c
--- a/examples/pi/SobolQRNG/sobol_primitives.h
+++ b/examples/pi/SobolQRNG/sobol_primitives.h
@@ -1,75 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-/*
			
 
				- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				- *
			
 
				- * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				- * proprietary rights in and to this software and related documentation and 
			
 
				- * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				- * of this software and related documentation without an express license 
			
 
				- * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				- * 
			
 
				- */
			
 
				- 
			
 
				- /*
			
 
				- * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				- * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				- * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				- *
			
 
				- * Sobol Quasi-random Number Generator example
			
 
				- *
			
 
				- * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				- * http://people.maths.ox.ac.uk/~gilesm/
			
 
				- *
			
 
				- * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				- * and Frances Kuo, University of New South Wales, Australia
			
 
				- * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				- *
			
 
				- * For theoretical background see:
			
 
				- *
			
 
				- * P. Bratley and B.L. Fox.
			
 
				- * Implementing Sobol's quasirandom sequence generator
			
 
				- * http://portal.acm.org/citation.cfm?id=42288
			
 
				- * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				- *
			
 
				- * S. Joe and F. Kuo.
			
 
				- * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				- * http://portal.acm.org/citation.cfm?id=641879
			
 
				- * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				- *
			
 
				- */
			
 
				-
			
 
				-#ifndef SOBOL_PRIMITIVES_H
			
 
				-#define SOBOL_PRIMITIVES_H
			
 
				-
			
 
				-#define max_m 17
			
 
				-
			
 
				-// Each primitive is stored as a struct where
			
 
				-//  dimension is the dimension number of the polynomial (unused)
			
 
				-//  degree is the degree of the polynomial
			
 
				-//  a is a binary word representing the coefficients 
			
 
				-//  m is the array of m values
			
 
				-struct primitive
			
 
				-{
			
 
				-    unsigned int dimension;
			
 
				-    unsigned int degree;
			
 
				-    unsigned int a;
			
 
				-    unsigned int m[max_m];
			
 
				-};
			
 
				-
			
 
				-extern const struct primitive sobol_primitives[];
			
 
				-
			
 
				-#endif
			
--- a/examples/pi/pi.c
+++ b/examples/pi/pi.c
@@ -1,177 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include "SobolQRNG/sobol.h"
			
 
				-#include "SobolQRNG/sobol_gold.h"
			
 
				-#include "pi.h"
			
 
				-#include <sys/time.h>
			
 
				-
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-void cuda_kernel(void **descr, void *cl_arg);
			
 
				-#endif
			
 
				-
			
 
				-/* default value */
			
 
				-static unsigned ntasks = 1024;
			
 
				-
			
 
				-static void cpu_kernel(void *descr[], void *cl_arg)
			
 
				-{
			
 
				-	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				-	unsigned nx = NSHOT_PER_TASK;
			
 
				-
			
 
				-	TYPE *random_numbers = malloc(2*nx*sizeof(TYPE));
			
 
				-	sobolCPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
			
 
				-
			
 
				-	TYPE *random_numbers_x = &random_numbers[0];
			
 
				-	TYPE *random_numbers_y = &random_numbers[nx];
			
 
				-
			
 
				-	unsigned current_cnt = 0;
			
 
				-
			
 
				-	unsigned i;
			
 
				-	for (i = 0; i < nx; i++)
			
 
				-	{
			
 
				-		TYPE x = random_numbers_x[i];
			
 
				-		TYPE y = random_numbers_y[i];
			
 
				-
			
 
				-		TYPE dist = (x*x + y*y);
			
 
				-
			
 
				-		unsigned success = (dist <= 1.0);
			
 
				-		current_cnt += success;
			
 
				-	}
			
 
				-
			
 
				-	unsigned *cnt = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]);
			
 
				-	*cnt = current_cnt;
			
 
				-
			
 
				-	free(random_numbers);
			
 
				-}
			
 
				-
			
 
				-static void parse_args(int argc, char **argv)
			
 
				-{
			
 
				-	int i;
			
 
				-	for (i = 1; i < argc; i++) {
			
 
				-		if (strcmp(argv[i], "-ntasks") == 0) {
			
 
				-			char *argptr;
			
 
				-			ntasks = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-int main(int argc, char **argv)
			
 
				-{
			
 
				-	unsigned i;
			
 
				-
			
 
				-	parse_args(argc, argv);
			
 
				-
			
 
				-	starpu_init(NULL);
			
 
				-
			
 
				-	/* Initialize the random number generator */
			
 
				-	unsigned *sobol_qrng_directions = malloc(n_dimensions*n_directions*sizeof(unsigned));
			
 
				-	STARPU_ASSERT(sobol_qrng_directions);
			
 
				-
			
 
				-	initSobolDirectionVectors(n_dimensions, sobol_qrng_directions);
			
 
				-
			
 
				-	/* Any worker may use that array now */
			
 
				-	starpu_data_handle sobol_qrng_direction_handle;
			
 
				-	starpu_vector_data_register(&sobol_qrng_direction_handle, 0,
			
 
				-		(uintptr_t)sobol_qrng_directions, n_dimensions*n_directions, sizeof(unsigned));
			
 
				-
			
 
				-	unsigned *cnt_array = malloc(ntasks*sizeof(unsigned));
			
 
				-	STARPU_ASSERT(cnt_array);
			
 
				-	starpu_data_handle cnt_array_handle;
			
 
				-	starpu_vector_data_register(&cnt_array_handle, 0, (uintptr_t)cnt_array, ntasks, sizeof(unsigned));
			
 
				-
			
 
				-	/* Use a write-through policy : when the data is modified on an
			
 
				-	 * accelerator, we know that it will only be modified once and be
			
 
				-	 * accessed by the CPU later on */
			
 
				-	starpu_data_set_wt_mask(cnt_array_handle, (1<<0));
			
 
				-
			
 
				-	struct starpu_data_filter f = {
			
 
				-		.filter_func = starpu_block_filter_func_vector,
			
 
				-		.nchildren = ntasks,
			
 
				-		.get_nchildren = NULL,
			
 
				-		.get_child_ops = NULL
			
 
				-	};
			
 
				-	
			
 
				-	starpu_data_partition(cnt_array_handle, &f);
			
 
				-
			
 
				-	static struct starpu_perfmodel_t model = {
			
 
				-		.type = STARPU_HISTORY_BASED,
			
 
				-		.symbol = "monte_carlo_pi"
			
 
				-	};
			
 
				-
			
 
				-	struct starpu_codelet_t cl = {
			
 
				-		.where = STARPU_CPU|STARPU_CUDA,
			
 
				-		.cpu_func = cpu_kernel,
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-		.cuda_func = cuda_kernel,
			
 
				-#endif
			
 
				-		.nbuffers = 2,
			
 
				-		.model = &model
			
 
				-	};
			
 
				-
			
 
				-	struct timeval start;
			
 
				-	struct timeval end;
			
 
				-
			
 
				-	gettimeofday(&start, NULL);
			
 
				-
			
 
				-	for (i = 0; i < ntasks; i++)
			
 
				-	{
			
 
				-		struct starpu_task *task = starpu_task_create();
			
 
				-
			
 
				-		task->cl = &cl;
			
 
				-
			
 
				-		STARPU_ASSERT(starpu_data_get_sub_data(cnt_array_handle, 1, i));
			
 
				-
			
 
				-		task->buffers[0].handle = sobol_qrng_direction_handle;
			
 
				-		task->buffers[0].mode   = STARPU_R;
			
 
				-		task->buffers[1].handle = starpu_data_get_sub_data(cnt_array_handle, 1, i);
			
 
				-		task->buffers[1].mode   = STARPU_W;
			
 
				-
			
 
				-		int ret = starpu_task_submit(task);
			
 
				-		STARPU_ASSERT(!ret);
			
 
				-	}
			
 
				-
			
 
				-	starpu_task_wait_for_all();
			
 
				-
			
 
				-	/* Get the cnt_array back in main memory */
			
 
				-	starpu_data_unpartition(cnt_array_handle, 0);
			
 
				-	starpu_data_acquire(cnt_array_handle, STARPU_RW);
			
 
				-
			
 
				-	/* Count the total number of entries */
			
 
				-	unsigned long total_cnt = 0;
			
 
				-	for (i = 0; i < ntasks; i++)
			
 
				-		total_cnt += cnt_array[i];
			
 
				-
			
 
				-	gettimeofday(&end, NULL);
			
 
				-
			
 
				-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-
			
 
				-	unsigned long total_shot_cnt = ntasks * NSHOT_PER_TASK;
			
 
				-
			
 
				-	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4, probability to impact the disk: pi/4 */
			
 
				-	fprintf(stderr, "Pi approximation : %f (%ld / %ld)\n", ((TYPE)total_cnt*4)/(total_shot_cnt), total_cnt, total_shot_cnt);
			
 
				-	fprintf(stderr, "Total time : %f ms\n", timing/1000.0);
			
 
				-	fprintf(stderr, "Speed : %f GShot/s\n", total_shot_cnt/(1e3*timing));
			
 
				-
			
 
				-	starpu_data_release(cnt_array_handle);
			
 
				-
			
 
				-	starpu_display_codelet_stats(&cl);
			
 
				-
			
 
				-	starpu_shutdown();
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
--- a/examples/pi/pi.h
+++ b/examples/pi/pi.h
@@ -1,33 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#ifndef __PI_H__
			
 
				-#define __PI_H__
			
 
				-
			
 
				-#include <starpu.h>
			
 
				-#include <starpu_cuda.h>
			
 
				-#include <stdio.h>
			
 
				-
			
 
				-#define NSHOT_PER_TASK	(16*1024*1024ULL)
			
 
				-
			
 
				-#define TYPE	float
			
 
				-
			
 
				-//extern "C" void cuda_kernel(void *descr[], void *cl_arg);
			
 
				-
			
 
				-static int n_dimensions = 100;
			
 
				-
			
 
				-#endif // __PI_H__
			
--- a/examples/pi/pi_kernel.cu
+++ b/examples/pi/pi_kernel.cu
@@ -1,150 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include "SobolQRNG/sobol_gpu.h"
			
 
				-#include "pi.h"
			
 
				-#include <starpu_cuda.h>
			
 
				-
			
 
				-#define MAXNBLOCKS	128
			
 
				-#define MAXTHREADSPERBLOCK	256
			
 
				-
			
 
				-static __global__ void monte_carlo(TYPE *random_numbers_x, TYPE *random_numbers_y,
			
 
				-						unsigned n, unsigned *output_cnt)
			
 
				-{
			
 
				-	__shared__ unsigned scnt[MAXTHREADSPERBLOCK];
			
 
				-
			
 
				-	/* Do we have a successful shot ? */
			
 
				-	const int tid = threadIdx.x + blockIdx.x*blockDim.x;
			
 
				-
			
 
				-	const int nthreads = gridDim.x * blockDim.x;
			
 
				-
			
 
				-	/* Blank the shared mem buffer */
			
 
				-	if (threadIdx.x < MAXTHREADSPERBLOCK)
			
 
				-		scnt[threadIdx.x] = 0;
			
 
				-
			
 
				-	__syncthreads();
			
 
				-	int ind;
			
 
				-	for (ind = tid; ind < n; ind += nthreads)
			
 
				-	{ 
			
 
				-		TYPE x = random_numbers_x[ind];
			
 
				-		TYPE y = random_numbers_y[ind];
			
 
				-		TYPE dist = (x*x + y*y);
			
 
				-
			
 
				-		unsigned success = (dist <= 1.0f)?1:0;
			
 
				-
			
 
				-		scnt[threadIdx.x] += success;
			
 
				-
			
 
				-	}
			
 
				-
			
 
				-	__syncthreads();
			
 
				-
			
 
				-	/* Perform a reduction to compute the sum on each thread within that block */
			
 
				-
			
 
				-	/* NB: We assume that the number of threads per block is a power of 2 ! */
			
 
				-	unsigned s;
			
 
				-	for (s = blockDim.x/2; s!=0; s>>=1)
			
 
				-	{
			
 
				-		if (threadIdx.x < s)
			
 
				-			scnt[threadIdx.x] += scnt[threadIdx.x + s];
			
 
				-
			
 
				-		__syncthreads();
			
 
				-	}
			
 
				-
			
 
				-	/* report the number of successful shots in the block */
			
 
				-	if (threadIdx.x == 0)
			
 
				-		output_cnt[blockIdx.x] = scnt[0];
			
 
				-
			
 
				-	__syncthreads();
			
 
				-}
			
 
				-
			
 
				-static __global__ void sum_per_block_cnt(unsigned *output_cnt, unsigned *cnt)
			
 
				-{
			
 
				-	__shared__ unsigned accumulator[MAXNBLOCKS];
			
 
				-
			
 
				-	unsigned i;
			
 
				-
			
 
				-	/* Load the values from global mem */
			
 
				-	for (i = 0; i < blockDim.x; i++)
			
 
				-		accumulator[i] = output_cnt[i];
			
 
				-
			
 
				-	__syncthreads();
			
 
				-
			
 
				-	/* Perform a reduction in shared memory */
			
 
				-	unsigned s;
			
 
				-	for (s = blockDim.x/2; s!=0; s>>=1)
			
 
				-	{
			
 
				-		if (threadIdx.x < s)
			
 
				-			accumulator[threadIdx.x] += accumulator[threadIdx.x + s];
			
 
				-
			
 
				-		__syncthreads();
			
 
				-	}
			
 
				-
			
 
				-	/* Save the result in global memory */
			
 
				-	if (threadIdx.x == 0)
			
 
				-		*cnt = accumulator[0];
			
 
				-}
			
 
				-
			
 
				-extern "C" void cuda_kernel(void *descr[], void *cl_arg)
			
 
				-{
			
 
				-	cudaError_t cures;
			
 
				-
			
 
				-	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				-	unsigned nx = NSHOT_PER_TASK;
			
 
				-
			
 
				-	/* Generate Random numbers */
			
 
				-	float *random_numbers;
			
 
				-	cudaMalloc((void **)&random_numbers, 2*nx*sizeof(float));
			
 
				-	STARPU_ASSERT(random_numbers);
			
 
				-	
			
 
				-	sobolGPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
			
 
				-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				-
			
 
				-	TYPE *random_numbers_x = &random_numbers[0];
			
 
				-	TYPE *random_numbers_y = &random_numbers[nx];
			
 
				-
			
 
				-	unsigned *cnt = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]);
			
 
				-
			
 
				-	/* How many blocks do we use ? */ 
			
 
				-	unsigned nblocks = 128; // TODO
			
 
				-
			
 
				-	STARPU_ASSERT(nblocks <= MAXNBLOCKS);
			
 
				-	
			
 
				-	unsigned *per_block_cnt;
			
 
				-	cudaMalloc((void **)&per_block_cnt, nblocks*sizeof(unsigned));
			
 
				-
			
 
				-	STARPU_ASSERT((nx % nblocks) == 0);
			
 
				-
			
 
				-	/* How many threads per block ? At most 256, but no more threads than
			
 
				-	 * there are entries to process per block. */
			
 
				-	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
			
 
				-
			
 
				-	/* each entry of per_block_cnt contains the number of successful shots
			
 
				-	 * in the corresponding block. */
			
 
				-	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
			
 
				-
			
 
				-	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
			
 
				-
			
 
				-	/* compute the total number of successful shots by adding the elements
			
 
				-	 * of the per_block_cnt array */
			
 
				-	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
			
 
				-	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				-	if (cures)
			
 
				-		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-
			
 
				-	cudaFree(per_block_cnt);
			
 
				-	cudaFree(random_numbers);
			
 
				-}
			
--- a/examples/pi/pi_redux.c
+++ b/examples/pi/pi_redux.c
@@ -1,318 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include <starpu.h>
			
 
				-#include <stdlib.h>
			
 
				-#include <sys/time.h>
			
 
				-#include <starpu_config.h>
			
 
				-
			
 
				-#define PI	3.14159265358979323846
			
 
				-
			
 
				-#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CURAND)
			
 
				-#warning CURAND is required to run that example on CUDA devices
			
 
				-#endif
			
 
				-
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-#include <cuda.h>
			
 
				-#include <curand.h>
			
 
				-#include <starpu_cuda.h>
			
 
				-#endif
			
 
				-
			
 
				-#define NSHOT_PER_TASK	(1024*1024)
			
 
				-
			
 
				-/* default value */
			
 
				-static unsigned long ntasks = 1024;
			
 
				-
			
 
				-/*
			
 
				- *	Initialization of the Random Number Generators (RNG)
			
 
				- */
			
 
				-
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-/* RNG for the CURAND library */
			
 
				-static curandGenerator_t curandgens[STARPU_NMAXWORKERS];
			
 
				-#endif 
			
 
				-
			
 
				-/* state for the erand48 function */
			
 
				-static unsigned short xsubi[3*STARPU_NMAXWORKERS];
			
 
				-
			
 
				-/* Function to initialize the random number generator in the current worker */
			
 
				-static void init_rng(void *arg __attribute__((unused)))
			
 
				-{
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-	curandStatus_t res;
			
 
				-#endif
			
 
				-
			
 
				-	int workerid = starpu_worker_get_id();
			
 
				-
			
 
				-	switch (starpu_worker_get_type(workerid)) {
			
 
				-		case STARPU_CPU_WORKER:
			
 
				-			/* create a seed */
			
 
				-			xsubi[0 + 3*workerid] = (unsigned short)workerid;
			
 
				-			xsubi[1 + 3*workerid] = (unsigned short)workerid;
			
 
				-			xsubi[2 + 3*workerid] = (unsigned short)workerid;
			
 
				-			break;
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-		case STARPU_CUDA_WORKER:
			
 
				-
			
 
				-			/* Create a RNG */
			
 
				-			res = curandCreateGenerator(&curandgens[workerid],
			
 
				-						CURAND_RNG_PSEUDO_DEFAULT);
			
 
				-			STARPU_ASSERT(res == CURAND_STATUS_SUCCESS);
			
 
				-
			
 
				-			/* Seed it with worker's id */
			
 
				-			res = curandSetPseudoRandomGeneratorSeed(curandgens[workerid],
			
 
				-							(unsigned long long)workerid);
			
 
				-			STARPU_ASSERT(res == CURAND_STATUS_SUCCESS);
			
 
				-			break;
			
 
				-#endif
			
 
				-		default:
			
 
				-			STARPU_ABORT();
			
 
				-			break;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-static void parse_args(int argc, char **argv)
			
 
				-{
			
 
				-	int i;
			
 
				-	for (i = 1; i < argc; i++) {
			
 
				-		if (strcmp(argv[i], "-ntasks") == 0) {
			
 
				-			char *argptr;
			
 
				-			ntasks = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- *	Monte-carlo kernel
			
 
				- */
			
 
				-
			
 
				-static void pi_func_cpu(void *descr[], void *cl_arg __attribute__ ((unused)))
			
 
				-{
			
 
				-	int workerid = starpu_worker_get_id();
			
 
				-
			
 
				-	unsigned short *worker_xsub;
			
 
				-	worker_xsub = &xsubi[3*workerid];
			
 
				-
			
 
				-	unsigned long local_cnt = 0;
			
 
				-
			
 
				-	/* Fill the scratchpad with random numbers */
			
 
				-	int i;
			
 
				-	for (i = 0; i < NSHOT_PER_TASK; i++)
			
 
				-	{
			
 
				-		float x = (float)(2.0*starpu_erand48(worker_xsub) - 1.0);
			
 
				-		float y = (float)(2.0*starpu_erand48(worker_xsub) - 1.0);
			
 
				-
			
 
				-		float dist = x*x + y*y;
			
 
				-		if (dist < 1.0)
			
 
				-			local_cnt++;
			
 
				-	}
			
 
				-
			
 
				-	/* Put the contribution of that task into the counter */
			
 
				-	unsigned long *cnt = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
			
 
				-	*cnt = *cnt + local_cnt;
			
 
				-}
			
 
				-
			
 
				-extern void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned long *shot_cnt);
			
 
				-
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-static void pi_func_cuda(void *descr[], void *cl_arg __attribute__ ((unused)))
			
 
				-{
			
 
				-	cudaError_t cures;
			
 
				-	curandStatus_t res;	
			
 
				-
			
 
				-	int workerid = starpu_worker_get_id();
			
 
				-
			
 
				-	/* CURAND is a bit silly: it assumes that any error is fatal. Calling
			
 
				-	 * cudaGetLastError resets the last error value. */
			
 
				-	cures = cudaGetLastError();
			
 
				-//	if (cures)
			
 
				-//		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-
			
 
				-	/* Fill the scratchpad with random numbers. Note that both x and y
			
 
				-	 * arrays are in stored the same vector. */
			
 
				-	float *scratchpad_xy = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				-	res = curandGenerateUniform(curandgens[workerid], scratchpad_xy, 2*NSHOT_PER_TASK);
			
 
				-	STARPU_ASSERT(res == CURAND_STATUS_SUCCESS);
			
 
				-
			
 
				-	float *x = &scratchpad_xy[0];
			
 
				-	float *y = &scratchpad_xy[NSHOT_PER_TASK];
			
 
				-
			
 
				-	unsigned long *shot_cnt = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
			
 
				-	pi_redux_cuda_kernel(x, y, NSHOT_PER_TASK, shot_cnt);
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-static struct starpu_codelet_t pi_cl = {
			
 
				-	.where =
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-		STARPU_CUDA|
			
 
				-#endif
			
 
				-		STARPU_CPU,
			
 
				-	.cpu_func = pi_func_cpu,
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-	.cuda_func = pi_func_cuda,
			
 
				-#endif
			
 
				-	.nbuffers = 2,
			
 
				-	.model = NULL
			
 
				-};
			
 
				-
			
 
				-/*
			
 
				- *	Codelets to implement reduction
			
 
				- */
			
 
				-
			
 
				-static void init_cpu_func(void *descr[], void *cl_arg)
			
 
				-{
			
 
				-        unsigned long *val = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				-        *val = 0;
			
 
				-}
			
 
				-
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-static void init_cuda_func(void *descr[], void *cl_arg)
			
 
				-{
			
 
				-        unsigned long *val = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				-        cudaMemset(val, 0, sizeof(unsigned long));
			
 
				-        cudaThreadSynchronize();
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-static struct starpu_codelet_t init_codelet = {
			
 
				-	.where =
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-		STARPU_CUDA|
			
 
				-#endif
			
 
				-		STARPU_CPU,
			
 
				-        .cpu_func = init_cpu_func,
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-        .cuda_func = init_cuda_func,
			
 
				-#endif
			
 
				-        .nbuffers = 1
			
 
				-};
			
 
				-
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-/* Dummy implementation of the addition of two unsigned longs in CUDA */
			
 
				-static void redux_cuda_func(void *descr[], void *cl_arg)
			
 
				-{
			
 
				-	unsigned long *d_a = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				-	unsigned long *d_b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
			
 
				-
			
 
				-	unsigned long h_a, h_b;
			
 
				-	
			
 
				-	cudaMemcpy(&h_a, d_a, sizeof(h_a), cudaMemcpyDeviceToHost);
			
 
				-	cudaMemcpy(&h_b, d_b, sizeof(h_b), cudaMemcpyDeviceToHost);
			
 
				-
			
 
				-	h_a += h_b;
			
 
				-
			
 
				-	cudaMemcpy(d_a, &h_a, sizeof(h_a), cudaMemcpyHostToDevice);
			
 
				-};
			
 
				-#endif
			
 
				-
			
 
				-static void redux_cpu_func(void *descr[], void *cl_arg)
			
 
				-{
			
 
				-	unsigned long *a = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				-	unsigned long *b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
			
 
				-
			
 
				-	*a = *a + *b;
			
 
				-};
			
 
				-
			
 
				-static struct starpu_codelet_t redux_codelet = {
			
 
				-	.where =
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-		STARPU_CUDA|
			
 
				-#endif
			
 
				-		STARPU_CPU,
			
 
				-	.cpu_func = redux_cpu_func,
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-	.cuda_func = redux_cuda_func,
			
 
				-#endif
			
 
				-	.nbuffers = 2
			
 
				-};
			
 
				-
			
 
				-/*
			
 
				- *	Main program
			
 
				- */
			
 
				-
			
 
				-int main(int argc, char **argv)
			
 
				-{
			
 
				-	unsigned i;
			
 
				-
			
 
				-	parse_args(argc, argv);
			
 
				-
			
 
				-	starpu_init(NULL);
			
 
				-
			
 
				-	/* Launch a Random Number Generator (RNG) on each worker */
			
 
				-	starpu_execute_on_each_worker(init_rng, NULL, STARPU_CPU|STARPU_CUDA);
			
 
				-
			
 
				-	/* Create a scratchpad data */
			
 
				-	starpu_data_handle xy_scratchpad_handle;
			
 
				-	starpu_vector_data_register(&xy_scratchpad_handle, -1, (uintptr_t)NULL,
			
 
				-		2*NSHOT_PER_TASK, sizeof(float));
			
 
				-
			
 
				-	/* Create a variable that will be used to count the number of shots
			
 
				-	 * that actually hit the unit circle when shooting randomly in
			
 
				-	 * [-1,1]^2. */
			
 
				-	unsigned long shot_cnt = 0;
			
 
				-	starpu_data_handle shot_cnt_handle;
			
 
				-	starpu_variable_data_register(&shot_cnt_handle, 0,
			
 
				-			(uintptr_t)&shot_cnt, sizeof(shot_cnt));
			
 
				-
			
 
				-	starpu_data_set_reduction_methods(shot_cnt_handle,
			
 
				-					&redux_codelet, &init_codelet);
			
 
				-
			
 
				-	struct timeval start;
			
 
				-	struct timeval end;
			
 
				-
			
 
				-	gettimeofday(&start, NULL);
			
 
				-
			
 
				-	for (i = 0; i < ntasks; i++)
			
 
				-	{
			
 
				-		struct starpu_task *task = starpu_task_create();
			
 
				-
			
 
				-		task->cl = &pi_cl;
			
 
				-
			
 
				-		task->buffers[0].handle = xy_scratchpad_handle;
			
 
				-		task->buffers[0].mode   = STARPU_SCRATCH;
			
 
				-		task->buffers[1].handle = shot_cnt_handle;
			
 
				-		task->buffers[1].mode   = STARPU_REDUX;
			
 
				-
			
 
				-		int ret = starpu_task_submit(task);
			
 
				-		STARPU_ASSERT(!ret);
			
 
				-	}
			
 
				-
			
 
				-	starpu_data_acquire(shot_cnt_handle, STARPU_R);
			
 
				-
			
 
				-	gettimeofday(&end, NULL);
			
 
				-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4,
			
 
				-	 * probability to impact the disk: pi/4 */
			
 
				-	unsigned long total = ntasks*NSHOT_PER_TASK;
			
 
				-	double pi_approx = ((double)shot_cnt*4.0)/total;
			
 
				-
			
 
				-	starpu_data_release(shot_cnt_handle);
			
 
				-
			
 
				-
			
 
				-	fprintf(stderr, "Pi approximation : %lf (%ld / %ld)\n", pi_approx, shot_cnt, total);
			
 
				-	fprintf(stderr, "Error %le \n", pi_approx - PI);
			
 
				-	fprintf(stderr, "Total time : %f ms\n", timing/1000.0);
			
 
				-	fprintf(stderr, "Speed : %f GShot/s\n", total/(1e3*timing));
			
 
				-
			
 
				-	starpu_data_unregister(shot_cnt_handle);
			
 
				-	starpu_shutdown();
			
 
				-
			
 
				-	if (abs(pi_approx - PI) > 1.0)
			
 
				-		return 1;
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
--- a/examples/pi/pi_redux_kernel.cu
+++ b/examples/pi/pi_redux_kernel.cu
@@ -1,128 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include <starpu.h>
			
 
				-#include <starpu_cuda.h>
			
 
				-
			
 
				-#define MAXNBLOCKS	128
			
 
				-#define MAXTHREADSPERBLOCK	256
			
 
				-
			
 
				-static __global__ void monte_carlo(float *x, float *y, unsigned n, unsigned long *output_cnt)
			
 
				-{
			
 
				-	__shared__ unsigned scnt[MAXTHREADSPERBLOCK];
			
 
				-
			
 
				-	/* Do we have a successful shot ? */
			
 
				-	const int tid = threadIdx.x + blockIdx.x*blockDim.x;
			
 
				-
			
 
				-	const int nthreads = gridDim.x * blockDim.x;
			
 
				-
			
 
				-	/* Blank the shared mem buffer */
			
 
				-	if (threadIdx.x < MAXTHREADSPERBLOCK)
			
 
				-		scnt[threadIdx.x] = 0;
			
 
				-
			
 
				-	__syncthreads();
			
 
				-	int ind;
			
 
				-	for (ind = tid; ind < n; ind += nthreads)
			
 
				-	{ 
			
 
				-		float xval = (2.0f * x[ind] - 1.0f);
			
 
				-		float yval = (2.0f * y[ind] - 1.0f);
			
 
				-		float dist = (xval*xval + yval*yval);
			
 
				-
			
 
				-		unsigned long success = (dist <= 1.0f)?1:0;
			
 
				-
			
 
				-		scnt[threadIdx.x] += success;
			
 
				-
			
 
				-	}
			
 
				-
			
 
				-	__syncthreads();
			
 
				-
			
 
				-	/* Perform a reduction to compute the sum on each thread within that block */
			
 
				-
			
 
				-	/* NB: We assume that the number of threads per block is a power of 2 ! */
			
 
				-	unsigned long s;
			
 
				-	for (s = blockDim.x/2; s!=0; s>>=1)
			
 
				-	{
			
 
				-		if (threadIdx.x < s)
			
 
				-			scnt[threadIdx.x] += scnt[threadIdx.x + s];
			
 
				-
			
 
				-		__syncthreads();
			
 
				-	}
			
 
				-
			
 
				-	/* report the number of successful shots in the block */
			
 
				-	if (threadIdx.x == 0)
			
 
				-		output_cnt[blockIdx.x] = scnt[0];
			
 
				-
			
 
				-	__syncthreads();
			
 
				-}
			
 
				-
			
 
				-static __global__ void sum_per_block_cnt(unsigned long *output_cnt, unsigned long *cnt)
			
 
				-{
			
 
				-	__shared__ unsigned long accumulator[MAXNBLOCKS];
			
 
				-
			
 
				-	unsigned i;
			
 
				-
			
 
				-	/* Load the values from global mem */
			
 
				-	for (i = 0; i < blockDim.x; i++)
			
 
				-		accumulator[i] = output_cnt[i];
			
 
				-
			
 
				-	__syncthreads();
			
 
				-
			
 
				-	/* Perform a reduction in shared memory */
			
 
				-	unsigned s;
			
 
				-	for (s = blockDim.x/2; s!=0; s>>=1)
			
 
				-	{
			
 
				-		if (threadIdx.x < s)
			
 
				-			accumulator[threadIdx.x] += accumulator[threadIdx.x + s];
			
 
				-
			
 
				-		__syncthreads();
			
 
				-	}
			
 
				-
			
 
				-	/* Save the result in global memory */
			
 
				-	if (threadIdx.x == 0)
			
 
				-		*cnt = *cnt + accumulator[0];
			
 
				-}
			
 
				-
			
 
				-extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned long *shot_cnt)
			
 
				-{
			
 
				-	cudaError_t cures;
			
 
				-
			
 
				-	/* How many blocks do we use ? */ 
			
 
				-	unsigned nblocks = 128; // TODO
			
 
				-	STARPU_ASSERT(nblocks <= MAXNBLOCKS);
			
 
				-	STARPU_ASSERT((n % nblocks) == 0);
			
 
				-	
			
 
				-	unsigned long *per_block_cnt;
			
 
				-	cudaMalloc((void **)&per_block_cnt, nblocks*sizeof(unsigned long));
			
 
				-
			
 
				-	/* How many threads per block ? At most 256, but no more threads than
			
 
				-	 * there are entries to process per block. */
			
 
				-	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (n / nblocks));
			
 
				-
			
 
				-	/* each entry of per_block_cnt contains the number of successful shots
			
 
				-	 * in the corresponding block. */
			
 
				-	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
			
 
				-
			
 
				-	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
			
 
				-
			
 
				-	/* compute the total number of successful shots by adding the elements
			
 
				-	 * of the per_block_cnt array */
			
 
				-	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
			
 
				-	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				-	if (cures)
			
 
				-		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-
			
 
				-	cudaFree(per_block_cnt);
			
 
				-}
			
--- a/examples/spmv/dw_spmv.c
+++ b/examples/spmv/dw_spmv.c
@@ -1,350 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-/*
			
 
				- * Conjugate gradients for Sparse matrices
			
 
				- */
			
 
				-
			
 
				-#include "dw_spmv.h"
			
 
				-
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-extern void spmv_kernel_cuda(void *descr[], void *args);
			
 
				-#endif
			
 
				-
			
 
				-struct timeval start;
			
 
				-struct timeval end;
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-#include "starpu_opencl.h"
			
 
				-struct starpu_opencl_program opencl_codelet;
			
 
				-void spmv_kernel_opencl(void *descr[], void *args)
			
 
				-{
			
 
				-	cl_kernel kernel;
			
 
				-	cl_command_queue queue;
			
 
				-	cl_event event;
			
 
				-	int id, devid, err, n;
			
 
				-
			
 
				-	uint32_t nnz = STARPU_CSR_GET_NNZ(descr[0]);
			
 
				-	uint32_t nrow = STARPU_CSR_GET_NROW(descr[0]);
			
 
				-	float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]);
			
 
				-	uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
			
 
				-	uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
			
 
				-	uint32_t firstentry = STARPU_CSR_GET_FIRSTENTRY(descr[0]);
			
 
				-
			
 
				-	float *vecin = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
			
 
				-	uint32_t nx_in = STARPU_VECTOR_GET_NX(descr[1]);
			
 
				-
			
 
				-	float *vecout = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
			
 
				-	uint32_t nx_out = STARPU_VECTOR_GET_NX(descr[2]);
			
 
				-
			
 
				-        id = starpu_worker_get_id();
			
 
				-        devid = starpu_worker_get_devid(id);
			
 
				-
			
 
				-        err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_codelet, "spvm", devid);
			
 
				-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	err = 0;
			
 
				-        n=0;
			
 
				-	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nnz);
			
 
				-	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nrow);
			
 
				-	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &nzval);
			
 
				-	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &colind);
			
 
				-	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &rowptr);
			
 
				-	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &firstentry);
			
 
				-	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &vecin);
			
 
				-	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nx_in);
			
 
				-	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &vecout);
			
 
				-	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nx_out);
			
 
				-        if (err) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	{
			
 
				-                size_t global=1024;
			
 
				-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
			
 
				-		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-	}
			
 
				-
			
 
				-	clFinish(queue);
			
 
				-	starpu_opencl_collect_stats(event);
			
 
				-	clReleaseEvent(event);
			
 
				-
			
 
				-        starpu_opencl_release_kernel(kernel);
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-unsigned nblocks = 4;
			
 
				-uint32_t size = 4*1024*1024;
			
 
				-
			
 
				-starpu_data_handle sparse_matrix;
			
 
				-starpu_data_handle vector_in, vector_out;
			
 
				-
			
 
				-float *sparse_matrix_nzval;
			
 
				-uint32_t *sparse_matrix_colind;
			
 
				-uint32_t *sparse_matrix_rowptr;
			
 
				-
			
 
				-float *vector_in_ptr;
			
 
				-float *vector_out_ptr;
			
 
				-
			
 
				-static void parse_args(int argc, char **argv)
			
 
				-{
			
 
				-	int i;
			
 
				-	for (i = 1; i < argc; i++) {
			
 
				-		if (strcmp(argv[i], "-size") == 0) {
			
 
				-			char *argptr;
			
 
				-			size = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-nblocks") == 0) {
			
 
				-			char *argptr;
			
 
				-			nblocks = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-static void cpu_spmv(void *descr[], __attribute__((unused))  void *arg)
			
 
				-{
			
 
				-	float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]);
			
 
				-	uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
			
 
				-	uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
			
 
				-
			
 
				-	float *vecin = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
			
 
				-	float *vecout = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
			
 
				-
			
 
				-	uint32_t firstelem = STARPU_CSR_GET_FIRSTENTRY(descr[0]);
			
 
				-
			
 
				-	uint32_t nnz;
			
 
				-	uint32_t nrow;
			
 
				-
			
 
				-	nnz = STARPU_CSR_GET_NNZ(descr[0]);
			
 
				-	nrow = STARPU_CSR_GET_NROW(descr[0]);
			
 
				-
			
 
				-	//STARPU_ASSERT(nrow == STARPU_VECTOR_GET_NX(descr[1]));
			
 
				-	STARPU_ASSERT(nrow == STARPU_VECTOR_GET_NX(descr[2]));
			
 
				-
			
 
				-	unsigned row;
			
 
				-	for (row = 0; row < nrow; row++)
			
 
				-	{
			
 
				-		float tmp = 0.0f;
			
 
				-		unsigned index;
			
 
				-
			
 
				-		unsigned firstindex = rowptr[row] - firstelem;
			
 
				-		unsigned lastindex = rowptr[row+1] - firstelem;
			
 
				-
			
 
				-		for (index = firstindex; index < lastindex; index++)
			
 
				-		{
			
 
				-			unsigned col;
			
 
				-
			
 
				-			col = colind[index];
			
 
				-			tmp += nzval[index]*vecin[col];
			
 
				-		}
			
 
				-
			
 
				-		vecout[row] = tmp;
			
 
				-	}
			
 
				-
			
 
				-}
			
 
				-
			
 
				-static void create_data(void)
			
 
				-{
			
 
				-	/* we need a sparse symetric (definite positive ?) matrix and a "dense" vector */
			
 
				-	
			
 
				-	/* example of 3-band matrix */
			
 
				-	float *nzval;
			
 
				-	uint32_t nnz;
			
 
				-	uint32_t *colind;
			
 
				-	uint32_t *rowptr;
			
 
				-
			
 
				-	nnz = 3*size-2;
			
 
				-
			
 
				-	nzval = malloc(nnz*sizeof(float));
			
 
				-	colind = malloc(nnz*sizeof(uint32_t));
			
 
				-	rowptr = malloc((size+1)*sizeof(uint32_t));
			
 
				-
			
 
				-	assert(nzval);
			
 
				-	assert(colind);
			
 
				-	assert(rowptr);
			
 
				-
			
 
				-	/* fill the matrix */
			
 
				-	unsigned row;
			
 
				-	unsigned pos = 0;
			
 
				-	for (row = 0; row < size; row++)
			
 
				-	{
			
 
				-		rowptr[row] = pos;
			
 
				-
			
 
				-		if (row > 0) {
			
 
				-			nzval[pos] = 1.0f;
			
 
				-			colind[pos] = row-1;
			
 
				-			pos++;
			
 
				-		}
			
 
				-		
			
 
				-		nzval[pos] = 5.0f;
			
 
				-		colind[pos] = row;
			
 
				-		pos++;
			
 
				-
			
 
				-		if (row < size - 1) {
			
 
				-			nzval[pos] = 1.0f;
			
 
				-			colind[pos] = row+1;
			
 
				-			pos++;
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	STARPU_ASSERT(pos == nnz);
			
 
				-
			
 
				-	rowptr[size] = nnz;
			
 
				-	
			
 
				-	starpu_csr_data_register(&sparse_matrix, 0, nnz, size, (uintptr_t)nzval, colind, rowptr, 0, sizeof(float));
			
 
				-
			
 
				-	sparse_matrix_nzval = nzval;
			
 
				-	sparse_matrix_colind = colind;
			
 
				-	sparse_matrix_rowptr = rowptr;
			
 
				-
			
 
				-	/* initiate the 2 vectors */
			
 
				-	float *invec, *outvec;
			
 
				-	invec = malloc(size*sizeof(float));
			
 
				-	assert(invec);
			
 
				-
			
 
				-	outvec = malloc(size*sizeof(float));
			
 
				-	assert(outvec);
			
 
				-
			
 
				-	/* fill those */
			
 
				-	unsigned ind;
			
 
				-	for (ind = 0; ind < size; ind++)
			
 
				-	{
			
 
				-		invec[ind] = 2.0f;
			
 
				-		outvec[ind] = 0.0f;
			
 
				-	}
			
 
				-
			
 
				-	starpu_vector_data_register(&vector_in, 0, (uintptr_t)invec, size, sizeof(float));
			
 
				-	starpu_vector_data_register(&vector_out, 0, (uintptr_t)outvec, size, sizeof(float));
			
 
				-
			
 
				-	vector_in_ptr = invec;
			
 
				-	vector_out_ptr = outvec;
			
 
				-
			
 
				-}
			
 
				-
			
 
				-void call_spmv_codelet_filters(void)
			
 
				-{
			
 
				-
			
 
				-	/* partition the data along a block distribution */
			
 
				-	struct starpu_data_filter csr_f, vector_f;
			
 
				-	csr_f.filter_func = starpu_vertical_block_filter_func_csr;
			
 
				-	csr_f.nchildren = nblocks;
			
 
				-	csr_f.get_nchildren = NULL;
			
 
				-	/* the children also use a csr interface */
			
 
				-	csr_f.get_child_ops = NULL;
			
 
				-
			
 
				-	vector_f.filter_func = starpu_block_filter_func_vector;
			
 
				-	vector_f.nchildren = nblocks;
			
 
				-	vector_f.get_nchildren = NULL;
			
 
				-	vector_f.get_child_ops = NULL;
			
 
				-
			
 
				-	starpu_data_partition(sparse_matrix, &csr_f);
			
 
				-	starpu_data_partition(vector_out, &vector_f);
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-        {
			
 
				-                int ret = starpu_opencl_load_opencl_from_file("examples/spmv/spmv_opencl.cl", &opencl_codelet);
			
 
				-                if (ret)
			
 
				-		{
			
 
				-			fprintf(stderr, "Failed to compile OpenCL codelet\n");
			
 
				-			exit(ret);
			
 
				-		}
			
 
				-        }
			
 
				-#endif
			
 
				-
			
 
				-	starpu_codelet cl = {};
			
 
				-
			
 
				-	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
			
 
				-	cl.cpu_func =  cpu_spmv;
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	cl.cuda_func = spmv_kernel_cuda;
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-        cl.opencl_func = spmv_kernel_opencl;
			
 
				-#endif
			
 
				-	cl.nbuffers = 3;
			
 
				-	cl.model = NULL;
			
 
				-
			
 
				-	gettimeofday(&start, NULL);
			
 
				-
			
 
				-	unsigned part;
			
 
				-	for (part = 0; part < nblocks; part++)
			
 
				-	{
			
 
				-		struct starpu_task *task = starpu_task_create();
			
 
				-                int ret;
			
 
				-
			
 
				-		task->callback_func = NULL;
			
 
				-
			
 
				-		task->cl = &cl;
			
 
				-		task->cl_arg = NULL;
			
 
				-	
			
 
				-		task->buffers[0].handle = starpu_data_get_sub_data(sparse_matrix, 1, part);
			
 
				-		task->buffers[0].mode  = STARPU_R;
			
 
				-		task->buffers[1].handle = vector_in;
			
 
				-		task->buffers[1].mode = STARPU_R;
			
 
				-		task->buffers[2].handle = starpu_data_get_sub_data(vector_out, 1, part);
			
 
				-		task->buffers[2].mode = STARPU_W;
			
 
				-	
			
 
				-		ret = starpu_task_submit(task);
			
 
				-		if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				-		{
			
 
				-			fprintf(stderr, "No worker may execute this task\n");
			
 
				-			exit(0);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	starpu_task_wait_for_all();
			
 
				-
			
 
				-	gettimeofday(&end, NULL);
			
 
				-
			
 
				-	starpu_data_unpartition(sparse_matrix, 0);
			
 
				-	starpu_data_unpartition(vector_out, 0);
			
 
				-}
			
 
				-
			
 
				-static void print_results(void)
			
 
				-{
			
 
				-	unsigned row;
			
 
				-
			
 
				-	for (row = 0; row < STARPU_MIN(size, 16); row++)
			
 
				-	{
			
 
				-		printf("%2.2f\t%2.2f\n", vector_in_ptr[row], vector_out_ptr[row]);
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-int main(__attribute__ ((unused)) int argc,
			
 
				-	__attribute__ ((unused)) char **argv)
			
 
				-{
			
 
				-	parse_args(argc, argv);
			
 
				-
			
 
				-	/* start the runtime */
			
 
				-	starpu_init(NULL);
			
 
				-
			
 
				-	/* create the sparse input matrix */
			
 
				-	create_data();
			
 
				-
			
 
				-	/* create a new codelet that will perform a SpMV on it */
			
 
				-	call_spmv_codelet_filters();
			
 
				-
			
 
				-	starpu_shutdown();
			
 
				-
			
 
				-	print_results();
			
 
				-
			
 
				-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-	fprintf(stderr, "Computation took (in ms)\n");
			
 
				-	printf("%2.2f\n", timing/1000);
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
--- a/examples/spmv/dw_spmv.h
+++ b/examples/spmv/dw_spmv.h
@@ -1,32 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#ifndef __DW_SPARSE_CG_H__
			
 
				-#define __DW_SPARSE_CG_H__
			
 
				-
			
 
				-#include <sys/types.h>
			
 
				-#include <semaphore.h>
			
 
				-#include <string.h>
			
 
				-#include <stdint.h>
			
 
				-#include <math.h>
			
 
				-#include <sys/time.h>
			
 
				-#include <pthread.h>
			
 
				-#include <signal.h>
			
 
				-
			
 
				-#include <starpu.h>
			
 
				-
			
 
				-#endif // __DW_SPARSE_CG_H__