Andra Hugo 14 年之前
父節點
當前提交
ec83e69660

File diff suppressed because it is too large
+ 0 - 50
examples/pi/SobolQRNG/CforCUDA_SDK_license.txt


+ 0 - 60
examples/pi/SobolQRNG/sobol.h

@@ -1,60 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-/*
- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
- *
- * NVIDIA Corporation and its licensors retain all intellectual property and 
- * proprietary rights in and to this software and related documentation and 
- * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
- * of this software and related documentation without an express license 
- * agreement from NVIDIA Corporation is strictly prohibited.
- * 
- */
- 
- /*
- * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
- * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
- * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
- *
- * Sobol Quasi-random Number Generator example
- *
- * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
- * http://people.maths.ox.ac.uk/~gilesm/
- *
- * and C code developed by Stephen Joe, University of Waikato, New Zealand
- * and Frances Kuo, University of New South Wales, Australia
- * http://web.maths.unsw.edu.au/~fkuo/sobol/
- *
- * For theoretical background see:
- *
- * P. Bratley and B.L. Fox.
- * Implementing Sobol's quasirandom sequence generator
- * http://portal.acm.org/citation.cfm?id=42288
- * ACM Trans. on Math. Software, 14(1):88-100, 1988
- *
- * S. Joe and F. Kuo.
- * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
- * http://portal.acm.org/citation.cfm?id=641879
- * ACM Trans. on Math. Software, 29(1):49-57, 2003
- */
-
-#ifndef SOBOL_H
-#define SOBOL_H
-
-// Number of direction vectors is fixed to 32
-#define n_directions 32
-
-#endif

+ 0 - 141
examples/pi/SobolQRNG/sobol_gold.c

@@ -1,141 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-/*
- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
- *
- * NVIDIA Corporation and its licensors retain all intellectual property and 
- * proprietary rights in and to this software and related documentation and 
- * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
- * of this software and related documentation without an express license 
- * agreement from NVIDIA Corporation is strictly prohibited.
- * 
- */
- 
- /*
- * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
- * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
- * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
- *
- * Sobol Quasi-random Number Generator example
- *
- * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
- * http://people.maths.ox.ac.uk/~gilesm/
- *
- * and C code developed by Stephen Joe, University of Waikato, New Zealand
- * and Frances Kuo, University of New South Wales, Australia
- * http://web.maths.unsw.edu.au/~fkuo/sobol/
- *
- * For theoretical background see:
- *
- * P. Bratley and B.L. Fox.
- * Implementing Sobol's quasirandom sequence generator
- * http://portal.acm.org/citation.cfm?id=42288
- * ACM Trans. on Math. Software, 14(1):88-100, 1988
- *
- * S. Joe and F. Kuo.
- * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
- * http://portal.acm.org/citation.cfm?id=641879
- * ACM Trans. on Math. Software, 29(1):49-57, 2003
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-
-#include "sobol.h"
-#include "sobol_gold.h"
-#include "sobol_primitives.h"
-
-#define k_2powneg32 2.3283064E-10F
-
-// Create the direction numbers, based on the primitive polynomials.
-void initSobolDirectionVectors(int n_dimensions, unsigned int *directions)
-{
-    unsigned int *v = directions;
-
-    int dim;
-    for (dim = 0 ; dim < n_dimensions ; dim++)
-    {
-        // First dimension is a special case
-        if (dim == 0)
-        {
-            int i;
-            for (i = 0 ; i < n_directions ; i++)
-            {
-                // All m's are 1
-                v[i] = 1 << (31 - i);
-            }
-        }
-        else
-        {
-            int d = sobol_primitives[dim].degree;
-            // The first direction numbers (up to the degree of the polynomial)
-            // are simply v[i] = m[i] / 2^i (stored in Q0.32 format)
-            int i;
-            for (i = 0 ; i < d ; i++)
-            {
-                v[i] = sobol_primitives[dim].m[i] << (31 - i);
-            }
-            // The remaining direction numbers are computed as described in
-            // the Bratley and Fox paper.
-            // v[i] = a[1]v[i-1] ^ a[2]v[i-2] ^ ... ^ a[v-1]v[i-d+1] ^ v[i-d] ^ v[i-d]/2^d
-            for (i = d ; i < n_directions ; i++)
-            {
-                // First do the v[i-d] ^ v[i-d]/2^d part
-                v[i] = v[i - d] ^ (v[i - d] >> d);
-                // Now do the a[1]v[i-1] ^ a[2]v[i-2] ^ ... part
-                // Note that the coefficients a[] are zero or one and for compactness in
-                // the input tables they are stored as bits of a single integer. To extract
-                // the relevant bit we use right shift and mask with 1.
-                // For example, for a 10 degree polynomial there are ten useful bits in a,
-                // so to get a[2] we need to right shift 7 times (to get the 8th bit into
-                // the LSB) and then mask with 1.
-                int j;
-                for (j = 1 ; j < d ; j++)
-                {
-                    v[i] ^= (((sobol_primitives[dim].a >> (d - 1 - j)) & 1) * v[i - j]);
-                }
-            }
-        }
-        v += n_directions;
-    }
-}
-
-// Reference model for generating Sobol numbers on the host
-void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output)
-{
-    unsigned int *v = directions;
-
-    int d;
-    for (d = 0 ; d < n_dimensions ; d++)
-    {
-        unsigned int X = 0;
-        // x[0] is zero (in all dimensions)
-        output[n_vectors * d] = 0.0;        
-        int i;
-        for (i = 1 ; i < n_vectors ; i++)
-        {
-            // x[i] = x[i-1] ^ v[c]
-            //  where c is the index of the rightmost zero bit in i
-            //  minus 1 (since C arrays count from zero)
-            // In the Bratley and Fox paper this is equation (**)
-            X ^= v[ffs(~(i - 1)) - 1];
-            output[i + n_vectors * d] = (float)X * k_2powneg32;
-        }
-        v += n_directions;
-    }
-}

+ 0 - 61
examples/pi/SobolQRNG/sobol_gold.h

@@ -1,61 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-/*
- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
- *
- * NVIDIA Corporation and its licensors retain all intellectual property and 
- * proprietary rights in and to this software and related documentation and 
- * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
- * of this software and related documentation without an express license 
- * agreement from NVIDIA Corporation is strictly prohibited.
- * 
- */
- 
- /*
- * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
- * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
- * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
- *
- * Sobol Quasi-random Number Generator example
- *
- * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
- * http://people.maths.ox.ac.uk/~gilesm/
- *
- * and C code developed by Stephen Joe, University of Waikato, New Zealand
- * and Frances Kuo, University of New South Wales, Australia
- * http://web.maths.unsw.edu.au/~fkuo/sobol/
- *
- * For theoretical background see:
- *
- * P. Bratley and B.L. Fox.
- * Implementing Sobol's quasirandom sequence generator
- * http://portal.acm.org/citation.cfm?id=42288
- * ACM Trans. on Math. Software, 14(1):88-100, 1988
- *
- * S. Joe and F. Kuo.
- * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
- * http://portal.acm.org/citation.cfm?id=641879
- * ACM Trans. on Math. Software, 29(1):49-57, 2003
- *
- */
-
-#ifndef SOBOL_GOLD_H
-#define SOBOL_GOLD_H
-
-void initSobolDirectionVectors(int n_dimensions, unsigned int *directions);
-void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output);
-
-#endif

+ 0 - 170
examples/pi/SobolQRNG/sobol_gpu.cu

@@ -1,170 +0,0 @@
-/*
- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
- *
- * NVIDIA Corporation and its licensors retain all intellectual property and 
- * proprietary rights in and to this software and related documentation and 
- * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
- * of this software and related documentation without an express license 
- * agreement from NVIDIA Corporation is strictly prohibited.
- * 
- */
- 
- /*
- * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
- * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
- * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
- *
- * Sobol Quasi-random Number Generator example
- *
- * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
- * http://people.maths.ox.ac.uk/~gilesm/
- *
- * and C code developed by Stephen Joe, University of Waikato, New Zealand
- * and Frances Kuo, University of New South Wales, Australia
- * http://web.maths.unsw.edu.au/~fkuo/sobol/
- *
- * For theoretical background see:
- *
- * P. Bratley and B.L. Fox.
- * Implementing Sobol's quasirandom sequence generator
- * http://portal.acm.org/citation.cfm?id=42288
- * ACM Trans. on Math. Software, 14(1):88-100, 1988
- *
- * S. Joe and F. Kuo.
- * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
- * http://portal.acm.org/citation.cfm?id=641879
- * ACM Trans. on Math. Software, 29(1):49-57, 2003
- *
- */
-
-#include "sobol.h"
-#include "sobol_gpu.h"
-#include <starpu.h>
-#include <starpu_cuda.h>
-
-#define k_2powneg32 2.3283064E-10F
-
-__global__ void sobolGPU_kernel(unsigned n_vectors, unsigned n_dimensions, unsigned *d_directions, float *d_output)
-{
-    __shared__ unsigned int v[n_directions];
-
-    // Offset into the correct dimension as specified by the
-    // block y coordinate
-    d_directions = d_directions + n_directions * blockIdx.y;
-    d_output = d_output +  n_vectors * blockIdx.y;
-
-    // Copy the direction numbers for this dimension into shared
-    // memory - there are only 32 direction numbers so only the
-    // first 32 (n_directions) threads need participate.
-    if (threadIdx.x < n_directions)
-    {
-	    v[threadIdx.x] = d_directions[threadIdx.x];
-    }
-    __syncthreads();
-
-    // Set initial index (i.e. which vector this thread is
-    // computing first) and stride (i.e. step to the next vector
-    // for this thread)
-    int i0     = threadIdx.x + blockIdx.x * blockDim.x;
-    int stride = gridDim.x * blockDim.x;
-
-    // Get the gray code of the index
-    // c.f. Numerical Recipes in C, chapter 20
-    // http://www.nrbook.com/a/bookcpdf/c20-2.pdf
-    unsigned int g = i0 ^ (i0 >> 1);
-
-    // Initialisation for first point x[i0]
-    // In the Bratley and Fox paper this is equation (*), where
-    // we are computing the value for x[n] without knowing the
-    // value of x[n-1].
-    unsigned int X = 0;
-    unsigned int mask;
-    for (unsigned int k = 0 ; k < __ffs(stride) - 1 ; k++)
-    {
-        // We want X ^= g_k * v[k], where g_k is one or zero.
-        // We do this by setting a mask with all bits equal to
-        // g_k. In reality we keep shifting g so that g_k is the
-        // LSB of g. This way we avoid multiplication.
-        mask = - (g & 1);
-        X ^= mask & v[k];
-        g = g >> 1;
-    }
-    if (i0 < n_vectors)
-    {
-        d_output[i0] = (float)X * k_2powneg32;
-    }
-
-    // Now do rest of points, using the stride
-    // Here we want to generate x[i] from x[i-stride] where we
-    // don't have any of the x in between, therefore we have to
-    // revisit the equation (**), this is easiest with an example
-    // so assume stride is 16.
-    // From x[n] to x[n+16] there will be:
-    //   8 changes in the first bit
-    //   4 changes in the second bit
-    //   2 changes in the third bit
-    //   1 change in the fourth
-    //   1 change in one of the remaining bits
-    //
-    // What this means is that in the equation:
-    //   x[n+1] = x[n] ^ v[p]
-    //   x[n+2] = x[n+1] ^ v[q] = x[n] ^ v[p] ^ v[q]
-    //   ...
-    // We will apply xor with v[1] eight times, v[2] four times,
-    // v[3] twice, v[4] once and one other direction number once.
-    // Since two xors cancel out, we can skip even applications
-    // and just apply xor with v[4] (i.e. log2(16)) and with
-    // the current applicable direction number.
-    // Note that all these indices count from 1, so we need to
-    // subtract 1 from them all to account for C arrays counting
-    // from zero.
-    unsigned int v_log2stridem1 = v[__ffs(stride) - 2];
-    unsigned int v_stridemask = stride - 1;
-    for (unsigned int i = i0 + stride ; i < n_vectors ; i += stride)
-    {
-        // x[i] = x[i-stride] ^ v[b] ^ v[c]
-        //  where b is log2(stride) minus 1 for C array indexing
-        //  where c is the index of the rightmost zero bit in i,
-        //  not including the bottom log2(stride) bits, minus 1
-        //  for C array indexing
-        // In the Bratley and Fox paper this is equation (**)
-        X ^= v_log2stridem1 ^ v[__ffs(~((i - stride) | v_stridemask)) - 1];
-        d_output[i] = (float)X * k_2powneg32;
-    }
-}
-
-extern "C"
-void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output)
-{
-    const int threadsperblock = 64;
-
-    // Set up the execution configuration
-    dim3 dimGrid;
-    dim3 dimBlock;
-
-    // This implementation of the generator outputs all the draws for
-    // one dimension in a contiguous region of memory, followed by the
-    // next dimension and so on.
-    // Therefore all threads within a block will be processing different
-    // vectors from the same dimension. As a result we want the total
-    // number of blocks to be a multiple of the number of dimensions.
-    dimGrid.y = n_dimensions;
-
-    // If the number of dimensions is large then we will set the number
-    // of blocks to equal the number of dimensions (i.e. dimGrid.x = 1)
-    // but if the number of dimensions is small (e.g. less than 32) then
-    // we'll partition the vectors across blocks (as well as threads).
-    // We also need to cap the dimGrid.x where the number of vectors
-    // is too small to be partitioned.
-    dimGrid.x = 1 + 31 / n_dimensions;
-    if (dimGrid.x > (unsigned int)(n_vectors / threadsperblock))
-    {
-        dimGrid.x = (n_vectors + threadsperblock - 1) / threadsperblock;
-    }
-    
-    // Fix the number of threads
-    dimBlock.x = threadsperblock;
-
-    // Execute GPU kernel
-    sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
-}

+ 0 - 61
examples/pi/SobolQRNG/sobol_gpu.h

@@ -1,61 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-/*
- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
- *
- * NVIDIA Corporation and its licensors retain all intellectual property and 
- * proprietary rights in and to this software and related documentation and 
- * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
- * of this software and related documentation without an express license 
- * agreement from NVIDIA Corporation is strictly prohibited.
- * 
- */
- 
- /*
- * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
- * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
- * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
- *
- * Sobol Quasi-random Number Generator example
- *
- * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
- * http://people.maths.ox.ac.uk/~gilesm/
- *
- * and C code developed by Stephen Joe, University of Waikato, New Zealand
- * and Frances Kuo, University of New South Wales, Australia
- * http://web.maths.unsw.edu.au/~fkuo/sobol/
- *
- * For theoretical background see:
- *
- * P. Bratley and B.L. Fox.
- * Implementing Sobol's quasirandom sequence generator
- * http://portal.acm.org/citation.cfm?id=42288
- * ACM Trans. on Math. Software, 14(1):88-100, 1988
- *
- * S. Joe and F. Kuo.
- * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
- * http://portal.acm.org/citation.cfm?id=641879
- * ACM Trans. on Math. Software, 29(1):49-57, 2003
- *
- */
-
-#ifndef SOBOL_GPU_H
-#define SOBOL_GPU_H
-
-extern "C"
-void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output);
-
-#endif

File diff suppressed because it is too large
+ 0 - 10271
examples/pi/SobolQRNG/sobol_primitives.c


+ 0 - 75
examples/pi/SobolQRNG/sobol_primitives.h

@@ -1,75 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-/*
- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
- *
- * NVIDIA Corporation and its licensors retain all intellectual property and 
- * proprietary rights in and to this software and related documentation and 
- * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
- * of this software and related documentation without an express license 
- * agreement from NVIDIA Corporation is strictly prohibited.
- * 
- */
- 
- /*
- * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
- * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
- * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
- *
- * Sobol Quasi-random Number Generator example
- *
- * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
- * http://people.maths.ox.ac.uk/~gilesm/
- *
- * and C code developed by Stephen Joe, University of Waikato, New Zealand
- * and Frances Kuo, University of New South Wales, Australia
- * http://web.maths.unsw.edu.au/~fkuo/sobol/
- *
- * For theoretical background see:
- *
- * P. Bratley and B.L. Fox.
- * Implementing Sobol's quasirandom sequence generator
- * http://portal.acm.org/citation.cfm?id=42288
- * ACM Trans. on Math. Software, 14(1):88-100, 1988
- *
- * S. Joe and F. Kuo.
- * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
- * http://portal.acm.org/citation.cfm?id=641879
- * ACM Trans. on Math. Software, 29(1):49-57, 2003
- *
- */
-
-#ifndef SOBOL_PRIMITIVES_H
-#define SOBOL_PRIMITIVES_H
-
-#define max_m 17
-
-// Each primitive is stored as a struct where
-//  dimension is the dimension number of the polynomial (unused)
-//  degree is the degree of the polynomial
-//  a is a binary word representing the coefficients 
-//  m is the array of m values
-struct primitive
-{
-    unsigned int dimension;
-    unsigned int degree;
-    unsigned int a;
-    unsigned int m[max_m];
-};
-
-extern const struct primitive sobol_primitives[];
-
-#endif

+ 0 - 177
examples/pi/pi.c

@@ -1,177 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "SobolQRNG/sobol.h"
-#include "SobolQRNG/sobol_gold.h"
-#include "pi.h"
-#include <sys/time.h>
-
-#ifdef STARPU_USE_CUDA
-void cuda_kernel(void **descr, void *cl_arg);
-#endif
-
-/* default value */
-static unsigned ntasks = 1024;
-
-static void cpu_kernel(void *descr[], void *cl_arg)
-{
-	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
-	unsigned nx = NSHOT_PER_TASK;
-
-	TYPE *random_numbers = malloc(2*nx*sizeof(TYPE));
-	sobolCPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
-
-	TYPE *random_numbers_x = &random_numbers[0];
-	TYPE *random_numbers_y = &random_numbers[nx];
-
-	unsigned current_cnt = 0;
-
-	unsigned i;
-	for (i = 0; i < nx; i++)
-	{
-		TYPE x = random_numbers_x[i];
-		TYPE y = random_numbers_y[i];
-
-		TYPE dist = (x*x + y*y);
-
-		unsigned success = (dist <= 1.0);
-		current_cnt += success;
-	}
-
-	unsigned *cnt = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]);
-	*cnt = current_cnt;
-
-	free(random_numbers);
-}
-
-static void parse_args(int argc, char **argv)
-{
-	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-ntasks") == 0) {
-			char *argptr;
-			ntasks = strtol(argv[++i], &argptr, 10);
-		}
-	}
-}
-
-int main(int argc, char **argv)
-{
-	unsigned i;
-
-	parse_args(argc, argv);
-
-	starpu_init(NULL);
-
-	/* Initialize the random number generator */
-	unsigned *sobol_qrng_directions = malloc(n_dimensions*n_directions*sizeof(unsigned));
-	STARPU_ASSERT(sobol_qrng_directions);
-
-	initSobolDirectionVectors(n_dimensions, sobol_qrng_directions);
-
-	/* Any worker may use that array now */
-	starpu_data_handle sobol_qrng_direction_handle;
-	starpu_vector_data_register(&sobol_qrng_direction_handle, 0,
-		(uintptr_t)sobol_qrng_directions, n_dimensions*n_directions, sizeof(unsigned));
-
-	unsigned *cnt_array = malloc(ntasks*sizeof(unsigned));
-	STARPU_ASSERT(cnt_array);
-	starpu_data_handle cnt_array_handle;
-	starpu_vector_data_register(&cnt_array_handle, 0, (uintptr_t)cnt_array, ntasks, sizeof(unsigned));
-
-	/* Use a write-through policy : when the data is modified on an
-	 * accelerator, we know that it will only be modified once and be
-	 * accessed by the CPU later on */
-	starpu_data_set_wt_mask(cnt_array_handle, (1<<0));
-
-	struct starpu_data_filter f = {
-		.filter_func = starpu_block_filter_func_vector,
-		.nchildren = ntasks,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
-	};
-	
-	starpu_data_partition(cnt_array_handle, &f);
-
-	static struct starpu_perfmodel_t model = {
-		.type = STARPU_HISTORY_BASED,
-		.symbol = "monte_carlo_pi"
-	};
-
-	struct starpu_codelet_t cl = {
-		.where = STARPU_CPU|STARPU_CUDA,
-		.cpu_func = cpu_kernel,
-#ifdef STARPU_USE_CUDA
-		.cuda_func = cuda_kernel,
-#endif
-		.nbuffers = 2,
-		.model = &model
-	};
-
-	struct timeval start;
-	struct timeval end;
-
-	gettimeofday(&start, NULL);
-
-	for (i = 0; i < ntasks; i++)
-	{
-		struct starpu_task *task = starpu_task_create();
-
-		task->cl = &cl;
-
-		STARPU_ASSERT(starpu_data_get_sub_data(cnt_array_handle, 1, i));
-
-		task->buffers[0].handle = sobol_qrng_direction_handle;
-		task->buffers[0].mode   = STARPU_R;
-		task->buffers[1].handle = starpu_data_get_sub_data(cnt_array_handle, 1, i);
-		task->buffers[1].mode   = STARPU_W;
-
-		int ret = starpu_task_submit(task);
-		STARPU_ASSERT(!ret);
-	}
-
-	starpu_task_wait_for_all();
-
-	/* Get the cnt_array back in main memory */
-	starpu_data_unpartition(cnt_array_handle, 0);
-	starpu_data_acquire(cnt_array_handle, STARPU_RW);
-
-	/* Count the total number of entries */
-	unsigned long total_cnt = 0;
-	for (i = 0; i < ntasks; i++)
-		total_cnt += cnt_array[i];
-
-	gettimeofday(&end, NULL);
-
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-
-	unsigned long total_shot_cnt = ntasks * NSHOT_PER_TASK;
-
-	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4, probability to impact the disk: pi/4 */
-	fprintf(stderr, "Pi approximation : %f (%ld / %ld)\n", ((TYPE)total_cnt*4)/(total_shot_cnt), total_cnt, total_shot_cnt);
-	fprintf(stderr, "Total time : %f ms\n", timing/1000.0);
-	fprintf(stderr, "Speed : %f GShot/s\n", total_shot_cnt/(1e3*timing));
-
-	starpu_data_release(cnt_array_handle);
-
-	starpu_display_codelet_stats(&cl);
-
-	starpu_shutdown();
-
-	return 0;
-}

+ 0 - 33
examples/pi/pi.h

@@ -1,33 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __PI_H__
-#define __PI_H__
-
-#include <starpu.h>
-#include <starpu_cuda.h>
-#include <stdio.h>
-
-#define NSHOT_PER_TASK	(16*1024*1024ULL)
-
-#define TYPE	float
-
-//extern "C" void cuda_kernel(void *descr[], void *cl_arg);
-
-static int n_dimensions = 100;
-
-#endif // __PI_H__

+ 0 - 150
examples/pi/pi_kernel.cu

@@ -1,150 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "SobolQRNG/sobol_gpu.h"
-#include "pi.h"
-#include <starpu_cuda.h>
-
-#define MAXNBLOCKS	128
-#define MAXTHREADSPERBLOCK	256
-
-static __global__ void monte_carlo(TYPE *random_numbers_x, TYPE *random_numbers_y,
-						unsigned n, unsigned *output_cnt)
-{
-	__shared__ unsigned scnt[MAXTHREADSPERBLOCK];
-
-	/* Do we have a successful shot ? */
-	const int tid = threadIdx.x + blockIdx.x*blockDim.x;
-
-	const int nthreads = gridDim.x * blockDim.x;
-
-	/* Blank the shared mem buffer */
-	if (threadIdx.x < MAXTHREADSPERBLOCK)
-		scnt[threadIdx.x] = 0;
-
-	__syncthreads();
-	int ind;
-	for (ind = tid; ind < n; ind += nthreads)
-	{ 
-		TYPE x = random_numbers_x[ind];
-		TYPE y = random_numbers_y[ind];
-		TYPE dist = (x*x + y*y);
-
-		unsigned success = (dist <= 1.0f)?1:0;
-
-		scnt[threadIdx.x] += success;
-
-	}
-
-	__syncthreads();
-
-	/* Perform a reduction to compute the sum on each thread within that block */
-
-	/* NB: We assume that the number of threads per block is a power of 2 ! */
-	unsigned s;
-	for (s = blockDim.x/2; s!=0; s>>=1)
-	{
-		if (threadIdx.x < s)
-			scnt[threadIdx.x] += scnt[threadIdx.x + s];
-
-		__syncthreads();
-	}
-
-	/* report the number of successful shots in the block */
-	if (threadIdx.x == 0)
-		output_cnt[blockIdx.x] = scnt[0];
-
-	__syncthreads();
-}
-
-static __global__ void sum_per_block_cnt(unsigned *output_cnt, unsigned *cnt)
-{
-	__shared__ unsigned accumulator[MAXNBLOCKS];
-
-	unsigned i;
-
-	/* Load the values from global mem */
-	for (i = 0; i < blockDim.x; i++)
-		accumulator[i] = output_cnt[i];
-
-	__syncthreads();
-
-	/* Perform a reduction in shared memory */
-	unsigned s;
-	for (s = blockDim.x/2; s!=0; s>>=1)
-	{
-		if (threadIdx.x < s)
-			accumulator[threadIdx.x] += accumulator[threadIdx.x + s];
-
-		__syncthreads();
-	}
-
-	/* Save the result in global memory */
-	if (threadIdx.x == 0)
-		*cnt = accumulator[0];
-}
-
-extern "C" void cuda_kernel(void *descr[], void *cl_arg)
-{
-	cudaError_t cures;
-
-	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
-	unsigned nx = NSHOT_PER_TASK;
-
-	/* Generate Random numbers */
-	float *random_numbers;
-	cudaMalloc((void **)&random_numbers, 2*nx*sizeof(float));
-	STARPU_ASSERT(random_numbers);
-	
-	sobolGPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
-	TYPE *random_numbers_x = &random_numbers[0];
-	TYPE *random_numbers_y = &random_numbers[nx];
-
-	unsigned *cnt = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]);
-
-	/* How many blocks do we use ? */ 
-	unsigned nblocks = 128; // TODO
-
-	STARPU_ASSERT(nblocks <= MAXNBLOCKS);
-	
-	unsigned *per_block_cnt;
-	cudaMalloc((void **)&per_block_cnt, nblocks*sizeof(unsigned));
-
-	STARPU_ASSERT((nx % nblocks) == 0);
-
-	/* How many threads per block ? At most 256, but no more threads than
-	 * there are entries to process per block. */
-	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
-
-	/* each entry of per_block_cnt contains the number of successful shots
-	 * in the corresponding block. */
-	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
-
-	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
-
-	/* compute the total number of successful shots by adding the elements
-	 * of the per_block_cnt array */
-	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
-	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
-	if (cures)
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	cudaFree(per_block_cnt);
-	cudaFree(random_numbers);
-}

+ 0 - 318
examples/pi/pi_redux.c

@@ -1,318 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <starpu.h>
-#include <stdlib.h>
-#include <sys/time.h>
-#include <starpu_config.h>
-
-#define PI	3.14159265358979323846
-
-#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CURAND)
-#warning CURAND is required to run that example on CUDA devices
-#endif
-
-#ifdef STARPU_HAVE_CURAND
-#include <cuda.h>
-#include <curand.h>
-#include <starpu_cuda.h>
-#endif
-
-#define NSHOT_PER_TASK	(1024*1024)
-
-/* default value */
-static unsigned long ntasks = 1024;
-
-/*
- *	Initialization of the Random Number Generators (RNG)
- */
-
-#ifdef STARPU_HAVE_CURAND
-/* RNG for the CURAND library */
-static curandGenerator_t curandgens[STARPU_NMAXWORKERS];
-#endif 
-
-/* state for the erand48 function */
-static unsigned short xsubi[3*STARPU_NMAXWORKERS];
-
-/* Function to initialize the random number generator in the current worker */
-static void init_rng(void *arg __attribute__((unused)))
-{
-#ifdef STARPU_HAVE_CURAND
-	curandStatus_t res;
-#endif
-
-	int workerid = starpu_worker_get_id();
-
-	switch (starpu_worker_get_type(workerid)) {
-		case STARPU_CPU_WORKER:
-			/* create a seed */
-			xsubi[0 + 3*workerid] = (unsigned short)workerid;
-			xsubi[1 + 3*workerid] = (unsigned short)workerid;
-			xsubi[2 + 3*workerid] = (unsigned short)workerid;
-			break;
-#ifdef STARPU_HAVE_CURAND
-		case STARPU_CUDA_WORKER:
-
-			/* Create a RNG */
-			res = curandCreateGenerator(&curandgens[workerid],
-						CURAND_RNG_PSEUDO_DEFAULT);
-			STARPU_ASSERT(res == CURAND_STATUS_SUCCESS);
-
-			/* Seed it with worker's id */
-			res = curandSetPseudoRandomGeneratorSeed(curandgens[workerid],
-							(unsigned long long)workerid);
-			STARPU_ASSERT(res == CURAND_STATUS_SUCCESS);
-			break;
-#endif
-		default:
-			STARPU_ABORT();
-			break;
-	}
-}
-
-static void parse_args(int argc, char **argv)
-{
-	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-ntasks") == 0) {
-			char *argptr;
-			ntasks = strtol(argv[++i], &argptr, 10);
-		}
-	}
-}
-
-/*
- *	Monte-carlo kernel
- */
-
-static void pi_func_cpu(void *descr[], void *cl_arg __attribute__ ((unused)))
-{
-	int workerid = starpu_worker_get_id();
-
-	unsigned short *worker_xsub;
-	worker_xsub = &xsubi[3*workerid];
-
-	unsigned long local_cnt = 0;
-
-	/* Fill the scratchpad with random numbers */
-	int i;
-	for (i = 0; i < NSHOT_PER_TASK; i++)
-	{
-		float x = (float)(2.0*starpu_erand48(worker_xsub) - 1.0);
-		float y = (float)(2.0*starpu_erand48(worker_xsub) - 1.0);
-
-		float dist = x*x + y*y;
-		if (dist < 1.0)
-			local_cnt++;
-	}
-
-	/* Put the contribution of that task into the counter */
-	unsigned long *cnt = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
-	*cnt = *cnt + local_cnt;
-}
-
-extern void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned long *shot_cnt);
-
-#ifdef STARPU_HAVE_CURAND
-static void pi_func_cuda(void *descr[], void *cl_arg __attribute__ ((unused)))
-{
-	cudaError_t cures;
-	curandStatus_t res;	
-
-	int workerid = starpu_worker_get_id();
-
-	/* CURAND is a bit silly: it assumes that any error is fatal. Calling
-	 * cudaGetLastError resets the last error value. */
-	cures = cudaGetLastError();
-//	if (cures)
-//		STARPU_CUDA_REPORT_ERROR(cures);
-
-	/* Fill the scratchpad with random numbers. Note that both x and y
-	 * arrays are in stored the same vector. */
-	float *scratchpad_xy = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
-	res = curandGenerateUniform(curandgens[workerid], scratchpad_xy, 2*NSHOT_PER_TASK);
-	STARPU_ASSERT(res == CURAND_STATUS_SUCCESS);
-
-	float *x = &scratchpad_xy[0];
-	float *y = &scratchpad_xy[NSHOT_PER_TASK];
-
-	unsigned long *shot_cnt = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
-	pi_redux_cuda_kernel(x, y, NSHOT_PER_TASK, shot_cnt);
-}
-#endif
-
-static struct starpu_codelet_t pi_cl = {
-	.where =
-#ifdef STARPU_HAVE_CURAND
-		STARPU_CUDA|
-#endif
-		STARPU_CPU,
-	.cpu_func = pi_func_cpu,
-#ifdef STARPU_HAVE_CURAND
-	.cuda_func = pi_func_cuda,
-#endif
-	.nbuffers = 2,
-	.model = NULL
-};
-
-/*
- *	Codelets to implement reduction
- */
-
-static void init_cpu_func(void *descr[], void *cl_arg)
-{
-        unsigned long *val = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
-        *val = 0;
-}
-
-#ifdef STARPU_HAVE_CURAND
-static void init_cuda_func(void *descr[], void *cl_arg)
-{
-        unsigned long *val = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
-        cudaMemset(val, 0, sizeof(unsigned long));
-        cudaThreadSynchronize();
-}
-#endif
-
-static struct starpu_codelet_t init_codelet = {
-	.where =
-#ifdef STARPU_HAVE_CURAND
-		STARPU_CUDA|
-#endif
-		STARPU_CPU,
-        .cpu_func = init_cpu_func,
-#ifdef STARPU_HAVE_CURAND
-        .cuda_func = init_cuda_func,
-#endif
-        .nbuffers = 1
-};
-
-#ifdef STARPU_HAVE_CURAND
-/* Dummy implementation of the addition of two unsigned longs in CUDA */
-static void redux_cuda_func(void *descr[], void *cl_arg)
-{
-	unsigned long *d_a = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	unsigned long *d_b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
-
-	unsigned long h_a, h_b;
-	
-	cudaMemcpy(&h_a, d_a, sizeof(h_a), cudaMemcpyDeviceToHost);
-	cudaMemcpy(&h_b, d_b, sizeof(h_b), cudaMemcpyDeviceToHost);
-
-	h_a += h_b;
-
-	cudaMemcpy(d_a, &h_a, sizeof(h_a), cudaMemcpyHostToDevice);
-};
-#endif
-
-static void redux_cpu_func(void *descr[], void *cl_arg)
-{
-	unsigned long *a = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	unsigned long *b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
-
-	*a = *a + *b;
-};
-
-static struct starpu_codelet_t redux_codelet = {
-	.where =
-#ifdef STARPU_HAVE_CURAND
-		STARPU_CUDA|
-#endif
-		STARPU_CPU,
-	.cpu_func = redux_cpu_func,
-#ifdef STARPU_HAVE_CURAND
-	.cuda_func = redux_cuda_func,
-#endif
-	.nbuffers = 2
-};
-
-/*
- *	Main program
- */
-
-int main(int argc, char **argv)
-{
-	unsigned i;
-
-	parse_args(argc, argv);
-
-	starpu_init(NULL);
-
-	/* Launch a Random Number Generator (RNG) on each worker */
-	starpu_execute_on_each_worker(init_rng, NULL, STARPU_CPU|STARPU_CUDA);
-
-	/* Create a scratchpad data */
-	starpu_data_handle xy_scratchpad_handle;
-	starpu_vector_data_register(&xy_scratchpad_handle, -1, (uintptr_t)NULL,
-		2*NSHOT_PER_TASK, sizeof(float));
-
-	/* Create a variable that will be used to count the number of shots
-	 * that actually hit the unit circle when shooting randomly in
-	 * [-1,1]^2. */
-	unsigned long shot_cnt = 0;
-	starpu_data_handle shot_cnt_handle;
-	starpu_variable_data_register(&shot_cnt_handle, 0,
-			(uintptr_t)&shot_cnt, sizeof(shot_cnt));
-
-	starpu_data_set_reduction_methods(shot_cnt_handle,
-					&redux_codelet, &init_codelet);
-
-	struct timeval start;
-	struct timeval end;
-
-	gettimeofday(&start, NULL);
-
-	for (i = 0; i < ntasks; i++)
-	{
-		struct starpu_task *task = starpu_task_create();
-
-		task->cl = &pi_cl;
-
-		task->buffers[0].handle = xy_scratchpad_handle;
-		task->buffers[0].mode   = STARPU_SCRATCH;
-		task->buffers[1].handle = shot_cnt_handle;
-		task->buffers[1].mode   = STARPU_REDUX;
-
-		int ret = starpu_task_submit(task);
-		STARPU_ASSERT(!ret);
-	}
-
-	starpu_data_acquire(shot_cnt_handle, STARPU_R);
-
-	gettimeofday(&end, NULL);
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4,
-	 * probability to impact the disk: pi/4 */
-	unsigned long total = ntasks*NSHOT_PER_TASK;
-	double pi_approx = ((double)shot_cnt*4.0)/total;
-
-	starpu_data_release(shot_cnt_handle);
-
-
-	fprintf(stderr, "Pi approximation : %lf (%ld / %ld)\n", pi_approx, shot_cnt, total);
-	fprintf(stderr, "Error %le \n", pi_approx - PI);
-	fprintf(stderr, "Total time : %f ms\n", timing/1000.0);
-	fprintf(stderr, "Speed : %f GShot/s\n", total/(1e3*timing));
-
-	starpu_data_unregister(shot_cnt_handle);
-	starpu_shutdown();
-
-	if (abs(pi_approx - PI) > 1.0)
-		return 1;
-
-	return 0;
-}

+ 0 - 128
examples/pi/pi_redux_kernel.cu

@@ -1,128 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <starpu.h>
-#include <starpu_cuda.h>
-
-#define MAXNBLOCKS	128
-#define MAXTHREADSPERBLOCK	256
-
-static __global__ void monte_carlo(float *x, float *y, unsigned n, unsigned long *output_cnt)
-{
-	__shared__ unsigned scnt[MAXTHREADSPERBLOCK];
-
-	/* Do we have a successful shot ? */
-	const int tid = threadIdx.x + blockIdx.x*blockDim.x;
-
-	const int nthreads = gridDim.x * blockDim.x;
-
-	/* Blank the shared mem buffer */
-	if (threadIdx.x < MAXTHREADSPERBLOCK)
-		scnt[threadIdx.x] = 0;
-
-	__syncthreads();
-	int ind;
-	for (ind = tid; ind < n; ind += nthreads)
-	{ 
-		float xval = (2.0f * x[ind] - 1.0f);
-		float yval = (2.0f * y[ind] - 1.0f);
-		float dist = (xval*xval + yval*yval);
-
-		unsigned long success = (dist <= 1.0f)?1:0;
-
-		scnt[threadIdx.x] += success;
-
-	}
-
-	__syncthreads();
-
-	/* Perform a reduction to compute the sum on each thread within that block */
-
-	/* NB: We assume that the number of threads per block is a power of 2 ! */
-	unsigned long s;
-	for (s = blockDim.x/2; s!=0; s>>=1)
-	{
-		if (threadIdx.x < s)
-			scnt[threadIdx.x] += scnt[threadIdx.x + s];
-
-		__syncthreads();
-	}
-
-	/* report the number of successful shots in the block */
-	if (threadIdx.x == 0)
-		output_cnt[blockIdx.x] = scnt[0];
-
-	__syncthreads();
-}
-
-static __global__ void sum_per_block_cnt(unsigned long *output_cnt, unsigned long *cnt)
-{
-	__shared__ unsigned long accumulator[MAXNBLOCKS];
-
-	unsigned i;
-
-	/* Load the values from global mem */
-	for (i = 0; i < blockDim.x; i++)
-		accumulator[i] = output_cnt[i];
-
-	__syncthreads();
-
-	/* Perform a reduction in shared memory */
-	unsigned s;
-	for (s = blockDim.x/2; s!=0; s>>=1)
-	{
-		if (threadIdx.x < s)
-			accumulator[threadIdx.x] += accumulator[threadIdx.x + s];
-
-		__syncthreads();
-	}
-
-	/* Save the result in global memory */
-	if (threadIdx.x == 0)
-		*cnt = *cnt + accumulator[0];
-}
-
-extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned long *shot_cnt)
-{
-	cudaError_t cures;
-
-	/* How many blocks do we use ? */ 
-	unsigned nblocks = 128; // TODO
-	STARPU_ASSERT(nblocks <= MAXNBLOCKS);
-	STARPU_ASSERT((n % nblocks) == 0);
-	
-	unsigned long *per_block_cnt;
-	cudaMalloc((void **)&per_block_cnt, nblocks*sizeof(unsigned long));
-
-	/* How many threads per block ? At most 256, but no more threads than
-	 * there are entries to process per block. */
-	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (n / nblocks));
-
-	/* each entry of per_block_cnt contains the number of successful shots
-	 * in the corresponding block. */
-	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
-
-	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
-
-	/* compute the total number of successful shots by adding the elements
-	 * of the per_block_cnt array */
-	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
-	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
-	if (cures)
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	cudaFree(per_block_cnt);
-}

+ 0 - 350
examples/spmv/dw_spmv.c

@@ -1,350 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*
- * Conjugate gradients for Sparse matrices
- */
-
-#include "dw_spmv.h"
-
-#ifdef STARPU_USE_CUDA
-extern void spmv_kernel_cuda(void *descr[], void *args);
-#endif
-
-struct timeval start;
-struct timeval end;
-
-#ifdef STARPU_USE_OPENCL
-#include "starpu_opencl.h"
-struct starpu_opencl_program opencl_codelet;
-void spmv_kernel_opencl(void *descr[], void *args)
-{
-	cl_kernel kernel;
-	cl_command_queue queue;
-	cl_event event;
-	int id, devid, err, n;
-
-	uint32_t nnz = STARPU_CSR_GET_NNZ(descr[0]);
-	uint32_t nrow = STARPU_CSR_GET_NROW(descr[0]);
-	float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]);
-	uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
-	uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
-	uint32_t firstentry = STARPU_CSR_GET_FIRSTENTRY(descr[0]);
-
-	float *vecin = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
-	uint32_t nx_in = STARPU_VECTOR_GET_NX(descr[1]);
-
-	float *vecout = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
-	uint32_t nx_out = STARPU_VECTOR_GET_NX(descr[2]);
-
-        id = starpu_worker_get_id();
-        devid = starpu_worker_get_devid(id);
-
-        err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_codelet, "spvm", devid);
-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-
-	err = 0;
-        n=0;
-	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nnz);
-	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nrow);
-	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &nzval);
-	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &colind);
-	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &rowptr);
-	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &firstentry);
-	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &vecin);
-	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nx_in);
-	err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &vecout);
-	err = clSetKernelArg(kernel, n++, sizeof(uint32_t), &nx_out);
-        if (err) STARPU_OPENCL_REPORT_ERROR(err);
-
-	{
-                size_t global=1024;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
-		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
-
-        starpu_opencl_release_kernel(kernel);
-}
-#endif
-
-unsigned nblocks = 4;
-uint32_t size = 4*1024*1024;
-
-starpu_data_handle sparse_matrix;
-starpu_data_handle vector_in, vector_out;
-
-float *sparse_matrix_nzval;
-uint32_t *sparse_matrix_colind;
-uint32_t *sparse_matrix_rowptr;
-
-float *vector_in_ptr;
-float *vector_out_ptr;
-
-static void parse_args(int argc, char **argv)
-{
-	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-size") == 0) {
-			char *argptr;
-			size = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-nblocks") == 0) {
-			char *argptr;
-			nblocks = strtol(argv[++i], &argptr, 10);
-		}
-	}
-}
-
-static void cpu_spmv(void *descr[], __attribute__((unused))  void *arg)
-{
-	float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]);
-	uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
-	uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
-
-	float *vecin = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
-	float *vecout = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
-
-	uint32_t firstelem = STARPU_CSR_GET_FIRSTENTRY(descr[0]);
-
-	uint32_t nnz;
-	uint32_t nrow;
-
-	nnz = STARPU_CSR_GET_NNZ(descr[0]);
-	nrow = STARPU_CSR_GET_NROW(descr[0]);
-
-	//STARPU_ASSERT(nrow == STARPU_VECTOR_GET_NX(descr[1]));
-	STARPU_ASSERT(nrow == STARPU_VECTOR_GET_NX(descr[2]));
-
-	unsigned row;
-	for (row = 0; row < nrow; row++)
-	{
-		float tmp = 0.0f;
-		unsigned index;
-
-		unsigned firstindex = rowptr[row] - firstelem;
-		unsigned lastindex = rowptr[row+1] - firstelem;
-
-		for (index = firstindex; index < lastindex; index++)
-		{
-			unsigned col;
-
-			col = colind[index];
-			tmp += nzval[index]*vecin[col];
-		}
-
-		vecout[row] = tmp;
-	}
-
-}
-
-static void create_data(void)
-{
-	/* we need a sparse symetric (definite positive ?) matrix and a "dense" vector */
-	
-	/* example of 3-band matrix */
-	float *nzval;
-	uint32_t nnz;
-	uint32_t *colind;
-	uint32_t *rowptr;
-
-	nnz = 3*size-2;
-
-	nzval = malloc(nnz*sizeof(float));
-	colind = malloc(nnz*sizeof(uint32_t));
-	rowptr = malloc((size+1)*sizeof(uint32_t));
-
-	assert(nzval);
-	assert(colind);
-	assert(rowptr);
-
-	/* fill the matrix */
-	unsigned row;
-	unsigned pos = 0;
-	for (row = 0; row < size; row++)
-	{
-		rowptr[row] = pos;
-
-		if (row > 0) {
-			nzval[pos] = 1.0f;
-			colind[pos] = row-1;
-			pos++;
-		}
-		
-		nzval[pos] = 5.0f;
-		colind[pos] = row;
-		pos++;
-
-		if (row < size - 1) {
-			nzval[pos] = 1.0f;
-			colind[pos] = row+1;
-			pos++;
-		}
-	}
-
-	STARPU_ASSERT(pos == nnz);
-
-	rowptr[size] = nnz;
-	
-	starpu_csr_data_register(&sparse_matrix, 0, nnz, size, (uintptr_t)nzval, colind, rowptr, 0, sizeof(float));
-
-	sparse_matrix_nzval = nzval;
-	sparse_matrix_colind = colind;
-	sparse_matrix_rowptr = rowptr;
-
-	/* initiate the 2 vectors */
-	float *invec, *outvec;
-	invec = malloc(size*sizeof(float));
-	assert(invec);
-
-	outvec = malloc(size*sizeof(float));
-	assert(outvec);
-
-	/* fill those */
-	unsigned ind;
-	for (ind = 0; ind < size; ind++)
-	{
-		invec[ind] = 2.0f;
-		outvec[ind] = 0.0f;
-	}
-
-	starpu_vector_data_register(&vector_in, 0, (uintptr_t)invec, size, sizeof(float));
-	starpu_vector_data_register(&vector_out, 0, (uintptr_t)outvec, size, sizeof(float));
-
-	vector_in_ptr = invec;
-	vector_out_ptr = outvec;
-
-}
-
-void call_spmv_codelet_filters(void)
-{
-
-	/* partition the data along a block distribution */
-	struct starpu_data_filter csr_f, vector_f;
-	csr_f.filter_func = starpu_vertical_block_filter_func_csr;
-	csr_f.nchildren = nblocks;
-	csr_f.get_nchildren = NULL;
-	/* the children also use a csr interface */
-	csr_f.get_child_ops = NULL;
-
-	vector_f.filter_func = starpu_block_filter_func_vector;
-	vector_f.nchildren = nblocks;
-	vector_f.get_nchildren = NULL;
-	vector_f.get_child_ops = NULL;
-
-	starpu_data_partition(sparse_matrix, &csr_f);
-	starpu_data_partition(vector_out, &vector_f);
-
-#ifdef STARPU_USE_OPENCL
-        {
-                int ret = starpu_opencl_load_opencl_from_file("examples/spmv/spmv_opencl.cl", &opencl_codelet);
-                if (ret)
-		{
-			fprintf(stderr, "Failed to compile OpenCL codelet\n");
-			exit(ret);
-		}
-        }
-#endif
-
-	starpu_codelet cl = {};
-
-	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
-	cl.cpu_func =  cpu_spmv;
-#ifdef STARPU_USE_CUDA
-	cl.cuda_func = spmv_kernel_cuda;
-#endif
-#ifdef STARPU_USE_OPENCL
-        cl.opencl_func = spmv_kernel_opencl;
-#endif
-	cl.nbuffers = 3;
-	cl.model = NULL;
-
-	gettimeofday(&start, NULL);
-
-	unsigned part;
-	for (part = 0; part < nblocks; part++)
-	{
-		struct starpu_task *task = starpu_task_create();
-                int ret;
-
-		task->callback_func = NULL;
-
-		task->cl = &cl;
-		task->cl_arg = NULL;
-	
-		task->buffers[0].handle = starpu_data_get_sub_data(sparse_matrix, 1, part);
-		task->buffers[0].mode  = STARPU_R;
-		task->buffers[1].handle = vector_in;
-		task->buffers[1].mode = STARPU_R;
-		task->buffers[2].handle = starpu_data_get_sub_data(vector_out, 1, part);
-		task->buffers[2].mode = STARPU_W;
-	
-		ret = starpu_task_submit(task);
-		if (STARPU_UNLIKELY(ret == -ENODEV))
-		{
-			fprintf(stderr, "No worker may execute this task\n");
-			exit(0);
-		}
-	}
-
-	starpu_task_wait_for_all();
-
-	gettimeofday(&end, NULL);
-
-	starpu_data_unpartition(sparse_matrix, 0);
-	starpu_data_unpartition(vector_out, 0);
-}
-
-static void print_results(void)
-{
-	unsigned row;
-
-	for (row = 0; row < STARPU_MIN(size, 16); row++)
-	{
-		printf("%2.2f\t%2.2f\n", vector_in_ptr[row], vector_out_ptr[row]);
-	}
-}
-
-int main(__attribute__ ((unused)) int argc,
-	__attribute__ ((unused)) char **argv)
-{
-	parse_args(argc, argv);
-
-	/* start the runtime */
-	starpu_init(NULL);
-
-	/* create the sparse input matrix */
-	create_data();
-
-	/* create a new codelet that will perform a SpMV on it */
-	call_spmv_codelet_filters();
-
-	starpu_shutdown();
-
-	print_results();
-
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
-
-	return 0;
-}

+ 0 - 32
examples/spmv/dw_spmv.h

@@ -1,32 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __DW_SPARSE_CG_H__
-#define __DW_SPARSE_CG_H__
-
-#include <sys/types.h>
-#include <semaphore.h>
-#include <string.h>
-#include <stdint.h>
-#include <math.h>
-#include <sys/time.h>
-#include <pthread.h>
-#include <signal.h>
-
-#include <starpu.h>
-
-#endif // __DW_SPARSE_CG_H__