Selaa lähdekoodia

- Rather than generating the random numbers on the host (which introduces a
major bottleneck), we generate the random numbers directly in the codelet.
- We use the Sobol quasi-random number generator included in the NVIDIA
3.0-beta SDK (SobolQRNG).

Cédric Augonnet 15 vuotta sitten
vanhempi
commit
6b2bcd48a5

+ 10 - 2
examples/Makefile.am

@@ -98,6 +98,11 @@ noinst_HEADERS = 				\
 	lu/xlu_kernels.h			\
 	lu/float.h				\
 	lu/double.h				\
+	pi/pi.h					\
+	pi/SobolQRNG/sobol.h			\
+	pi/SobolQRNG/sobol_gold.h		\
+	pi/SobolQRNG/sobol_gpu.h		\
+	pi/SobolQRNG/sobol_primitives.h		\
 	cholesky/dw_cholesky_models.h		\
 	cholesky/dw_cholesky.h			\
 	common/blas_model.h			\
@@ -163,11 +168,14 @@ examplebin_PROGRAMS +=				\
 	pi/pi
 
 pi_pi_SOURCES =					\
-	pi/pi.c
+	pi/pi.c					\
+	pi/SobolQRNG/sobol_gold.c		\
+	pi/SobolQRNG/sobol_primitives.c
 
 if STARPU_USE_CUDA
 pi_pi_SOURCES +=				\
-	pi/pi_kernel.cu
+	pi/pi_kernel.cu				\
+	pi/SobolQRNG/sobol_gpu.cu
 endif
 
 

Tiedoston diff-näkymää rajattu, sillä se on liian suuri
+ 50 - 0
examples/pi/SobolQRNG/CforCUDA_SDK_license.txt


+ 45 - 0
examples/pi/SobolQRNG/sobol.h

@@ -0,0 +1,45 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ */
+
+#ifndef SOBOL_H
+#define SOBOL_H
+
+// Number of direction vectors is fixed to 32
+#define n_directions 32
+
+#endif

+ 126 - 0
examples/pi/SobolQRNG/sobol_gold.c

@@ -0,0 +1,126 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+
+#include "sobol.h"
+#include "sobol_gold.h"
+#include "sobol_primitives.h"
+
+#define k_2powneg32 2.3283064E-10F
+
+// Create the direction numbers, based on the primitive polynomials.
+//
+//  n_dimensions  number of dimensions to initialise
+//  directions    output array; receives n_directions words per dimension,
+//                stored dimension after dimension (Q0.32 format)
+//
+// Dimension 0 uses m[i] = 1 for all i; every other dimension is derived
+// from the entry sobol_primitives[dim].
+void initSobolDirectionVectors(int n_dimensions, unsigned int *directions)
+{
+    unsigned int *v = directions;
+
+    int dim;
+    for (dim = 0 ; dim < n_dimensions ; dim++)
+    {
+        int i;
+
+        // First dimension is a special case
+        if (dim == 0)
+        {
+            for (i = 0 ; i < n_directions ; i++)
+            {
+                // All m's are 1.
+                // The constant has to be unsigned: for i == 0 the shift
+                // count is 31, and shifting a signed 1 into the sign bit
+                // is undefined behaviour in C.
+                v[i] = 1U << (31 - i);
+            }
+        }
+        else
+        {
+            const struct primitive *poly = &sobol_primitives[dim];
+            int d = poly->degree;
+
+            // The first direction numbers (up to the degree of the polynomial)
+            // are simply v[i] = m[i] / 2^i (stored in Q0.32 format)
+            for (i = 0 ; i < d ; i++)
+            {
+                v[i] = poly->m[i] << (31 - i);
+            }
+            // The remaining direction numbers are computed as described in
+            // the Bratley and Fox paper.
+            // v[i] = a[1]v[i-1] ^ a[2]v[i-2] ^ ... ^ a[d-1]v[i-d+1] ^ v[i-d] ^ v[i-d]/2^d
+            for (i = d ; i < n_directions ; i++)
+            {
+                // First do the v[i-d] ^ v[i-d]/2^d part
+                v[i] = v[i - d] ^ (v[i - d] >> d);
+                // Now do the a[1]v[i-1] ^ a[2]v[i-2] ^ ... part
+                // Note that the coefficients a[] are zero or one and for compactness in
+                // the input tables they are stored as bits of a single integer. To extract
+                // the relevant bit we use right shift and mask with 1: a[j] is bit
+                // (d - 1 - j) of the word, so a[1] is the most significant of the
+                // d - 1 bits read here and a[d-1] is the LSB.
+                // For example, for a 10 degree polynomial, to get a[2] we need to
+                // right shift 7 times (to get the 8th bit into the LSB) and then
+                // mask with 1.
+                int j;
+                for (j = 1 ; j < d ; j++)
+                {
+                    v[i] ^= (((poly->a >> (d - 1 - j)) & 1) * v[i - j]);
+                }
+            }
+        }
+        // Advance to the slot of the next dimension
+        v += n_directions;
+    }
+}
+
+// Reference model for generating Sobol numbers on the host.
+//
+//  n_vectors     number of points to generate in each dimension
+//  n_dimensions  number of dimensions
+//  directions    direction numbers, n_directions words per dimension
+//                (as filled in by initSobolDirectionVectors)
+//  output        receives n_vectors floats per dimension; all the draws of
+//                one dimension are contiguous, followed by the next dimension
+//
+// Does nothing when n_vectors is not positive.
+void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output)
+{
+    // Without this guard the x[0] store below would write outside of
+    // an empty output buffer.
+    if (n_vectors <= 0)
+    {
+        return;
+    }
+
+    unsigned int *v = directions;
+
+    int d;
+    for (d = 0 ; d < n_dimensions ; d++)
+    {
+        // Start of the outputs of this dimension; the offset is computed in
+        // size_t so that n_vectors * d cannot overflow an int.
+        float *out = output + (size_t)n_vectors * d;
+
+        unsigned int X = 0;
+        // x[0] is zero (in all dimensions)
+        out[0] = 0.0f;
+        int i;
+        for (i = 1 ; i < n_vectors ; i++)
+        {
+            // x[i] = x[i-1] ^ v[c]
+            //  where c is the index of the rightmost zero bit in i - 1
+            //  (ffs counts from one, hence the "- 1" for C array indexing)
+            // In the Bratley and Fox paper this is equation (**)
+            X ^= v[ffs(~(i - 1)) - 1];
+            out[i] = (float)X * k_2powneg32;
+        }
+        v += n_directions;
+    }
+}

+ 46 - 0
examples/pi/SobolQRNG/sobol_gold.h

@@ -0,0 +1,46 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ *
+ */
+
+#ifndef SOBOL_GOLD_H
+#define SOBOL_GOLD_H
+
+void initSobolDirectionVectors(int n_dimensions, unsigned int *directions);
+void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output);
+
+#endif

+ 168 - 0
examples/pi/SobolQRNG/sobol_gpu.cu

@@ -0,0 +1,168 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ *
+ */
+
+#include "sobol.h"
+#include "sobol_gpu.h"
+
+#define k_2powneg32 2.3283064E-10F
+
+// Generate n_vectors Sobol points for every dimension.
+//
+// Layout: blockIdx.y selects the dimension. The threads of all the blocks
+// along x share the n_vectors points of that dimension; each thread produces
+// the points i0, i0 + stride, i0 + 2 * stride, ... with
+// stride = gridDim.x * blockDim.x.
+//
+// Preconditions:
+//  - stride must be a power of two, since __ffs(stride) - 1 is used as
+//    log2(stride);
+//  - d_directions holds n_directions words per dimension and d_output has
+//    room for n_vectors floats per dimension, dimension after dimension.
+//
+// n_dimensions is not read by the kernel. The kernel uses n_directions words
+// of static shared memory per block.
+__global__ void sobolGPU_kernel(unsigned n_vectors, unsigned n_dimensions, unsigned *d_directions, float *d_output)
+{
+    __shared__ unsigned int v[n_directions];
+
+    // Offset into the correct dimension as specified by the
+    // block y coordinate. The output offset is computed in size_t so
+    // that n_vectors * blockIdx.y cannot wrap around in 32 bits.
+    d_directions = d_directions + n_directions * blockIdx.y;
+    d_output = d_output + (size_t)n_vectors * blockIdx.y;
+
+    // Copy the direction numbers for this dimension into shared
+    // memory. The copy is strided by blockDim.x so that all the
+    // n_directions entries get loaded even if the block has fewer
+    // than n_directions threads.
+    for (unsigned int t = threadIdx.x ; t < n_directions ; t += blockDim.x)
+    {
+        v[t] = d_directions[t];
+    }
+    __syncthreads();
+
+    // Set initial index (i.e. which vector this thread is
+    // computing first) and stride (i.e. step to the next vector
+    // for this thread)
+    const unsigned int i0     = threadIdx.x + blockIdx.x * blockDim.x;
+    const unsigned int stride = gridDim.x * blockDim.x;
+
+    // log2(stride), computed once; exact because stride is a power of two
+    const unsigned int log2stride = __ffs(stride) - 1;
+
+    // Get the gray code of the index
+    // c.f. Numerical Recipes in C, chapter 20
+    // http://www.nrbook.com/a/bookcpdf/c20-2.pdf
+    unsigned int g = i0 ^ (i0 >> 1);
+
+    // Initialisation for first point x[i0]
+    // In the Bratley and Fox paper this is equation (*), where
+    // we are computing the value for x[n] without knowing the
+    // value of x[n-1].
+    unsigned int X = 0;
+    for (unsigned int k = 0 ; k < log2stride ; k++)
+    {
+        // We want X ^= g_k * v[k], where g_k is one or zero.
+        // We do this by setting a mask with all bits equal to
+        // g_k. In reality we keep shifting g so that g_k is the
+        // LSB of g. This way we avoid multiplication.
+        const unsigned int mask = - (g & 1);
+        X ^= mask & v[k];
+        g = g >> 1;
+    }
+    if (i0 < n_vectors)
+    {
+        d_output[i0] = (float)X * k_2powneg32;
+    }
+
+    // Now do rest of points, using the stride
+    // Here we want to generate x[i] from x[i-stride] where we
+    // don't have any of the x in between, therefore we have to
+    // revisit the equation (**), this is easiest with an example
+    // so assume stride is 16.
+    // From x[n] to x[n+16] there will be:
+    //   8 changes in the first bit
+    //   4 changes in the second bit
+    //   2 changes in the third bit
+    //   1 change in the fourth
+    //   1 change in one of the remaining bits
+    //
+    // What this means is that in the equation:
+    //   x[n+1] = x[n] ^ v[p]
+    //   x[n+2] = x[n+1] ^ v[q] = x[n] ^ v[p] ^ v[q]
+    //   ...
+    // We will apply xor with v[1] eight times, v[2] four times,
+    // v[3] twice, v[4] once and one other direction number once.
+    // Since two xors cancel out, we can skip even applications
+    // and just apply xor with v[4] (i.e. log2(16)) and with
+    // the current applicable direction number.
+    // Note that all these indices count from 1, so we need to
+    // subtract 1 from them all to account for C arrays counting
+    // from zero.
+    // With a stride of 1 there is no "log2(stride)" term at all, so
+    // nothing is applied instead of reading v[-1].
+    const unsigned int v_log2stridem1 = (log2stride > 0) ? v[log2stride - 1] : 0;
+    const unsigned int v_stridemask = stride - 1;
+    for (unsigned int i = i0 + stride ; i < n_vectors ; i += stride)
+    {
+        // x[i] = x[i-stride] ^ v[b] ^ v[c]
+        //  where b is log2(stride) minus 1 for C array indexing
+        //  where c is the index of the rightmost zero bit in i - stride,
+        //  not including the bottom log2(stride) bits, minus 1
+        //  for C array indexing
+        // In the Bratley and Fox paper this is equation (**)
+        X ^= v_log2stridem1 ^ v[__ffs(~((i - stride) | v_stridemask)) - 1];
+        d_output[i] = (float)X * k_2powneg32;
+    }
+}
+
+// Host-side launcher for sobolGPU_kernel.
+//
+//  n_vectors     number of points to generate in each dimension
+//  n_dimensions  number of dimensions (one row of blocks per dimension)
+//  d_directions  device pointer to n_directions words per dimension
+//  d_output      device pointer receiving n_vectors floats per dimension
+//
+// Returns without launching anything when n_vectors or n_dimensions is not
+// positive. The launch itself is not checked here: callers that need to
+// detect failures should query the CUDA error state after the call.
+extern "C"
+void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output)
+{
+    const int threadsperblock = 64;
+
+    // Nothing to generate; this also protects the division by
+    // n_dimensions below.
+    if (n_vectors <= 0 || n_dimensions <= 0)
+    {
+        return;
+    }
+
+    // Set up the execution configuration
+    dim3 dimGrid;
+    dim3 dimBlock;
+
+    // This implementation of the generator outputs all the draws for
+    // one dimension in a contiguous region of memory, followed by the
+    // next dimension and so on.
+    // Therefore all threads within a block will be processing different
+    // vectors from the same dimension. As a result we want the total
+    // number of blocks to be a multiple of the number of dimensions.
+    dimGrid.y = n_dimensions;
+
+    // If the number of dimensions is large then we will set the number
+    // of blocks to equal the number of dimensions (i.e. dimGrid.x = 1)
+    // but if the number of dimensions is small (e.g. less than 32) then
+    // we'll partition the vectors across blocks (as well as threads).
+    // We also need to cap the dimGrid.x where the number of vectors
+    // is too small to be partitioned.
+    dimGrid.x = 1 + 31 / n_dimensions;
+    if (dimGrid.x > (unsigned int)(n_vectors / threadsperblock))
+    {
+        dimGrid.x = (n_vectors + threadsperblock - 1) / threadsperblock;
+    }
+
+    // The kernel uses __ffs(stride) - 1 as log2(stride), where
+    // stride = dimGrid.x * threadsperblock, so the stride has to be a
+    // power of two. threadsperblock already is one; round the number of
+    // blocks up to the next power of two as well (e.g. 3 dimensions would
+    // otherwise give 11 blocks). Threads beyond n_vectors simply have
+    // nothing to write.
+    const unsigned int wantedBlocks = dimGrid.x;
+    for (dimGrid.x = 1 ; dimGrid.x < wantedBlocks ; dimGrid.x *= 2)
+    {
+    }
+
+    // Fix the number of threads
+    dimBlock.x = threadsperblock;
+
+    // Execute GPU kernel
+    sobolGPU_kernel<<<dimGrid, dimBlock>>>(n_vectors, n_dimensions, d_directions, d_output);
+}

+ 46 - 0
examples/pi/SobolQRNG/sobol_gpu.h

@@ -0,0 +1,46 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ *
+ */
+
+#ifndef SOBOL_GPU_H
+#define SOBOL_GPU_H
+
+extern "C"
+void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output);
+
+#endif

Tiedoston diff-näkymää rajattu, sillä se on liian suuri
+ 10256 - 0
examples/pi/SobolQRNG/sobol_primitives.c


+ 60 - 0
examples/pi/SobolQRNG/sobol_primitives.h

@@ -0,0 +1,60 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ *
+ */
+
+#ifndef SOBOL_PRIMITIVES_H
+#define SOBOL_PRIMITIVES_H
+
+#define max_m 17
+
+// Description of one primitive polynomial, one entry per Sobol dimension:
+//  dimension is the dimension number of the polynomial (unused)
+//  degree is the degree of the polynomial
+//  a is a binary word holding the polynomial coefficients, one bit each
+//  m is the array of m values (at most max_m of them)
+struct primitive
+{
+    unsigned int dimension;   // dimension number (unused)
+    unsigned int degree;      // number of leading m[] entries used directly
+    unsigned int a;           // coefficient bits, extracted by shift and mask
+    unsigned int m[max_m];    // m[i] << (31 - i) gives the i-th direction number
+};
+
+extern const struct primitive sobol_primitives[];
+
+#endif

+ 34 - 35
examples/pi/pi.c

@@ -14,6 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include "SobolQRNG/sobol.h"
 #include "pi.h"
 
 #ifdef STARPU_USE_CUDA
@@ -22,9 +23,14 @@ void cuda_kernel(void **descr, void *cl_arg);
 
 static void cpu_kernel(void *descr[], void *cl_arg)
 {
-	TYPE *random_numbers_x = (TYPE *)STARPU_GET_VECTOR_PTR(descr[0]);
-	TYPE *random_numbers_y = (TYPE *)STARPU_GET_VECTOR_PTR(descr[1]);
-	unsigned nx = STARPU_GET_VECTOR_NX(descr[0]);
+	unsigned *directions = (unsigned *)STARPU_GET_VECTOR_PTR(descr[0]);
+	unsigned nx = NSHOT_PER_TASK;
+
+	TYPE *random_numbers = malloc(2*nx*sizeof(TYPE));
+	sobolCPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
+
+	TYPE *random_numbers_x = &random_numbers[0];
+	TYPE *random_numbers_y = &random_numbers[nx];
 
 	unsigned current_cnt = 0;
 
@@ -40,42 +46,33 @@ static void cpu_kernel(void *descr[], void *cl_arg)
 		current_cnt += success;
 	}
 
-	unsigned *cnt = (unsigned *)STARPU_GET_VECTOR_PTR(descr[2]);
+	unsigned *cnt = (unsigned *)STARPU_GET_VECTOR_PTR(descr[1]);
 	*cnt = current_cnt;
+
+	free(random_numbers);
 }
 
+
+
 int main(int argc, char **argv)
 {
 	unsigned i;
 
 	starpu_init(NULL);
 
-	TYPE *random_array_x;
-	starpu_malloc_pinned_if_possible((void **)&random_array_x, SIZE*sizeof(TYPE));
-	STARPU_ASSERT(random_array_x);
-
-	TYPE *random_array_y;
-	starpu_malloc_pinned_if_possible((void **)&random_array_y, SIZE*sizeof(TYPE));
-	STARPU_ASSERT(random_array_y);
-
-	unsigned *cnt_array;
-	starpu_malloc_pinned_if_possible((void **)&cnt_array, NTASKS*sizeof(unsigned));
-	STARPU_ASSERT(cnt_array);
-
-	/* First generate an array of random numbers */
-	for (i = 0; i < SIZE; i++)
-	{
-		random_array_x[i] = (((TYPE)rand()/(TYPE)RAND_MAX)*2.0 - 1.0);
-		random_array_y[i] = (((TYPE)rand()/(TYPE)RAND_MAX)*2.0 - 1.0);
-	}
+	/* Initialize the random number generator */
+	unsigned *sobol_qrng_directions = malloc(n_dimensions*n_directions*sizeof(unsigned));
+	STARPU_ASSERT(sobol_qrng_directions);
 
-	/* Register the entire array */
-	starpu_data_handle random_array_handle_x;
-	starpu_register_vector_data(&random_array_handle_x, 0, (uintptr_t)random_array_x, SIZE, sizeof(TYPE));
+	initSobolDirectionVectors(n_dimensions, sobol_qrng_directions);
 
-	starpu_data_handle random_array_handle_y;
-	starpu_register_vector_data(&random_array_handle_y, 0, (uintptr_t)random_array_y, SIZE, sizeof(TYPE));
+	/* Any worker may use that array now */
+	starpu_data_handle sobol_qrng_direction_handle;
+	starpu_register_vector_data(&sobol_qrng_direction_handle, 0,
+		(uintptr_t)sobol_qrng_directions, n_dimensions*n_directions, sizeof(unsigned));
 
+	unsigned *cnt_array = malloc(NTASKS*sizeof(unsigned));
+	STARPU_ASSERT(cnt_array);
 	starpu_data_handle cnt_array_handle;
 	starpu_register_vector_data(&cnt_array_handle, 0, (uintptr_t)cnt_array, NTASKS, sizeof(unsigned));
 
@@ -86,8 +83,10 @@ int main(int argc, char **argv)
 		.filter_arg = NTASKS
 	};
 	
+#if 0
 	starpu_partition_data(random_array_handle_x, &f);
 	starpu_partition_data(random_array_handle_y, &f);
+#endif
 	starpu_partition_data(cnt_array_handle, &f);
 
 	static struct starpu_perfmodel_t model = {
@@ -101,7 +100,7 @@ int main(int argc, char **argv)
 #ifdef STARPU_USE_CUDA
 		.cuda_func = cuda_kernel,
 #endif
-		.nbuffers = 3,
+		.nbuffers = 2,
 		.model = &model
 	};
 
@@ -111,12 +110,12 @@ int main(int argc, char **argv)
 
 		task->cl = &cl;
 
-		task->buffers[0].handle = starpu_get_sub_data(random_array_handle_x, 1, i);
+		STARPU_ASSERT(starpu_get_sub_data(cnt_array_handle, 1, i));
+
+		task->buffers[0].handle = sobol_qrng_direction_handle;
 		task->buffers[0].mode   = STARPU_R;
-		task->buffers[1].handle = starpu_get_sub_data(random_array_handle_y, 1, i);
-		task->buffers[1].mode   = STARPU_R;
-		task->buffers[2].handle = starpu_get_sub_data(cnt_array_handle, 1, i);
-		task->buffers[2].mode   = STARPU_W;
+		task->buffers[1].handle = starpu_get_sub_data(cnt_array_handle, 1, i);
+		task->buffers[1].mode   = STARPU_W;
 
 		int ret = starpu_submit_task(task);
 		STARPU_ASSERT(!ret);
@@ -129,7 +128,7 @@ int main(int argc, char **argv)
 	starpu_sync_data_with_mem(cnt_array_handle, STARPU_RW);
 
 	/* Count the total number of entries */
-	unsigned total_cnt = 0;
+	unsigned long total_cnt = 0;
 	for (i = 0; i < NTASKS; i++)
 		total_cnt += cnt_array[i];
 
@@ -139,7 +138,7 @@ int main(int argc, char **argv)
 
 	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4, probability to impact the disk: pi/4 */
 
-	fprintf(stderr, "Pi approximation : %f (%d / %d)\n", ((TYPE)total_cnt*4)/(SIZE), total_cnt, SIZE);
+	fprintf(stderr, "Pi approximation : %f (%ld / %ld)\n", ((TYPE)total_cnt*4)/(SIZE), total_cnt, SIZE);
 
 	return 0;
 }

+ 3 - 1
examples/pi/pi.h

@@ -20,7 +20,7 @@
 #include <starpu.h>
 #include <stdio.h>
 
-#define NTASKS	(64ULL)
+#define NTASKS	(16384ULL)
 #define NSHOT_PER_TASK	(16*1024*1024ULL)
 
 #define SIZE	(NTASKS*NSHOT_PER_TASK)
@@ -29,4 +29,6 @@
 
 //extern "C" void cuda_kernel(void *descr[], void *cl_arg);
 
+static int n_dimensions = 100;
+
 #endif // __PI_H__

+ 16 - 4
examples/pi/pi_kernel.cu

@@ -14,6 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include "SobolQRNG/sobol_gpu.h"
 #include "pi.h"
 
 #define MAXNBLOCKS	128
@@ -99,11 +100,21 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 {
 	cudaError_t cures;
 
-	TYPE *random_numbers_x = (TYPE *)STARPU_GET_VECTOR_PTR(descr[0]);
-	TYPE *random_numbers_y = (TYPE *)STARPU_GET_VECTOR_PTR(descr[1]);
-	unsigned nx = STARPU_GET_VECTOR_NX(descr[0]);
+	unsigned *directions = (unsigned *)STARPU_GET_VECTOR_PTR(descr[0]);
+	unsigned nx = NSHOT_PER_TASK;
 
-	unsigned *cnt = (unsigned *)STARPU_GET_VECTOR_PTR(descr[2]);
+	/* Generate Random numbers */
+	float *random_numbers;
+	cudaMalloc((void **)&random_numbers, 2*nx*sizeof(float));
+	STARPU_ASSERT(random_numbers);
+	
+	sobolGPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
+	cudaThreadSynchronize();
+
+	TYPE *random_numbers_x = &random_numbers[0];
+	TYPE *random_numbers_y = &random_numbers[nx];
+
+	unsigned *cnt = (unsigned *)STARPU_GET_VECTOR_PTR(descr[1]);
 
 	/* How many blocks do we use ? */ 
 	unsigned nblocks = 128; // TODO
@@ -133,4 +144,5 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 		STARPU_CUDA_REPORT_ERROR(cures);
 
 	cudaFree(per_block_cnt);
+	cudaFree(random_numbers);
 }