15 vuotta sitten · 6b2bcd48a5
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -98,6 +98,11 @@ noinst_HEADERS = 				\
 
				 	lu/xlu_kernels.h			\
			
 
				 	lu/float.h				\
			
 
				 	lu/double.h				\
			
 
				+	pi/pi.h					\
			
 
				+	pi/SobolQRNG/sobol.h			\
			
 
				+	pi/SobolQRNG/sobol_gold.h		\
			
 
				+	pi/SobolQRNG/sobol_gpu.h		\
			
 
				+	pi/SobolQRNG/sobol_primitives.h		\
			
 
				 	cholesky/dw_cholesky_models.h		\
			
 
				 	cholesky/dw_cholesky.h			\
			
 
				 	common/blas_model.h			\
			
@@ -163,11 +168,14 @@ examplebin_PROGRAMS +=				\
 
				 	pi/pi
			
 
				 
			
 
				 pi_pi_SOURCES =					\
			
 
				-	pi/pi.c
			
 
				+	pi/pi.c					\
			
 
				+	pi/SobolQRNG/sobol_gold.c		\
			
 
				+	pi/SobolQRNG/sobol_primitives.c
			
 
				 
			
 
				 if STARPU_USE_CUDA
			
 
				 pi_pi_SOURCES +=				\
			
 
				-	pi/pi_kernel.cu
			
 
				+	pi/pi_kernel.cu				\
			
 
				+	pi/SobolQRNG/sobol_gpu.cu
			
 
				 endif
			
 
				 
			
 
				 
			
--- a/examples/pi/SobolQRNG/CforCUDA_SDK_license.txt
+++ b/examples/pi/SobolQRNG/CforCUDA_SDK_license.txt
--- a/examples/pi/SobolQRNG/sobol.h
+++ b/examples/pi/SobolQRNG/sobol.h
@@ -0,0 +1,45 @@
 
				+/*
			
 
				+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ *
			
 
				+ * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				+ * proprietary rights in and to this software and related documentation and 
			
 
				+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				+ * of this software and related documentation without an express license 
			
 
				+ * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				+ * 
			
 
				+ */
			
 
				+ 
			
 
				+ /*
			
 
				+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				+ *
			
 
				+ * Sobol Quasi-random Number Generator example
			
 
				+ *
			
 
				+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				+ * http://people.maths.ox.ac.uk/~gilesm/
			
 
				+ *
			
 
				+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				+ * and Frances Kuo, University of New South Wales, Australia
			
 
				+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				+ *
			
 
				+ * For theoretical background see:
			
 
				+ *
			
 
				+ * P. Bratley and B.L. Fox.
			
 
				+ * Implementing Sobol's quasirandom sequence generator
			
 
				+ * http://portal.acm.org/citation.cfm?id=42288
			
 
				+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				+ *
			
 
				+ * S. Joe and F. Kuo.
			
 
				+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				+ * http://portal.acm.org/citation.cfm?id=641879
			
 
				+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				+ */
			
 
				+
			
 
				+#ifndef SOBOL_H
			
 
				+#define SOBOL_H
			
 
				+
			
 
				+// Number of direction vectors is fixed to 32
			
 
				+#define n_directions 32
			
 
				+
			
 
				+#endif
			
--- a/examples/pi/SobolQRNG/sobol_gold.c
+++ b/examples/pi/SobolQRNG/sobol_gold.c
@@ -0,0 +1,126 @@
 
				+/*
			
 
				+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ *
			
 
				+ * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				+ * proprietary rights in and to this software and related documentation and 
			
 
				+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				+ * of this software and related documentation without an express license 
			
 
				+ * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				+ * 
			
 
				+ */
			
 
				+ 
			
 
				+ /*
			
 
				+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				+ *
			
 
				+ * Sobol Quasi-random Number Generator example
			
 
				+ *
			
 
				+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				+ * http://people.maths.ox.ac.uk/~gilesm/
			
 
				+ *
			
 
				+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				+ * and Frances Kuo, University of New South Wales, Australia
			
 
				+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				+ *
			
 
				+ * For theoretical background see:
			
 
				+ *
			
 
				+ * P. Bratley and B.L. Fox.
			
 
				+ * Implementing Sobol's quasirandom sequence generator
			
 
				+ * http://portal.acm.org/citation.cfm?id=42288
			
 
				+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				+ *
			
 
				+ * S. Joe and F. Kuo.
			
 
				+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				+ * http://portal.acm.org/citation.cfm?id=641879
			
 
				+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				+ */
			
 
				+
			
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <math.h>
			
 
				+#include <string.h>
			
 
				+
			
 
				+#include "sobol.h"
			
 
				+#include "sobol_gold.h"
			
 
				+#include "sobol_primitives.h"
			
 
				+
			
 
				+#define k_2powneg32 2.3283064E-10F
			
 
				+
			
 
				+// Create the direction numbers, based on the primitive polynomials.
			
 
				+void initSobolDirectionVectors(int n_dimensions, unsigned int *directions)
			
 
				+{
			
 
				+    unsigned int *v = directions;
			
 
				+
			
 
				+    int dim;
			
 
				+    for (dim = 0 ; dim < n_dimensions ; dim++)
			
 
				+    {
			
 
				+        // First dimension is a special case
			
 
				+        if (dim == 0)
			
 
				+        {
			
 
				+            int i;
			
 
				+            for (i = 0 ; i < n_directions ; i++)
			
 
				+            {
			
 
				+                // All m's are 1
			
 
				+                v[i] = 1 << (31 - i);
			
 
				+            }
			
 
				+        }
			
 
				+        else
			
 
				+        {
			
 
				+            int d = sobol_primitives[dim].degree;
			
 
				+            // The first direction numbers (up to the degree of the polynomial)
			
 
				+            // are simply v[i] = m[i] / 2^i (stored in Q0.32 format)
			
 
				+            int i;
			
 
				+            for (i = 0 ; i < d ; i++)
			
 
				+            {
			
 
				+                v[i] = sobol_primitives[dim].m[i] << (31 - i);
			
 
				+            }
			
 
				+            // The remaining direction numbers are computed as described in
			
 
				+            // the Bratley and Fox paper.
			
 
				+            // v[i] = a[1]v[i-1] ^ a[2]v[i-2] ^ ... ^ a[v-1]v[i-d+1] ^ v[i-d] ^ v[i-d]/2^d
			
 
				+            for (i = d ; i < n_directions ; i++)
			
 
				+            {
			
 
				+                // First do the v[i-d] ^ v[i-d]/2^d part
			
 
				+                v[i] = v[i - d] ^ (v[i - d] >> d);
			
 
				+                // Now do the a[1]v[i-1] ^ a[2]v[i-2] ^ ... part
			
 
				+                // Note that the coefficients a[] are zero or one and for compactness in
			
 
				+                // the input tables they are stored as bits of a single integer. To extract
			
 
				+                // the relevant bit we use right shift and mask with 1.
			
 
				+                // For example, for a 10 degree polynomial there are ten useful bits in a,
			
 
				+                // so to get a[2] we need to right shift 7 times (to get the 8th bit into
			
 
				+                // the LSB) and then mask with 1.
			
 
				+                int j;
			
 
				+                for (j = 1 ; j < d ; j++)
			
 
				+                {
			
 
				+                    v[i] ^= (((sobol_primitives[dim].a >> (d - 1 - j)) & 1) * v[i - j]);
			
 
				+                }
			
 
				+            }
			
 
				+        }
			
 
				+        v += n_directions;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+// Reference model for generating Sobol numbers on the host
			
 
				+void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output)
			
 
				+{
			
 
				+    unsigned int *v = directions;
			
 
				+
			
 
				+    int d;
			
 
				+    for (d = 0 ; d < n_dimensions ; d++)
			
 
				+    {
			
 
				+        unsigned int X = 0;
			
 
				+        // x[0] is zero (in all dimensions)
			
 
				+        output[n_vectors * d] = 0.0;        
			
 
				+        int i;
			
 
				+        for (i = 1 ; i < n_vectors ; i++)
			
 
				+        {
			
 
				+            // x[i] = x[i-1] ^ v[c]
			
 
				+            //  where c is the index of the rightmost zero bit in i
			
 
				+            //  minus 1 (since C arrays count from zero)
			
 
				+            // In the Bratley and Fox paper this is equation (**)
			
 
				+            X ^= v[ffs(~(i - 1)) - 1];
			
 
				+            output[i + n_vectors * d] = (float)X * k_2powneg32;
			
 
				+        }
			
 
				+        v += n_directions;
			
 
				+    }
			
 
				+}
			
--- a/examples/pi/SobolQRNG/sobol_gold.h
+++ b/examples/pi/SobolQRNG/sobol_gold.h
@@ -0,0 +1,46 @@
 
				+/*
			
 
				+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ *
			
 
				+ * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				+ * proprietary rights in and to this software and related documentation and 
			
 
				+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				+ * of this software and related documentation without an express license 
			
 
				+ * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				+ * 
			
 
				+ */
			
 
				+ 
			
 
				+ /*
			
 
				+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				+ *
			
 
				+ * Sobol Quasi-random Number Generator example
			
 
				+ *
			
 
				+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				+ * http://people.maths.ox.ac.uk/~gilesm/
			
 
				+ *
			
 
				+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				+ * and Frances Kuo, University of New South Wales, Australia
			
 
				+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				+ *
			
 
				+ * For theoretical background see:
			
 
				+ *
			
 
				+ * P. Bratley and B.L. Fox.
			
 
				+ * Implementing Sobol's quasirandom sequence generator
			
 
				+ * http://portal.acm.org/citation.cfm?id=42288
			
 
				+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				+ *
			
 
				+ * S. Joe and F. Kuo.
			
 
				+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				+ * http://portal.acm.org/citation.cfm?id=641879
			
 
				+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				+ *
			
 
				+ */
			
 
				+
			
 
				+#ifndef SOBOL_GOLD_H
			
 
				+#define SOBOL_GOLD_H
			
 
				+
			
 
				+void initSobolDirectionVectors(int n_dimensions, unsigned int *directions);
			
 
				+void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output);
			
 
				+
			
 
				+#endif
			
--- a/examples/pi/SobolQRNG/sobol_gpu.cu
+++ b/examples/pi/SobolQRNG/sobol_gpu.cu
@@ -0,0 +1,168 @@
 
				+/*
			
 
				+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ *
			
 
				+ * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				+ * proprietary rights in and to this software and related documentation and 
			
 
				+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				+ * of this software and related documentation without an express license 
			
 
				+ * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				+ * 
			
 
				+ */
			
 
				+ 
			
 
				+ /*
			
 
				+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				+ *
			
 
				+ * Sobol Quasi-random Number Generator example
			
 
				+ *
			
 
				+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				+ * http://people.maths.ox.ac.uk/~gilesm/
			
 
				+ *
			
 
				+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				+ * and Frances Kuo, University of New South Wales, Australia
			
 
				+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				+ *
			
 
				+ * For theoretical background see:
			
 
				+ *
			
 
				+ * P. Bratley and B.L. Fox.
			
 
				+ * Implementing Sobol's quasirandom sequence generator
			
 
				+ * http://portal.acm.org/citation.cfm?id=42288
			
 
				+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				+ *
			
 
				+ * S. Joe and F. Kuo.
			
 
				+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				+ * http://portal.acm.org/citation.cfm?id=641879
			
 
				+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				+ *
			
 
				+ */
			
 
				+
			
 
				+#include "sobol.h"
			
 
				+#include "sobol_gpu.h"
			
 
				+
			
 
				+#define k_2powneg32 2.3283064E-10F
			
 
				+
			
 
				/*
				 * Device kernel producing the Sobol points of one dimension per grid row.
				 *
				 * blockIdx.y selects the dimension: it offsets d_directions by
				 * n_directions entries and d_output by n_vectors entries per row.
				 * Along x, each thread starts at its global x index and then advances by
				 * gridDim.x * blockDim.x until n_vectors is reached.
				 *
				 * The code uses __ffs(stride) - 1 as log2(stride) and stride - 1 as a bit
				 * mask, i.e. it assumes the x stride is a power of two. Only threads with
				 * threadIdx.x < n_directions load the shared table, so blockDim.x has to be
				 * at least n_directions. n_dimensions is not used inside the kernel.
				 */
				__global__ void sobolGPU_kernel(unsigned n_vectors, unsigned n_dimensions, unsigned *d_directions, float *d_output)
				{
				    __shared__ unsigned int v[n_directions];

				    // Move both pointers to the dimension handled by this grid row
				    d_directions += n_directions * blockIdx.y;
				    d_output += n_vectors * blockIdx.y;

				    // Stage this dimension's direction numbers in shared memory; one
				    // thread per direction number is enough.
				    if (threadIdx.x < n_directions)
				    {
				        v[threadIdx.x] = d_directions[threadIdx.x];
				    }
				    __syncthreads();

				    // First vector computed by this thread, and the step to its next one
				    int i0     = threadIdx.x + blockIdx.x * blockDim.x;
				    int stride = gridDim.x * blockDim.x;

				    // Position of the lowest set bit of stride, counted from zero
				    const int log2stride = __ffs(stride) - 1;

				    // Gray code of the starting index
				    // c.f. Numerical Recipes in C, chapter 20
				    unsigned int g = i0 ^ (i0 >> 1);

				    // Equation (*) of the Bratley and Fox paper: build x[i0] directly,
				    // without knowing x[i0 - 1], by xor-ing v[k] for every set bit k of
				    // the gray code. 0u - bit yields an all-ones or all-zeros mask, which
				    // avoids a multiplication.
				    unsigned int X = 0;
				    for (unsigned int k = 0 ; k < log2stride ; k++)
				    {
				        X ^= (0u - (g & 1u)) & v[k];
				        g >>= 1;
				    }
				    if (i0 < n_vectors)
				    {
				        d_output[i0] = (float)X * k_2powneg32;
				    }

				    // Remaining points: x[i] is derived from x[i - stride]. Over `stride`
				    // consecutive steps of equation (**), the low direction numbers are
				    // applied an even number of times and cancel out, which leaves
				    // v[log2(stride) - 1] plus the one direction number selected by the
				    // bits above the stride.
				    const unsigned int v_below_stride = v[log2stride - 1];
				    const unsigned int low_bits = stride - 1;

				    unsigned int i = i0 + stride;
				    while (i < n_vectors)
				    {
				        // Rightmost zero bit of the previous index, ignoring the bottom
				        // log2(stride) bits, minus 1 for C array indexing
				        const unsigned int prev = i - stride;
				        X ^= v_below_stride ^ v[__ffs(~(prev | low_bits)) - 1];
				        d_output[i] = (float)X * k_2powneg32;
				        i += stride;
				    }
				}
			
 
				+
			
 
				/*
				 * Host wrapper: launch sobolGPU_kernel to fill d_output with n_vectors
				 * Sobol numbers for each of the n_dimensions dimensions.
				 *
				 * d_directions and d_output are forwarded unchanged to the kernel.
				 * Returns without launching anything when n_vectors or n_dimensions is
				 * not positive.
				 */
				extern "C"
				void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output)
				{
				    const int threadsperblock = 64;

				    // Nothing to generate. This also protects the division by
				    // n_dimensions below and avoids launching an empty grid.
				    if (n_vectors <= 0 || n_dimensions <= 0)
				    {
				        return;
				    }

				    // Set up the execution configuration
				    dim3 dimGrid;
				    dim3 dimBlock;

				    // This implementation of the generator outputs all the draws for
				    // one dimension in a contiguous region of memory, followed by the
				    // next dimension and so on, so each grid row handles one dimension.
				    dimGrid.y = n_dimensions;

				    // If the number of dimensions is large, one block per dimension is
				    // enough; if it is small (e.g. less than 32) the vectors are also
				    // partitioned across blocks. The block count is capped where the
				    // number of vectors is too small to be partitioned.
				    unsigned int wanted_blocks = 1 + 31 / n_dimensions;
				    if (wanted_blocks > (unsigned int)(n_vectors / threadsperblock))
				    {
				        wanted_blocks = (n_vectors + threadsperblock - 1) / threadsperblock;
				    }

				    // The kernel uses __ffs(stride) - 1 as log2(stride) and stride - 1 as
				    // a mask, so gridDim.x * blockDim.x must be a power of two. Round the
				    // block count up; surplus threads are masked by the kernel's
				    // bounds checks against n_vectors.
				    dimGrid.x = 1;
				    while (dimGrid.x < wanted_blocks)
				    {
				        dimGrid.x *= 2;
				    }

				    // Fix the number of threads
				    dimBlock.x = threadsperblock;

				    // Execute GPU kernel
				    sobolGPU_kernel<<<dimGrid, dimBlock>>>(n_vectors, n_dimensions, d_directions, d_output);
				}
			
--- a/examples/pi/SobolQRNG/sobol_gpu.h
+++ b/examples/pi/SobolQRNG/sobol_gpu.h
@@ -0,0 +1,46 @@
 
				+/*
			
 
				+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ *
			
 
				+ * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				+ * proprietary rights in and to this software and related documentation and 
			
 
				+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				+ * of this software and related documentation without an express license 
			
 
				+ * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				+ * 
			
 
				+ */
			
 
				+ 
			
 
				+ /*
			
 
				+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				+ *
			
 
				+ * Sobol Quasi-random Number Generator example
			
 
				+ *
			
 
				+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				+ * http://people.maths.ox.ac.uk/~gilesm/
			
 
				+ *
			
 
				+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				+ * and Frances Kuo, University of New South Wales, Australia
			
 
				+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				+ *
			
 
				+ * For theoretical background see:
			
 
				+ *
			
 
				+ * P. Bratley and B.L. Fox.
			
 
				+ * Implementing Sobol's quasirandom sequence generator
			
 
				+ * http://portal.acm.org/citation.cfm?id=42288
			
 
				+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				+ *
			
 
				+ * S. Joe and F. Kuo.
			
 
				+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				+ * http://portal.acm.org/citation.cfm?id=641879
			
 
				+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				+ *
			
 
				+ */
			
 
				+
			
 
				+#ifndef SOBOL_GPU_H
			
 
				+#define SOBOL_GPU_H
			
 
				+
			
 
				/*
				 * Launch the Sobol generator on the device; defined in sobol_gpu.cu.
				 * The C linkage specifier is only emitted for C++ translation units so
				 * that this header can also be included from plain C code.
				 */
				#ifdef __cplusplus
				extern "C"
				#endif
				void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output);
			
 
				+
			
 
				+#endif
			
--- a/examples/pi/SobolQRNG/sobol_primitives.c
+++ b/examples/pi/SobolQRNG/sobol_primitives.c
--- a/examples/pi/SobolQRNG/sobol_primitives.h
+++ b/examples/pi/SobolQRNG/sobol_primitives.h
@@ -0,0 +1,60 @@
 
				+/*
			
 
				+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ *
			
 
				+ * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				+ * proprietary rights in and to this software and related documentation and 
			
 
				+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				+ * of this software and related documentation without an express license 
			
 
				+ * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				+ * 
			
 
				+ */
			
 
				+ 
			
 
				+ /*
			
 
				+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				+ *
			
 
				+ * Sobol Quasi-random Number Generator example
			
 
				+ *
			
 
				+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				+ * http://people.maths.ox.ac.uk/~gilesm/
			
 
				+ *
			
 
				+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				+ * and Frances Kuo, University of New South Wales, Australia
			
 
				+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				+ *
			
 
				+ * For theoretical background see:
			
 
				+ *
			
 
				+ * P. Bratley and B.L. Fox.
			
 
				+ * Implementing Sobol's quasirandom sequence generator
			
 
				+ * http://portal.acm.org/citation.cfm?id=42288
			
 
				+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				+ *
			
 
				+ * S. Joe and F. Kuo.
			
 
				+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				+ * http://portal.acm.org/citation.cfm?id=641879
			
 
				+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				+ *
			
 
				+ */
			
 
				+
			
 
				+#ifndef SOBOL_PRIMITIVES_H
			
 
				+#define SOBOL_PRIMITIVES_H
			
 
				+
			
 
				+#define max_m 17
			
 
				+
			
 
				+// Each primitive is stored as a struct where
			
 
				+//  dimension is the dimension number of the polynomial (unused)
			
 
				+//  degree is the degree of the polynomial
			
 
				+//  a is a binary word representing the coefficients 
			
 
				+//  m is the array of m values
			
 
				+struct primitive
			
 
				+{
			
 
				+    unsigned int dimension;
			
 
				+    unsigned int degree;
			
 
				+    unsigned int a;
			
 
				+    unsigned int m[max_m];
			
 
				+};
			
 
				+
			
 
				+extern const struct primitive sobol_primitives[];
			
 
				+
			
 
				+#endif
			
--- a/examples/pi/pi.c
+++ b/examples/pi/pi.c
@@ -14,6 +14,7 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				+#include "SobolQRNG/sobol.h"
			
 
				 #include "pi.h"
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -22,9 +23,14 @@ void cuda_kernel(void **descr, void *cl_arg);
 
				 
			
 
				 static void cpu_kernel(void *descr[], void *cl_arg)
			
 
				 {
			
 
				-	TYPE *random_numbers_x = (TYPE *)STARPU_GET_VECTOR_PTR(descr[0]);
			
 
				-	TYPE *random_numbers_y = (TYPE *)STARPU_GET_VECTOR_PTR(descr[1]);
			
 
				-	unsigned nx = STARPU_GET_VECTOR_NX(descr[0]);
			
 
				+	unsigned *directions = (unsigned *)STARPU_GET_VECTOR_PTR(descr[0]);
			
 
				+	unsigned nx = NSHOT_PER_TASK;
			
 
				+
			
 
				+	TYPE *random_numbers = malloc(2*nx*sizeof(TYPE));
			
 
				+	sobolCPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
			
 
				+
			
 
				+	TYPE *random_numbers_x = &random_numbers[0];
			
 
				+	TYPE *random_numbers_y = &random_numbers[nx];
			
 
				 
			
 
				 	unsigned current_cnt = 0;
			
 
				 
			
@@ -40,42 +46,33 @@ static void cpu_kernel(void *descr[], void *cl_arg)
 
				 		current_cnt += success;
			
 
				 	}
			
 
				 
			
 
				-	unsigned *cnt = (unsigned *)STARPU_GET_VECTOR_PTR(descr[2]);
			
 
				+	unsigned *cnt = (unsigned *)STARPU_GET_VECTOR_PTR(descr[1]);
			
 
				 	*cnt = current_cnt;
			
 
				+
			
 
				+	free(random_numbers);
			
 
				 }
			
 
				 
			
 
				+
			
 
				+
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	unsigned i;
			
 
				 
			
 
				 	starpu_init(NULL);
			
 
				 
			
 
				-	TYPE *random_array_x;
			
 
				-	starpu_malloc_pinned_if_possible((void **)&random_array_x, SIZE*sizeof(TYPE));
			
 
				-	STARPU_ASSERT(random_array_x);
			
 
				-
			
 
				-	TYPE *random_array_y;
			
 
				-	starpu_malloc_pinned_if_possible((void **)&random_array_y, SIZE*sizeof(TYPE));
			
 
				-	STARPU_ASSERT(random_array_y);
			
 
				-
			
 
				-	unsigned *cnt_array;
			
 
				-	starpu_malloc_pinned_if_possible((void **)&cnt_array, NTASKS*sizeof(unsigned));
			
 
				-	STARPU_ASSERT(cnt_array);
			
 
				-
			
 
				-	/* First generate an array of random numbers */
			
 
				-	for (i = 0; i < SIZE; i++)
			
 
				-	{
			
 
				-		random_array_x[i] = (((TYPE)rand()/(TYPE)RAND_MAX)*2.0 - 1.0);
			
 
				-		random_array_y[i] = (((TYPE)rand()/(TYPE)RAND_MAX)*2.0 - 1.0);
			
 
				-	}
			
 
				+	/* Initialize the random number generator */
			
 
				+	unsigned *sobol_qrng_directions = malloc(n_dimensions*n_directions*sizeof(unsigned));
			
 
				+	STARPU_ASSERT(sobol_qrng_directions);
			
 
				 
			
 
				-	/* Register the entire array */
			
 
				-	starpu_data_handle random_array_handle_x;
			
 
				-	starpu_register_vector_data(&random_array_handle_x, 0, (uintptr_t)random_array_x, SIZE, sizeof(TYPE));
			
 
				+	initSobolDirectionVectors(n_dimensions, sobol_qrng_directions);
			
 
				 
			
 
				-	starpu_data_handle random_array_handle_y;
			
 
				-	starpu_register_vector_data(&random_array_handle_y, 0, (uintptr_t)random_array_y, SIZE, sizeof(TYPE));
			
 
				+	/* Any worker may use that array now */
			
 
				+	starpu_data_handle sobol_qrng_direction_handle;
			
 
				+	starpu_register_vector_data(&sobol_qrng_direction_handle, 0,
			
 
				+		(uintptr_t)sobol_qrng_directions, n_dimensions*n_directions, sizeof(unsigned));
			
 
				 
			
 
				+	unsigned *cnt_array = malloc(NTASKS*sizeof(unsigned));
			
 
				+	STARPU_ASSERT(cnt_array);
			
 
				 	starpu_data_handle cnt_array_handle;
			
 
				 	starpu_register_vector_data(&cnt_array_handle, 0, (uintptr_t)cnt_array, NTASKS, sizeof(unsigned));
			
 
				 
			
@@ -86,8 +83,10 @@ int main(int argc, char **argv)
 
				 		.filter_arg = NTASKS
			
 
				 	};
			
 
				 	
			
 
				+#if 0
			
 
				 	starpu_partition_data(random_array_handle_x, &f);
			
 
				 	starpu_partition_data(random_array_handle_y, &f);
			
 
				+#endif
			
 
				 	starpu_partition_data(cnt_array_handle, &f);
			
 
				 
			
 
				 	static struct starpu_perfmodel_t model = {
			
@@ -101,7 +100,7 @@ int main(int argc, char **argv)
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		.cuda_func = cuda_kernel,
			
 
				 #endif
			
 
				-		.nbuffers = 3,
			
 
				+		.nbuffers = 2,
			
 
				 		.model = &model
			
 
				 	};
			
 
				 
			
@@ -111,12 +110,12 @@ int main(int argc, char **argv)
 
				 
			
 
				 		task->cl = &cl;
			
 
				 
			
 
				-		task->buffers[0].handle = starpu_get_sub_data(random_array_handle_x, 1, i);
			
 
				+		STARPU_ASSERT(starpu_get_sub_data(cnt_array_handle, 1, i));
			
 
				+
			
 
				+		task->buffers[0].handle = sobol_qrng_direction_handle;
			
 
				 		task->buffers[0].mode   = STARPU_R;
			
 
				-		task->buffers[1].handle = starpu_get_sub_data(random_array_handle_y, 1, i);
			
 
				-		task->buffers[1].mode   = STARPU_R;
			
 
				-		task->buffers[2].handle = starpu_get_sub_data(cnt_array_handle, 1, i);
			
 
				-		task->buffers[2].mode   = STARPU_W;
			
 
				+		task->buffers[1].handle = starpu_get_sub_data(cnt_array_handle, 1, i);
			
 
				+		task->buffers[1].mode   = STARPU_W;
			
 
				 
			
 
				 		int ret = starpu_submit_task(task);
			
 
				 		STARPU_ASSERT(!ret);
			
@@ -129,7 +128,7 @@ int main(int argc, char **argv)
 
				 	starpu_sync_data_with_mem(cnt_array_handle, STARPU_RW);
			
 
				 
			
 
				 	/* Count the total number of entries */
			
 
				-	unsigned total_cnt = 0;
			
 
				+	unsigned long total_cnt = 0;
			
 
				 	for (i = 0; i < NTASKS; i++)
			
 
				 		total_cnt += cnt_array[i];
			
 
				 
			
@@ -139,7 +138,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4, probability to impact the disk: pi/4 */
			
 
				 
			
 
				-	fprintf(stderr, "Pi approximation : %f (%d / %d)\n", ((TYPE)total_cnt*4)/(SIZE), total_cnt, SIZE);
			
 
				+	fprintf(stderr, "Pi approximation : %f (%ld / %ld)\n", ((TYPE)total_cnt*4)/(SIZE), total_cnt, SIZE);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/examples/pi/pi.h
+++ b/examples/pi/pi.h
@@ -20,7 +20,7 @@
 
				 #include <starpu.h>
			
 
				 #include <stdio.h>
			
 
				 
			
 
				-#define NTASKS	(64ULL)
			
 
				+#define NTASKS	(16384ULL)
			
 
				 #define NSHOT_PER_TASK	(16*1024*1024ULL)
			
 
				 
			
 
				 #define SIZE	(NTASKS*NSHOT_PER_TASK)
			
@@ -29,4 +29,6 @@
 
				 
			
 
				 //extern "C" void cuda_kernel(void *descr[], void *cl_arg);
			
 
				 
			
 
				+static int n_dimensions = 100;
			
 
				+
			
 
				 #endif // __PI_H__
			
--- a/examples/pi/pi_kernel.cu
+++ b/examples/pi/pi_kernel.cu
@@ -14,6 +14,7 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				+#include "SobolQRNG/sobol_gpu.h"
			
 
				 #include "pi.h"
			
 
				 
			
 
				 #define MAXNBLOCKS	128
			
@@ -99,11 +100,21 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 
				 {
			
 
				 	cudaError_t cures;
			
 
				 
			
 
				-	TYPE *random_numbers_x = (TYPE *)STARPU_GET_VECTOR_PTR(descr[0]);
			
 
				-	TYPE *random_numbers_y = (TYPE *)STARPU_GET_VECTOR_PTR(descr[1]);
			
 
				-	unsigned nx = STARPU_GET_VECTOR_NX(descr[0]);
			
 
				+	unsigned *directions = (unsigned *)STARPU_GET_VECTOR_PTR(descr[0]);
			
 
				+	unsigned nx = NSHOT_PER_TASK;
			
 
				 
			
 
				-	unsigned *cnt = (unsigned *)STARPU_GET_VECTOR_PTR(descr[2]);
			
 
				+	/* Generate Random numbers */
			
 
				+	float *random_numbers;
			
 
				+	cudaMalloc((void **)&random_numbers, 2*nx*sizeof(float));
			
 
				+	STARPU_ASSERT(random_numbers);
			
 
				+	
			
 
				+	sobolGPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
			
 
				+	cudaThreadSynchronize();
			
 
				+
			
 
				+	TYPE *random_numbers_x = &random_numbers[0];
			
 
				+	TYPE *random_numbers_y = &random_numbers[nx];
			
 
				+
			
 
				+	unsigned *cnt = (unsigned *)STARPU_GET_VECTOR_PTR(descr[1]);
			
 
				 
			
 
				 	/* How many blocks do we use ? */ 
			
 
				 	unsigned nblocks = 128; // TODO
			
@@ -133,4 +144,5 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 	cudaFree(per_block_cnt);
			
 
				+	cudaFree(random_numbers);
			
 
				 }