sobol_gpu.cu 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. /*
  2. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
  3. *
  4. * NVIDIA Corporation and its licensors retain all intellectual property and
  5. * proprietary rights in and to this software and related documentation and
  6. * any modifications thereto. Any use, reproduction, disclosure, or distribution
  7. * of this software and related documentation without an express license
  8. * agreement from NVIDIA Corporation is strictly prohibited.
  9. *
  10. */
  11. /*
  12. * Portions Copyright (c) 1993-2009 NVIDIA Corporation. All rights reserved.
  13. * Portions Copyright (c) 2009 Mike Giles, Oxford University. All rights reserved.
  14. * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe. All rights reserved.
  15. *
  16. * Sobol Quasi-random Number Generator example
  17. *
  18. * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
  19. * http://people.maths.ox.ac.uk/~gilesm/
  20. *
  21. * and C code developed by Stephen Joe, University of Waikato, New Zealand
  22. * and Frances Kuo, University of New South Wales, Australia
  23. * http://web.maths.unsw.edu.au/~fkuo/sobol/
  24. *
  25. * For theoretical background see:
  26. *
  27. * P. Bratley and B.L. Fox.
  28. * Implementing Sobol's quasirandom sequence generator
  29. * http://portal.acm.org/citation.cfm?id=42288
  30. * ACM Trans. on Math. Software, 14(1):88-100, 1988
  31. *
  32. * S. Joe and F. Kuo.
  33. * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
  34. * http://portal.acm.org/citation.cfm?id=641879
  35. * ACM Trans. on Math. Software, 29(1):49-57, 2003
  36. *
  37. */
  38. #include "sobol.h"
  39. #include "sobol_gpu.h"
  40. #include <starpu.h>
  41. #include <starpu_cuda.h>
  42. #define k_2powneg32 2.3283064E-10F
  43. __global__ void sobolGPU_kernel(unsigned n_vectors, unsigned n_dimensions, unsigned *d_directions, float *d_output)
  44. {
  45. __shared__ unsigned int v[n_directions];
  46. // Offset into the correct dimension as specified by the
  47. // block y coordinate
  48. d_directions = d_directions + n_directions * blockIdx.y;
  49. d_output = d_output + n_vectors * blockIdx.y;
  50. // Copy the direction numbers for this dimension into shared
  51. // memory - there are only 32 direction numbers so only the
  52. // first 32 (n_directions) threads need participate.
  53. if (threadIdx.x < n_directions)
  54. {
  55. v[threadIdx.x] = d_directions[threadIdx.x];
  56. }
  57. __syncthreads();
  58. // Set initial index (i.e. which vector this thread is
  59. // computing first) and stride (i.e. step to the next vector
  60. // for this thread)
  61. int i0 = threadIdx.x + blockIdx.x * blockDim.x;
  62. int stride = gridDim.x * blockDim.x;
  63. // Get the gray code of the index
  64. // c.f. Numerical Recipes in C, chapter 20
  65. // http://www.nrbook.com/a/bookcpdf/c20-2.pdf
  66. unsigned int g = i0 ^ (i0 >> 1);
  67. // Initialisation for first point x[i0]
  68. // In the Bratley and Fox paper this is equation (*), where
  69. // we are computing the value for x[n] without knowing the
  70. // value of x[n-1].
  71. unsigned int X = 0;
  72. unsigned int mask;
  73. for (unsigned int k = 0 ; k < __ffs(stride) - 1 ; k++)
  74. {
  75. // We want X ^= g_k * v[k], where g_k is one or zero.
  76. // We do this by setting a mask with all bits equal to
  77. // g_k. In reality we keep shifting g so that g_k is the
  78. // LSB of g. This way we avoid multiplication.
  79. mask = - (g & 1);
  80. X ^= mask & v[k];
  81. g = g >> 1;
  82. }
  83. if (i0 < n_vectors)
  84. {
  85. d_output[i0] = (float)X * k_2powneg32;
  86. }
  87. // Now do rest of points, using the stride
  88. // Here we want to generate x[i] from x[i-stride] where we
  89. // don't have any of the x in between, therefore we have to
  90. // revisit the equation (**), this is easiest with an example
  91. // so assume stride is 16.
  92. // From x[n] to x[n+16] there will be:
  93. // 8 changes in the first bit
  94. // 4 changes in the second bit
  95. // 2 changes in the third bit
  96. // 1 change in the fourth
  97. // 1 change in one of the remaining bits
  98. //
  99. // What this means is that in the equation:
  100. // x[n+1] = x[n] ^ v[p]
  101. // x[n+2] = x[n+1] ^ v[q] = x[n] ^ v[p] ^ v[q]
  102. // ...
  103. // We will apply xor with v[1] eight times, v[2] four times,
  104. // v[3] twice, v[4] once and one other direction number once.
  105. // Since two xors cancel out, we can skip even applications
  106. // and just apply xor with v[4] (i.e. log2(16)) and with
  107. // the current applicable direction number.
  108. // Note that all these indices count from 1, so we need to
  109. // subtract 1 from them all to account for C arrays counting
  110. // from zero.
  111. unsigned int v_log2stridem1 = v[__ffs(stride) - 2];
  112. unsigned int v_stridemask = stride - 1;
  113. for (unsigned int i = i0 + stride ; i < n_vectors ; i += stride)
  114. {
  115. // x[i] = x[i-stride] ^ v[b] ^ v[c]
  116. // where b is log2(stride) minus 1 for C array indexing
  117. // where c is the index of the rightmost zero bit in i,
  118. // not including the bottom log2(stride) bits, minus 1
  119. // for C array indexing
  120. // In the Bratley and Fox paper this is equation (**)
  121. X ^= v_log2stridem1 ^ v[__ffs(~((i - stride) | v_stridemask)) - 1];
  122. d_output[i] = (float)X * k_2powneg32;
  123. }
  124. }
  125. extern "C"
  126. void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output)
  127. {
  128. const int threadsperblock = 64;
  129. // Set up the execution configuration
  130. dim3 dimGrid;
  131. dim3 dimBlock;
  132. // This implementation of the generator outputs all the draws for
  133. // one dimension in a contiguous region of memory, followed by the
  134. // next dimension and so on.
  135. // Therefore all threads within a block will be processing different
  136. // vectors from the same dimension. As a result we want the total
  137. // number of blocks to be a multiple of the number of dimensions.
  138. dimGrid.y = n_dimensions;
  139. // If the number of dimensions is large then we will set the number
  140. // of blocks to equal the number of dimensions (i.e. dimGrid.x = 1)
  141. // but if the number of dimensions is small (e.g. less than 32) then
  142. // we'll partition the vectors across blocks (as well as threads).
  143. // We also need to cap the dimGrid.x where the number of vectors
  144. // is too small to be partitioned.
  145. dimGrid.x = 1 + 31 / n_dimensions;
  146. if (dimGrid.x > (unsigned int)(n_vectors / threadsperblock))
  147. {
  148. dimGrid.x = (n_vectors + threadsperblock - 1) / threadsperblock;
  149. }
  150. // Fix the number of threads
  151. dimBlock.x = threadsperblock;
  152. // Execute GPU kernel
  153. sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
  154. }