| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374 | #include <stdio.h>#include <stdint.h>#include <math.h>#include <starpu.h>// __device__ inline double cndGPU(double d)// {//   const double A1 = 0.31938153f;//   const double A2 = -0.356563782f;//   const double A3 = 1.781477937f;//   const double A4 = -1.821255978f;//   const double A5 = 1.330274429f;//   const float RSQRT2PI = 0.39894228040143267793994605993438f;    //   double K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d)));    //   double cnd = RSQRT2PI * __expf(- 0.5f * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))));//     if (d > 0)//       cnd = 1.0f - cnd;//     return cnd;// }__device__ inline double cndGPU(double d){  return (1.0 + erf(d/sqrt(2.0)))/2.0;}__global__ void gpuBlackScholesKernel(double *S, double *K, double *R, double *T, 				      double *SIG, double *CRES, double *PRES,				      uint32_t nxS){  uint32_t i, id;    id = blockIdx.x * blockDim.x + threadIdx.x;  i = id % nxS;    double sqrtT = __fdividef(1.0F, rsqrtf(T[i]));  double d1 = (log(S[i] / K[i]) + (R[i] + SIG[i] * SIG[i] * 0.5) * T[i]) / (SIG[i] * sqrt(T[i]));    double d2 = (log(S[i] / K[i]) + (R[i] - SIG[i] * SIG[i] * 0.5) * T[i]) / (SIG[i] * sqrt(T[i]));    CRES[i] = S[i] * (normcdf(d1)) - K[i] * exp(-R[i] * T[i]) * normcdf(d2);  PRES[i] = -S[i] * (normcdf(-d1)) + K[i] * exp(-R[i] * T[i]) * normcdf(-d2);}#define THREADS_PER_BLOCK 64extern "C" void gpu_black_scholes(void *descr[], void *args){  double *S, *K, *R, *T, *SIG, *CRES, *PRES;  uint32_t nxS;  uint32_t nblocks;  S = (double *) STARPU_MATRIX_GET_PTR(descr[0]);  K = (double *) STARPU_MATRIX_GET_PTR(descr[1]);  R = (double *) STARPU_MATRIX_GET_PTR(descr[2]);  T = (double *) STARPU_MATRIX_GET_PTR(descr[3]);  SIG = (double *) STARPU_MATRIX_GET_PTR(descr[4]);  CRES = (double *) STARPU_MATRIX_GET_PTR(descr[5]);  PRES = (double *) STARPU_MATRIX_GET_PTR(descr[6]);  nxS = STARPU_MATRIX_GET_NX(descr[0]);  nblocks = (nxS + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;  gpuBlackScholesKernel    <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()    >>> (S, K, R, T, SIG, CRES, PRES, nxS);    cudaStreamSynchronize(starpu_cuda_get_local_stream());}
 |