/* StarPU --- Runtime system for heterogeneous multicore architectures. * * Copyright (C) 2019 Mael Keryell * * StarPU is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or (at * your option) any later version. * * StarPU is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License in COPYING.LGPL for more details. */ #include #include #include #include // __device__ inline double cndGPU(double d) // { // const double A1 = 0.31938153f; // const double A2 = -0.356563782f; // const double A3 = 1.781477937f; // const double A4 = -1.821255978f; // const double A5 = 1.330274429f; // const float RSQRT2PI = 0.39894228040143267793994605993438f; // double K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d))); // double cnd = RSQRT2PI * __expf(- 0.5f * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); // if (d > 0) // cnd = 1.0f - cnd; // return cnd; // } __device__ inline double cndGPU(double d) { return (1.0 + erf(d/sqrt(2.0)))/2.0; } __global__ void gpuBlackScholesKernel(double *S, double *K, double *R, double *T, double *SIG, double *CRES, double *PRES, uint32_t nxS) { uint32_t i, id; id = blockIdx.x * blockDim.x + threadIdx.x; i = id % nxS; double sqrtT = __fdividef(1.0F, rsqrtf(T[i])); double d1 = (log(S[i] / K[i]) + (R[i] + SIG[i] * SIG[i] * 0.5) * T[i]) / (SIG[i] * sqrt(T[i])); double d2 = (log(S[i] / K[i]) + (R[i] - SIG[i] * SIG[i] * 0.5) * T[i]) / (SIG[i] * sqrt(T[i])); CRES[i] = S[i] * (normcdf(d1)) - K[i] * exp(-R[i] * T[i]) * normcdf(d2); PRES[i] = -S[i] * (normcdf(-d1)) + K[i] * exp(-R[i] * T[i]) * normcdf(-d2); } #define THREADS_PER_BLOCK 64 extern "C" void gpu_black_scholes(void *descr[], void *args) { double *S, *K, *R, *T, *SIG, *CRES, *PRES; uint32_t nxS; uint32_t nblocks; S = (double *) STARPU_MATRIX_GET_PTR(descr[0]); K = (double *) STARPU_MATRIX_GET_PTR(descr[1]); R = (double *) STARPU_MATRIX_GET_PTR(descr[2]); T = (double *) STARPU_MATRIX_GET_PTR(descr[3]); SIG = (double *) STARPU_MATRIX_GET_PTR(descr[4]); CRES = (double *) STARPU_MATRIX_GET_PTR(descr[5]); PRES = (double *) STARPU_MATRIX_GET_PTR(descr[6]); nxS = STARPU_MATRIX_GET_NX(descr[0]); nblocks = (nxS + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; gpuBlackScholesKernel <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream() >>> (S, K, R, T, SIG, CRES, PRES, nxS); cudaStreamSynchronize(starpu_cuda_get_local_stream()); }