|
@@ -23,18 +23,18 @@
|
|
unsigned start = threadIdx.x + blockIdx.x * blockDim.x; \
|
|
unsigned start = threadIdx.x + blockIdx.x * blockDim.x; \
|
|
unsigned numthreads = blockDim.x * gridDim.x;
|
|
unsigned numthreads = blockDim.x * gridDim.x;
|
|
|
|
|
|
-#define DISTRIB_1d(n, func,args,stream) \
|
|
|
|
|
|
+#define DISTRIB_1d(n, func,args) \
|
|
unsigned threads_per_block = 128; \
|
|
unsigned threads_per_block = 128; \
|
|
\
|
|
\
|
|
if (n < threads_per_block) { \
|
|
if (n < threads_per_block) { \
|
|
dim3 dimGrid(n); \
|
|
dim3 dimGrid(n); \
|
|
- func <<<dimGrid, 1, stream>>> args; \
|
|
|
|
|
|
+ func <<<dimGrid, 1>>> args; \
|
|
} else { \
|
|
} else { \
|
|
dim3 dimGrid(n / threads_per_block); \
|
|
dim3 dimGrid(n / threads_per_block); \
|
|
dim3 dimBlock(threads_per_block); \
|
|
dim3 dimBlock(threads_per_block); \
|
|
- func <<<dimGrid, dimBlock, stream>>> args; \
|
|
|
|
|
|
+ func <<<dimGrid, dimBlock>>> args; \
|
|
} \
|
|
} \
|
|
- cudaStreamSynchronize(stream); \
|
|
|
|
|
|
+ cudaThreadSynchronize(); \
|
|
|
|
|
|
extern "C" __global__ void
|
|
extern "C" __global__ void
|
|
STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
|
|
STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
|
|
@@ -48,9 +48,9 @@ STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i
|
|
}
|
|
}
|
|
|
|
|
|
extern "C" void
|
|
extern "C" void
|
|
-STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2, cudaStream_t stream)
|
|
|
|
|
|
+STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
|
|
{
|
|
{
|
|
- DISTRIB_1d(n2, STARPUFFT(cuda_twist1_1d), (in, twisted1, i, n1, n2), stream);
|
|
|
|
|
|
+ DISTRIB_1d(n2, STARPUFFT(cuda_twist1_1d), (in, twisted1, i, n1, n2));
|
|
}
|
|
}
|
|
|
|
|
|
extern "C" __global__ void
|
|
extern "C" __global__ void
|
|
@@ -66,9 +66,9 @@ STARPUFFT(cuda_twiddle_1d)(_cuComplex * out, const _cuComplex * roots, unsigned
|
|
}
|
|
}
|
|
|
|
|
|
extern "C" void
|
|
extern "C" void
|
|
-STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i, cudaStream_t stream)
|
|
|
|
|
|
+STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i)
|
|
{
|
|
{
|
|
- DISTRIB_1d(n, STARPUFFT(cuda_twiddle_1d), (out, roots, n, i), stream);
|
|
|
|
|
|
+ DISTRIB_1d(n, STARPUFFT(cuda_twiddle_1d), (out, roots, n, i));
|
|
}
|
|
}
|
|
|
|
|
|
#define VARS_2d \
|
|
#define VARS_2d \
|
|
@@ -78,29 +78,29 @@ STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsign
|
|
unsigned numthreadsy = blockDim.y * gridDim.y;
|
|
unsigned numthreadsy = blockDim.y * gridDim.y;
|
|
|
|
|
|
/* FIXME: introduce threads_per_dim_n / m instead */
|
|
/* FIXME: introduce threads_per_dim_n / m instead */
|
|
-#define DISTRIB_2d(n, m, func, args, stream) \
|
|
|
|
|
|
+#define DISTRIB_2d(n, m, func, args) \
|
|
unsigned threads_per_dim = 16; \
|
|
unsigned threads_per_dim = 16; \
|
|
if (n < threads_per_dim) { \
|
|
if (n < threads_per_dim) { \
|
|
if (m < threads_per_dim) { \
|
|
if (m < threads_per_dim) { \
|
|
dim3 dimGrid(n, m); \
|
|
dim3 dimGrid(n, m); \
|
|
- func <<<dimGrid, 1, stream>>> args; \
|
|
|
|
|
|
+ func <<<dimGrid, 1>>> args; \
|
|
} else { \
|
|
} else { \
|
|
dim3 dimGrid(1, m / threads_per_dim); \
|
|
dim3 dimGrid(1, m / threads_per_dim); \
|
|
dim3 dimBlock(n, threads_per_dim); \
|
|
dim3 dimBlock(n, threads_per_dim); \
|
|
- func <<<dimGrid, dimBlock, stream>>> args; \
|
|
|
|
|
|
+ func <<<dimGrid, dimBlock>>> args; \
|
|
} \
|
|
} \
|
|
} else { \
|
|
} else { \
|
|
if (m < threads_per_dim) { \
|
|
if (m < threads_per_dim) { \
|
|
dim3 dimGrid(n / threads_per_dim, 1); \
|
|
dim3 dimGrid(n / threads_per_dim, 1); \
|
|
dim3 dimBlock(threads_per_dim, m); \
|
|
dim3 dimBlock(threads_per_dim, m); \
|
|
- func <<<dimGrid, dimBlock, stream>>> args; \
|
|
|
|
|
|
+ func <<<dimGrid, dimBlock>>> args; \
|
|
} else { \
|
|
} else { \
|
|
dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
|
|
dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
|
|
dim3 dimBlock(threads_per_dim, threads_per_dim); \
|
|
dim3 dimBlock(threads_per_dim, threads_per_dim); \
|
|
- func <<<dimGrid, dimBlock, stream>>> args; \
|
|
|
|
|
|
+ func <<<dimGrid, dimBlock>>> args; \
|
|
} \
|
|
} \
|
|
} \
|
|
} \
|
|
- cudaStreamSynchronize(stream);
|
|
|
|
|
|
+ cudaThreadSynchronize(); \
|
|
|
|
|
|
extern "C" __global__ void
|
|
extern "C" __global__ void
|
|
STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
|
|
STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
|
|
@@ -117,9 +117,9 @@ STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i
|
|
}
|
|
}
|
|
|
|
|
|
extern "C" void
|
|
extern "C" void
|
|
-STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2, cudaStream_t stream)
|
|
|
|
|
|
+STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
|
|
{
|
|
{
|
|
- DISTRIB_2d(n2, m2, STARPUFFT(cuda_twist1_2d), (in, twisted1, i, j, n1, n2, m1, m2), stream);
|
|
|
|
|
|
+ DISTRIB_2d(n2, m2, STARPUFFT(cuda_twist1_2d), (in, twisted1, i, j, n1, n2, m1, m2));
|
|
}
|
|
}
|
|
|
|
|
|
extern "C" __global__ void
|
|
extern "C" __global__ void
|
|
@@ -137,7 +137,7 @@ STARPUFFT(cuda_twiddle_2d)(_cuComplex * out, const _cuComplex * roots0, const _c
|
|
}
|
|
}
|
|
|
|
|
|
extern "C" void
|
|
extern "C" void
|
|
-STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j, cudaStream_t stream)
|
|
|
|
|
|
+STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j)
|
|
{
|
|
{
|
|
- DISTRIB_2d(n2, m2, STARPUFFT(cuda_twiddle_2d), (out, roots0, roots1, n2, m2, i, j), stream);
|
|
|
|
|
|
+ DISTRIB_2d(n2, m2, STARPUFFT(cuda_twiddle_2d), (out, roots0, roots1, n2, m2, i, j));
|
|
}
|
|
}
|