/* StarPU --- Runtime system for heterogeneous multicore architectures. * * Copyright (C) 2010-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria * * StarPU is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or (at * your option) any later version. * * StarPU is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License in COPYING.LGPL for more details. */ /* * * Sequential version * */ #ifdef __STARPU_USE_CUDA /* Perform one fft of size n,m */ static void STARPUFFT(fft_3d_plan_gpu)(void *args) { STARPUFFT(plan) plan = args; cufftResult cures; int n = plan->n[0]; int m = plan->n[1]; int p = plan->n[2]; int workerid = starpu_worker_get_id_check(); cures = cufftPlan3d(&plan->plans[workerid].plan_cuda, n, m, p, _CUFFT_C2C); if (cures != CUFFT_SUCCESS) STARPU_CUFFT_REPORT_ERROR(cures); cufftSetStream(plan->plans[workerid].plan_cuda, starpu_cuda_get_local_stream()); if (cures != CUFFT_SUCCESS) STARPU_CUFFT_REPORT_ERROR(cures); } static void STARPUFFT(fft_3d_kernel_gpu)(void *descr[], void *args) { STARPUFFT(plan) plan = args; cufftResult cures; _cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]); _cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]); int workerid = starpu_worker_get_id_check(); task_per_worker[workerid]++; cures = _cufftExecC2C(plan->plans[workerid].plan_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE); if (cures != CUFFT_SUCCESS) STARPU_CUFFT_REPORT_ERROR(cures); cudaStreamSynchronize(starpu_cuda_get_local_stream()); } #endif #ifdef STARPU_HAVE_FFTW /* Perform one fft of size n,m */ static void STARPUFFT(fft_3d_kernel_cpu)(void *descr[], void *_args) { STARPUFFT(plan) plan = _args; int workerid = starpu_worker_get_id_check(); task_per_worker[workerid]++; STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]); STARPUFFT(complex) * restrict out = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]); _FFTW(execute_dft)(plan->plans[workerid].plan_cpu, in, out); } #endif static struct starpu_perfmodel STARPUFFT(fft_3d_model) = { .type = STARPU_HISTORY_BASED, .symbol = TYPE"fft_3d" }; static struct starpu_codelet STARPUFFT(fft_3d_codelet) = { .where = #ifdef __STARPU_USE_CUDA STARPU_CUDA| #endif #ifdef STARPU_HAVE_FFTW STARPU_CPU| #endif 0, #ifdef __STARPU_USE_CUDA .cuda_funcs = {STARPUFFT(fft_3d_kernel_gpu)}, #endif #ifdef STARPU_HAVE_FFTW .cpu_funcs = {STARPUFFT(fft_3d_kernel_cpu)}, #endif CAN_EXECUTE .model = &STARPUFFT(fft_3d_model), .nbuffers = 2, .modes = {STARPU_R, STARPU_W}, .name = "fft_3d_codelet" }; STARPUFFT(plan) STARPUFFT(plan_dft_3d)(int n, int m, int p, int sign, unsigned flags) { unsigned workerid; if (PARALLEL) { /* TODO */ STARPU_ASSERT(0); } /* TODO: flags? Automatically set FFTW_MEASURE on calibration? */ STARPU_ASSERT(flags == 0); STARPUFFT(plan) plan = malloc(sizeof(*plan)); memset(plan, 0, sizeof(*plan)); plan->dim = 3; plan->n = malloc(plan->dim * sizeof(*plan->n)); plan->n[0] = n; plan->n[1] = m; plan->n[2] = p; plan->totsize = n * m; plan->type = C2C; plan->sign = sign; /* Initialize per-worker working set */ for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) { switch (starpu_worker_get_type(workerid)) { case STARPU_CPU_WORKER: #ifdef STARPU_HAVE_FFTW /* fft plan: one fft of size n, m. */ plan->plans[workerid].plan_cpu = _FFTW(plan_dft_3d)(n, m, p, NULL, (void*) 1, sign, _FFTW_FLAGS); STARPU_ASSERT(plan->plans[workerid].plan_cpu); #else /* #warning libstarpufft can not work correctly if libfftw3 is not installed */ #endif break; case STARPU_CUDA_WORKER: break; default: /* Do not care, we won't be executing anything there. */ break; } } #ifdef __STARPU_USE_CUDA starpu_execute_on_each_worker(STARPUFFT(fft_3d_plan_gpu), plan, STARPU_CUDA); #endif return plan; } /* Actually submit all the tasks. */ static struct starpu_task * STARPUFFT(start3dC2C)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out) { STARPU_ASSERT(plan->type == C2C); int ret; if (PARALLEL) { /* TODO */ STARPU_ASSERT(0); } else /* !PARALLEL */ { struct starpu_task *task; /* Create FFT task */ task = starpu_task_create(); task->detach = 0; task->cl = &STARPUFFT(fft_3d_codelet); task->handles[0] = in; task->handles[1] = out; task->cl_arg = plan; ret = starpu_task_submit(task); if (ret == -ENODEV) return NULL; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); return task; } }