starpu_audio_processing.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2012-2013 Inria
  4. * Copyright (C) 2009-2015,2017 Université de Bordeaux
  5. * Copyright (C) 2010 Mehdi Juhoor
  6. * Copyright (C) 2010-2013,2015-2017 CNRS
  7. *
  8. * StarPU is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU Lesser General Public License as published by
  10. * the Free Software Foundation; either version 2.1 of the License, or (at
  11. * your option) any later version.
  12. *
  13. * StarPU is distributed in the hope that it will be useful, but
  14. * WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  16. *
  17. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  18. */
  19. /*
  20. * This reads a wave file, splits it into chunks, and on each of them run a
  21. * task which performs an fft, drop some high and low frequencies, and performs
  22. * the inverse fft. It then writes the output to a wave file.
  23. */
  24. #include <stdio.h>
  25. #include <stdlib.h>
  26. #include <math.h>
  27. #include <string.h>
  28. #include <sys/types.h>
  29. #include <starpu.h>
  30. #include <fftw3.h>
  31. #ifdef STARPU_USE_CUDA
  32. #include <cufft.h>
  33. #include <starpu_cublas_v2.h>
  34. #endif
  35. /* #define SAVE_RAW 1 */
  36. #define DEFAULTINPUTFILE "input.wav"
  37. #define DEFAULTOUTPUTFILE "output.wav"
  38. #define NSAMPLES (256*1024)
  39. #define SAMPLERATE 44100
  40. static unsigned nsamples = NSAMPLES;
  41. /* This is a band filter, we want to stop everything that is not between LOWFREQ and HIGHFREQ*/
  42. /* LOWFREQ < i * SAMPLERATE / NSAMPLE */
  43. #define LOWFREQ 500U
  44. #define HIFREQ 800U
  45. static const size_t headersize = 37+9;
  46. static FILE *infile, *outfile;
  47. static FILE *infile_raw, *outfile_raw;
  48. static char *inputfilename = DEFAULTINPUTFILE;
  49. static char *outputfilename = DEFAULTOUTPUTFILE;
  50. static unsigned use_pin = 0;
  51. unsigned length_data;
  52. /* buffer containing input WAV data */
  53. float *A;
  54. starpu_data_handle_t A_handle;
  55. /* For performance evaluation */
  56. static double start;
  57. static double end;
  58. static unsigned task_per_worker[STARPU_NMAXWORKERS] = {0};
  59. /*
  60. * Functions to Manipulate WAV files
  61. */
  62. unsigned get_wav_data_bytes_length(FILE *file)
  63. {
  64. /* this is clearly suboptimal !! */
  65. fseek(file, headersize, SEEK_SET);
  66. unsigned cnt = 0;
  67. while (fgetc(file) != EOF)
  68. cnt++;
  69. return cnt;
  70. }
  71. void copy_wav_header(FILE *srcfile, FILE *dstfile)
  72. {
  73. unsigned char buffer[128];
  74. fseek(srcfile, 0, SEEK_SET);
  75. fseek(dstfile, 0, SEEK_SET);
  76. fread(buffer, 1, headersize, infile);
  77. fwrite(buffer, 1, headersize, outfile);
  78. }
  79. void read_16bit_wav(FILE *infile, unsigned size, float *arrayout, FILE *save_file)
  80. {
  81. int v;
  82. #if SAVE_RAW
  83. unsigned currentpos = 0;
  84. #endif
  85. /* we skip the header to only keep the data */
  86. fseek(infile, headersize, SEEK_SET);
  87. for (v=0;v<size;v++)
  88. {
  89. signed char val = (signed char)fgetc(infile);
  90. signed char val2 = (signed char)fgetc(infile);
  91. arrayout[v] = 256*val2 + val;
  92. #if SAVE_RAW
  93. fprintf(save_file, "%u %f\n", currentpos++, arrayout[v]);
  94. #endif
  95. }
  96. }
  97. /* we only write the data, not the header !*/
  98. void write_16bit_wav(FILE *outfile, unsigned size, float *arrayin, FILE *save_file)
  99. {
  100. int v;
  101. #if SAVE_RAW
  102. unsigned currentpos = 0;
  103. #endif
  104. /* we assume that the header is copied using copy_wav_header */
  105. fseek(outfile, headersize, SEEK_SET);
  106. for (v=0;v<size;v++)
  107. {
  108. signed char val = ((int)arrayin[v]) % 256;
  109. signed char val2 = ((int)arrayin[v]) / 256;
  110. fputc(val, outfile);
  111. fputc(val2, outfile);
  112. #if SAVE_RAW
  113. if (save_file)
  114. fprintf(save_file, "%u %f\n", currentpos++, arrayin[v]);
  115. #endif
  116. }
  117. }
  118. /*
  119. *
  120. * The actual kernels
  121. *
  122. */
  123. /* we don't reinitialize the CUFFT plan for every kernel, so we "cache" it */
  124. typedef struct
  125. {
  126. unsigned is_initialized;
  127. #ifdef STARPU_USE_CUDA
  128. cufftHandle plan;
  129. cufftHandle inv_plan;
  130. cufftComplex *localout;
  131. #endif
  132. fftwf_complex *localout_cpu;
  133. float *Acopy;
  134. fftwf_plan plan_cpu;
  135. fftwf_plan inv_plan_cpu;
  136. } fft_plan_cache;
  137. static fft_plan_cache plans[STARPU_NMAXWORKERS];
  138. #ifdef STARPU_USE_CUDA
  139. static void band_filter_kernel_gpu(void *descr[], void *arg)
  140. {
  141. cufftResult cures;
  142. float *localA = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  143. cufftComplex *localout;
  144. int workerid = starpu_worker_get_id();
  145. /* initialize the plane only during the first iteration */
  146. if (!plans[workerid].is_initialized)
  147. {
  148. cures = cufftPlan1d(&plans[workerid].plan, nsamples, CUFFT_R2C, 1);
  149. STARPU_ASSERT(cures == CUFFT_SUCCESS);
  150. cufftSetStream(plans[workerid].plan, starpu_cuda_get_local_stream());
  151. cures = cufftPlan1d(&plans[workerid].inv_plan, nsamples, CUFFT_C2R, 1);
  152. STARPU_ASSERT(cures == CUFFT_SUCCESS);
  153. cufftSetStream(plans[workerid].inv_plan, starpu_cuda_get_local_stream());
  154. cudaMalloc((void **)&plans[workerid].localout,
  155. nsamples*sizeof(cufftComplex));
  156. STARPU_ASSERT(plans[workerid].localout);
  157. plans[workerid].is_initialized = 1;
  158. }
  159. localout = plans[workerid].localout;
  160. /* FFT */
  161. cures = cufftExecR2C(plans[workerid].plan, localA, localout);
  162. STARPU_ASSERT(cures == CUFFT_SUCCESS);
  163. /* filter low freqs */
  164. unsigned lowfreq_index = (LOWFREQ*nsamples)/SAMPLERATE;
  165. cudaMemsetAsync(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex), starpu_cuda_get_local_stream());
  166. /* filter high freqs */
  167. unsigned hifreq_index = (HIFREQ*nsamples)/SAMPLERATE;
  168. cudaMemsetAsync(&localout[hifreq_index], nsamples/2, (nsamples/2 - hifreq_index)*sizeof(fftwf_complex), starpu_cuda_get_local_stream());
  169. /* inverse FFT */
  170. cures = cufftExecC2R(plans[workerid].inv_plan, localout, localA);
  171. STARPU_ASSERT(cures == CUFFT_SUCCESS);
  172. /* FFTW does not normalize its output ! */
  173. float scal = 1.0f/nsamples;
  174. cublasStatus_t status = cublasSscal (starpu_cublas_get_local_handle(), nsamples, &scal, localA, 1);
  175. if (status != CUBLAS_STATUS_SUCCESS)
  176. STARPU_CUBLAS_REPORT_ERROR(status);
  177. }
  178. #endif
  179. static starpu_pthread_mutex_t fftw_mutex = PTHREAD_MUTEX_INITIALIZER;
  180. static void band_filter_kernel_cpu(void *descr[], void *arg)
  181. {
  182. float *localA = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  183. int workerid = starpu_worker_get_id();
  184. /* initialize the plane only during the first iteration */
  185. if (!plans[workerid].is_initialized)
  186. {
  187. plans[workerid].localout_cpu = malloc(nsamples*sizeof(fftwf_complex));
  188. plans[workerid].Acopy = malloc(nsamples*sizeof(float));
  189. /* create plans, only "fftwf_execute" is thread safe in FFTW ... */
  190. STARPU_PTHREAD_MUTEX_LOCK(&fftw_mutex);
  191. plans[workerid].plan_cpu = fftwf_plan_dft_r2c_1d(nsamples,
  192. plans[workerid].Acopy,
  193. plans[workerid].localout_cpu,
  194. FFTW_ESTIMATE);
  195. plans[workerid].inv_plan_cpu = fftwf_plan_dft_c2r_1d(nsamples,
  196. plans[workerid].localout_cpu,
  197. plans[workerid].Acopy,
  198. FFTW_ESTIMATE);
  199. STARPU_PTHREAD_MUTEX_UNLOCK(&fftw_mutex);
  200. plans[workerid].is_initialized = 1;
  201. }
  202. fftwf_complex *localout = plans[workerid].localout_cpu;
  203. /* copy data into the temporary buffer */
  204. memcpy(plans[workerid].Acopy, localA, nsamples*sizeof(float));
  205. /* FFT */
  206. fftwf_execute(plans[workerid].plan_cpu);
  207. /* filter low freqs */
  208. unsigned lowfreq_index = (LOWFREQ*nsamples)/SAMPLERATE;
  209. memset(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex));
  210. /* filter high freqs */
  211. unsigned hifreq_index = (HIFREQ*nsamples)/SAMPLERATE;
  212. memset(&localout[hifreq_index], nsamples/2, (nsamples/2 - hifreq_index)*sizeof(fftwf_complex));
  213. /* inverse FFT */
  214. fftwf_execute(plans[workerid].inv_plan_cpu);
  215. /* copy data into the temporary buffer */
  216. memcpy(localA, plans[workerid].Acopy, nsamples*sizeof(float));
  217. /* FFTW does not normalize its output ! */
  218. /* TODO use BLAS ?*/
  219. int i;
  220. for (i = 0; i < nsamples; i++)
  221. localA[i] /= nsamples;
  222. }
  223. struct starpu_perfmodel band_filter_model =
  224. {
  225. .type = STARPU_HISTORY_BASED,
  226. .symbol = "FFT_band_filter"
  227. };
  228. static struct starpu_codelet band_filter_cl =
  229. {
  230. .modes = { STARPU_RW },
  231. #ifdef STARPU_USE_CUDA
  232. .cuda_funcs = {band_filter_kernel_gpu},
  233. .cuda_flags = {STARPU_CUDA_ASYNC},
  234. #endif
  235. .cpu_funcs = {band_filter_kernel_cpu},
  236. .model = &band_filter_model,
  237. .nbuffers = 1
  238. };
  239. void callback(void *arg)
  240. {
  241. /* do some accounting */
  242. int id = starpu_worker_get_id();
  243. task_per_worker[id]++;
  244. }
  245. void create_starpu_task(unsigned iter)
  246. {
  247. int ret;
  248. struct starpu_task *task = starpu_task_create();
  249. task->cl = &band_filter_cl;
  250. task->handles[0] = starpu_data_get_sub_data(A_handle, 1, iter);
  251. task->callback_func = callback;
  252. task->callback_arg = NULL;
  253. ret = starpu_task_submit(task);
  254. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  255. }
  256. static void init_problem(void)
  257. {
  258. infile = fopen(inputfilename, "r");
  259. if (outputfilename)
  260. outfile = fopen(outputfilename, "w+");
  261. #if SAVE_RAW
  262. infile_raw = fopen("input.raw", "w");
  263. outfile_raw = fopen("output.raw", "w");
  264. #endif
  265. /* copy input's header into output WAV */
  266. if (outputfilename)
  267. copy_wav_header(infile, outfile);
  268. /* read length of input WAV's data */
  269. /* each element is 2 bytes long (16bits)*/
  270. length_data = get_wav_data_bytes_length(infile)/2;
  271. while (nsamples > length_data)
  272. nsamples /= 2;
  273. /* allocate a buffer to store the content of input file */
  274. if (use_pin)
  275. {
  276. starpu_malloc((void **)&A, length_data*sizeof(float));
  277. }
  278. else
  279. {
  280. A = malloc(length_data*sizeof(float));
  281. }
  282. /* allocate working buffer (this could be done online, but we'll keep it simple) */
  283. /* starpu_data_malloc_pinned_if_possible((void **)&outdata, length_data*sizeof(fftwf_complex)); */
  284. /* read input data into buffer "A" */
  285. read_16bit_wav(infile, length_data, A, infile_raw);
  286. }
  287. static void parse_args(int argc, char **argv)
  288. {
  289. int i;
  290. for (i = 1; i < argc; i++)
  291. {
  292. if (strcmp(argv[i], "-h") == 0)
  293. {
  294. fprintf(stderr, "Usage: %s [-pin] [-nsamples block_size] [-i input.wav] [-o output.wav | -no-output] [-h]\n", argv[0]);
  295. exit(-1);
  296. }
  297. if (strcmp(argv[i], "-i") == 0)
  298. {
  299. inputfilename = argv[++i];
  300. }
  301. if (strcmp(argv[i], "-o") == 0)
  302. {
  303. outputfilename = argv[++i];
  304. }
  305. if (strcmp(argv[i], "-no-output") == 0)
  306. {
  307. outputfilename = NULL;
  308. }
  309. /* block size */
  310. if (strcmp(argv[i], "-nsamples") == 0)
  311. {
  312. char *argptr;
  313. nsamples = strtol(argv[++i], &argptr, 10);
  314. }
  315. if (strcmp(argv[i], "-pin") == 0)
  316. {
  317. use_pin = 1;
  318. }
  319. }
  320. }
  321. int main(int argc, char **argv)
  322. {
  323. unsigned iter;
  324. int ret;
  325. parse_args(argc, argv);
  326. fprintf(stderr, "Reading input data\n");
  327. init_problem();
  328. unsigned niter = length_data/nsamples;
  329. fprintf(stderr, "input: %s\noutput: %s\n#chunks %u\n", inputfilename, outputfilename, niter);
  330. /* launch StarPU */
  331. ret = starpu_init(NULL);
  332. if (ret == -ENODEV)
  333. return 77;
  334. STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
  335. starpu_cublas_init();
  336. starpu_vector_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A, niter*nsamples, sizeof(float));
  337. struct starpu_data_filter f =
  338. {
  339. .filter_func = starpu_vector_filter_block,
  340. .nchildren = niter
  341. };
  342. starpu_data_partition(A_handle, &f);
  343. for (iter = 0; iter < niter; iter++)
  344. starpu_data_set_wt_mask(starpu_data_get_sub_data(A_handle, 1, iter), 1<<0);
  345. start = starpu_timing_now();
  346. for (iter = 0; iter < niter; iter++)
  347. {
  348. create_starpu_task(iter);
  349. }
  350. starpu_task_wait_for_all();
  351. end = starpu_timing_now();
  352. double timing = end - start;
  353. fprintf(stderr, "Computation took %2.2f ms\n", timing/1000);
  354. int worker;
  355. for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
  356. {
  357. if (task_per_worker[worker])
  358. {
  359. char name[32];
  360. starpu_worker_get_name(worker, name, 32);
  361. unsigned long bytes = nsamples*sizeof(float)*task_per_worker[worker];
  362. fprintf(stderr, "\t%s -> %2.2f MB\t%2.2f\tMB/s\t%2.2f %%\n", name, (1.0*bytes)/(1024*1024), bytes/timing, (100.0*task_per_worker[worker])/niter);
  363. }
  364. }
  365. if (outputfilename)
  366. fprintf(stderr, "Writing output data\n");
  367. /* make sure that the output is in RAM before quitting StarPU */
  368. starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
  369. starpu_data_unregister(A_handle);
  370. starpu_cublas_shutdown();
  371. /* we are done ! */
  372. starpu_shutdown();
  373. fclose(infile);
  374. if (outputfilename)
  375. {
  376. write_16bit_wav(outfile, length_data, A, outfile_raw);
  377. fclose(outfile);
  378. }
  379. #if SAVE_RAW
  380. fclose(infile_raw);
  381. fclose(outfile_raw);
  382. #endif
  383. return 0;
  384. }