starpu_audio_processing.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010-2011 Université de Bordeaux 1
  4. * Copyright (C) 2010 Mehdi Juhoor <mjuhoor@gmail.com>
  5. * Copyright (C) 2010, 2011, 2012 Centre National de la Recherche Scientifique
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. #include <stdio.h>
  19. #include <stdlib.h>
  20. #include <math.h>
  21. #include <string.h>
  22. #include <pthread.h>
  23. #include <sys/types.h>
  24. #include <sys/time.h>
  25. #include <starpu.h>
  26. #include <fftw3.h>
  27. #ifdef STARPU_USE_CUDA
  28. #include <cufft.h>
  29. #endif
  30. /* #define SAVE_RAW 1 */
  31. #define DEFAULTINPUTFILE "input.wav"
  32. #define DEFAULTOUTPUTFILE "output.wav"
  33. #define NSAMPLES (256*1024)
  34. #define SAMPLERATE 44100
  35. static unsigned nsamples = NSAMPLES;
  36. /* This is a band filter, we want to stop everything that is not between LOWFREQ and HIGHFREQ*/
  37. /* LOWFREQ < i * SAMPLERATE / NSAMPLE */
  38. #define LOWFREQ 500U
  39. #define HIFREQ 800U
  40. static const size_t headersize = 37+9;
  41. static FILE *infile, *outfile;
  42. static FILE *infile_raw, *outfile_raw;
  43. static char *inputfilename = DEFAULTINPUTFILE;
  44. static char *outputfilename = DEFAULTOUTPUTFILE;
  45. static unsigned use_pin = 0;
  46. unsigned length_data;
  47. /* buffer containing input WAV data */
  48. float *A;
  49. starpu_data_handle_t A_handle;
  50. /* For performance evaluation */
  51. static struct timeval start;
  52. static struct timeval end;
  53. static unsigned task_per_worker[STARPU_NMAXWORKERS] = {0};
  54. /*
  55. * Functions to Manipulate WAV files
  56. */
  57. unsigned get_wav_data_bytes_length(FILE *file)
  58. {
  59. /* this is clearly suboptimal !! */
  60. fseek(file, headersize, SEEK_SET);
  61. unsigned cnt = 0;
  62. while (fgetc(file) != EOF)
  63. cnt++;
  64. return cnt;
  65. }
  66. void copy_wav_header(FILE *srcfile, FILE *dstfile)
  67. {
  68. unsigned char buffer[128];
  69. fseek(srcfile, 0, SEEK_SET);
  70. fseek(dstfile, 0, SEEK_SET);
  71. fread(buffer, 1, headersize, infile);
  72. fwrite(buffer, 1, headersize, outfile);
  73. }
  74. void read_16bit_wav(FILE *infile, unsigned size, float *arrayout, FILE *save_file)
  75. {
  76. int v;
  77. #if SAVE_RAW
  78. unsigned currentpos = 0;
  79. #endif
  80. /* we skip the header to only keep the data */
  81. fseek(infile, headersize, SEEK_SET);
  82. for (v=0;v<size;v++)
  83. {
  84. signed char val = (signed char)fgetc(infile);
  85. signed char val2 = (signed char)fgetc(infile);
  86. arrayout[v] = 256*val2 + val;
  87. #if SAVE_RAW
  88. fprintf(save_file, "%d %f\n", currentpos++, arrayout[v]);
  89. #endif
  90. }
  91. }
  92. /* we only write the data, not the header !*/
  93. void write_16bit_wav(FILE *outfile, unsigned size, float *arrayin, FILE *save_file)
  94. {
  95. int v;
  96. #if SAVE_RAW
  97. unsigned currentpos = 0;
  98. #endif
  99. /* we assume that the header is copied using copy_wav_header */
  100. fseek(outfile, headersize, SEEK_SET);
  101. for (v=0;v<size;v++)
  102. {
  103. signed char val = ((int)arrayin[v]) % 256;
  104. signed char val2 = ((int)arrayin[v]) / 256;
  105. fputc(val, outfile);
  106. fputc(val2, outfile);
  107. #if SAVE_RAW
  108. if (save_file)
  109. fprintf(save_file, "%d %f\n", currentpos++, arrayin[v]);
  110. #endif
  111. }
  112. }
  113. /*
  114. *
  115. * The actual kernels
  116. *
  117. */
  118. /* we don't reinitialize the CUFFT plan for every kernel, so we "cache" it */
  119. typedef struct
  120. {
  121. unsigned is_initialized;
  122. #ifdef STARPU_USE_CUDA
  123. cufftHandle plan;
  124. cufftHandle inv_plan;
  125. cufftComplex *localout;
  126. #endif
  127. fftwf_complex *localout_cpu;
  128. float *Acopy;
  129. fftwf_plan plan_cpu;
  130. fftwf_plan inv_plan_cpu;
  131. } fft_plan_cache;
  132. static fft_plan_cache plans[STARPU_NMAXWORKERS];
  133. #ifdef STARPU_USE_CUDA
  134. static void band_filter_kernel_gpu(void *descr[], __attribute__((unused)) void *arg)
  135. {
  136. cufftResult cures;
  137. float *localA = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  138. cufftComplex *localout;
  139. int workerid = starpu_worker_get_id();
  140. /* initialize the plane only during the first iteration */
  141. if (!plans[workerid].is_initialized)
  142. {
  143. cures = cufftPlan1d(&plans[workerid].plan, nsamples, CUFFT_R2C, 1);
  144. STARPU_ASSERT(cures == CUFFT_SUCCESS);
  145. cures = cufftPlan1d(&plans[workerid].inv_plan, nsamples, CUFFT_C2R, 1);
  146. STARPU_ASSERT(cures == CUFFT_SUCCESS);
  147. cudaMalloc((void **)&plans[workerid].localout,
  148. nsamples*sizeof(cufftComplex));
  149. STARPU_ASSERT(plans[workerid].localout);
  150. plans[workerid].is_initialized = 1;
  151. }
  152. localout = plans[workerid].localout;
  153. /* FFT */
  154. cures = cufftExecR2C(plans[workerid].plan, localA, localout);
  155. STARPU_ASSERT(cures == CUFFT_SUCCESS);
  156. /* filter low freqs */
  157. unsigned lowfreq_index = (LOWFREQ*nsamples)/SAMPLERATE;
  158. cudaMemset(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex));
  159. /* filter high freqs */
  160. unsigned hifreq_index = (HIFREQ*nsamples)/SAMPLERATE;
  161. cudaMemset(&localout[hifreq_index], nsamples/2, (nsamples/2 - hifreq_index)*sizeof(fftwf_complex));
  162. /* inverse FFT */
  163. cures = cufftExecC2R(plans[workerid].inv_plan, localout, localA);
  164. STARPU_ASSERT(cures == CUFFT_SUCCESS);
  165. /* FFTW does not normalize its output ! */
  166. cublasSscal (nsamples, 1.0f/nsamples, localA, 1);
  167. }
  168. #endif
  169. static pthread_mutex_t fftw_mutex = PTHREAD_MUTEX_INITIALIZER;
  170. static void band_filter_kernel_cpu(void *descr[], __attribute__((unused)) void *arg)
  171. {
  172. float *localA = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  173. int workerid = starpu_worker_get_id();
  174. /* initialize the plane only during the first iteration */
  175. if (!plans[workerid].is_initialized)
  176. {
  177. plans[workerid].localout_cpu = malloc(nsamples*sizeof(fftwf_complex));
  178. plans[workerid].Acopy = malloc(nsamples*sizeof(float));
  179. /* create plans, only "fftwf_execute" is thread safe in FFTW ... */
  180. pthread_mutex_lock(&fftw_mutex);
  181. plans[workerid].plan_cpu = fftwf_plan_dft_r2c_1d(nsamples,
  182. plans[workerid].Acopy,
  183. plans[workerid].localout_cpu,
  184. FFTW_ESTIMATE);
  185. plans[workerid].inv_plan_cpu = fftwf_plan_dft_c2r_1d(nsamples,
  186. plans[workerid].localout_cpu,
  187. plans[workerid].Acopy,
  188. FFTW_ESTIMATE);
  189. pthread_mutex_unlock(&fftw_mutex);
  190. plans[workerid].is_initialized = 1;
  191. }
  192. fftwf_complex *localout = plans[workerid].localout_cpu;
  193. /* copy data into the temporary buffer */
  194. memcpy(plans[workerid].Acopy, localA, nsamples*sizeof(float));
  195. /* FFT */
  196. fftwf_execute(plans[workerid].plan_cpu);
  197. /* filter low freqs */
  198. unsigned lowfreq_index = (LOWFREQ*nsamples)/SAMPLERATE;
  199. memset(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex));
  200. /* filter high freqs */
  201. unsigned hifreq_index = (HIFREQ*nsamples)/SAMPLERATE;
  202. memset(&localout[hifreq_index], nsamples/2, (nsamples/2 - hifreq_index)*sizeof(fftwf_complex));
  203. /* inverse FFT */
  204. fftwf_execute(plans[workerid].inv_plan_cpu);
  205. /* copy data into the temporary buffer */
  206. memcpy(localA, plans[workerid].Acopy, nsamples*sizeof(float));
  207. /* FFTW does not normalize its output ! */
  208. /* TODO use BLAS ?*/
  209. int i;
  210. for (i = 0; i < nsamples; i++)
  211. localA[i] /= nsamples;
  212. }
  213. struct starpu_perfmodel band_filter_model =
  214. {
  215. .type = STARPU_HISTORY_BASED,
  216. .symbol = "FFT_band_filter"
  217. };
  218. static struct starpu_codelet band_filter_cl =
  219. {
  220. .modes = { STARPU_RW },
  221. .where = STARPU_CPU|STARPU_CUDA,
  222. #ifdef STARPU_USE_CUDA
  223. .cuda_funcs = {band_filter_kernel_gpu, NULL},
  224. #endif
  225. .cpu_funcs = {band_filter_kernel_cpu, NULL},
  226. .model = &band_filter_model,
  227. .nbuffers = 1
  228. };
  229. void callback(void *arg)
  230. {
  231. /* do some accounting */
  232. int id = starpu_worker_get_id();
  233. task_per_worker[id]++;
  234. }
  235. void create_starpu_task(unsigned iter)
  236. {
  237. int ret;
  238. struct starpu_task *task = starpu_task_create();
  239. task->cl = &band_filter_cl;
  240. task->handles[0] = starpu_data_get_sub_data(A_handle, 1, iter);
  241. task->callback_func = callback;
  242. task->callback_arg = NULL;
  243. ret = starpu_task_submit(task);
  244. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  245. }
  246. static void init_problem(void)
  247. {
  248. infile = fopen(inputfilename, "r");
  249. if (outputfilename)
  250. outfile = fopen(outputfilename, "w+");
  251. #if SAVE_RAW
  252. infile_raw = fopen("input.raw", "w");
  253. outfile_raw = fopen("output.raw", "w");
  254. #endif
  255. /* copy input's header into output WAV */
  256. if (outputfilename)
  257. copy_wav_header(infile, outfile);
  258. /* read length of input WAV's data */
  259. /* each element is 2 bytes long (16bits)*/
  260. length_data = get_wav_data_bytes_length(infile)/2;
  261. /* allocate a buffer to store the content of input file */
  262. if (use_pin)
  263. {
  264. starpu_malloc((void **)&A, length_data*sizeof(float));
  265. }
  266. else
  267. {
  268. A = malloc(length_data*sizeof(float));
  269. }
  270. /* allocate working buffer (this could be done online, but we'll keep it simple) */
  271. /* starpu_data_malloc_pinned_if_possible((void **)&outdata, length_data*sizeof(fftwf_complex)); */
  272. /* read input data into buffer "A" */
  273. read_16bit_wav(infile, length_data, A, infile_raw);
  274. }
  275. static void parse_args(int argc, char **argv)
  276. {
  277. int i;
  278. for (i = 1; i < argc; i++)
  279. {
  280. if (strcmp(argv[i], "-h") == 0)
  281. {
  282. fprintf(stderr, "Usage: %s [-pin] [-nsamples block_size] [-i input.wav] [-o output.wav | -no-output] [-h]\n", argv[0]);
  283. exit(-1);
  284. }
  285. if (strcmp(argv[i], "-i") == 0)
  286. {
  287. inputfilename = argv[++i];;
  288. }
  289. if (strcmp(argv[i], "-o") == 0)
  290. {
  291. outputfilename = argv[++i];;
  292. }
  293. if (strcmp(argv[i], "-no-output") == 0)
  294. {
  295. outputfilename = NULL;;
  296. }
  297. /* block size */
  298. if (strcmp(argv[i], "-nsamples") == 0)
  299. {
  300. char *argptr;
  301. nsamples = strtol(argv[++i], &argptr, 10);
  302. }
  303. if (strcmp(argv[i], "-pin") == 0)
  304. {
  305. use_pin = 1;
  306. }
  307. }
  308. }
  309. int main(int argc, char **argv)
  310. {
  311. unsigned iter;
  312. int ret;
  313. parse_args(argc, argv);
  314. fprintf(stderr, "Reading input data\n");
  315. init_problem();
  316. unsigned niter = length_data/nsamples;
  317. fprintf(stderr, "input: %s\noutput: %s\n#chunks %d\n", inputfilename, outputfilename, niter);
  318. /* launch StarPU */
  319. ret = starpu_init(NULL);
  320. STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
  321. starpu_vector_data_register(&A_handle, 0, (uintptr_t)A, niter*nsamples, sizeof(float));
  322. struct starpu_data_filter f =
  323. {
  324. .filter_func = starpu_block_filter_func_vector,
  325. .nchildren = niter
  326. };
  327. starpu_data_partition(A_handle, &f);
  328. for (iter = 0; iter < niter; iter++)
  329. starpu_data_set_wt_mask(starpu_data_get_sub_data(A_handle, 1, iter), 1<<0);
  330. gettimeofday(&start, NULL);
  331. for (iter = 0; iter < niter; iter++)
  332. {
  333. create_starpu_task(iter);
  334. }
  335. starpu_task_wait_for_all();
  336. gettimeofday(&end, NULL);
  337. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  338. fprintf(stderr, "Computation took %2.2f ms\n", timing/1000);
  339. int worker;
  340. for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
  341. {
  342. if (task_per_worker[worker])
  343. {
  344. char name[32];
  345. starpu_worker_get_name(worker, name, 32);
  346. unsigned long bytes = nsamples*sizeof(float)*task_per_worker[worker];
  347. fprintf(stderr, "\t%s -> %2.2f MB\t%2.2f\tMB/s\t%2.2f %%\n", name, (1.0*bytes)/(1024*1024), bytes/timing, (100.0*task_per_worker[worker])/niter);
  348. }
  349. }
  350. if (outputfilename)
  351. fprintf(stderr, "Writing output data\n");
  352. /* make sure that the output is in RAM before quitting StarPU */
  353. starpu_data_unpartition(A_handle, 0);
  354. starpu_data_unregister(A_handle);
  355. /* we are done ! */
  356. starpu_shutdown();
  357. fclose(infile);
  358. if (outputfilename)
  359. {
  360. write_16bit_wav(outfile, length_data, A, outfile_raw);
  361. fclose(outfile);
  362. }
  363. #if SAVE_RAW
  364. fclose(infile_raw);
  365. fclose(outfile_raw);
  366. #endif
  367. return 0;
  368. }