starpu-audio-processing.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458
  1. /*
  2. * StarPU
  3. * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include <stdio.h>
  17. #include <stdlib.h>
  18. #include <math.h>
  19. #include <string.h>
  20. #include <pthread.h>
  21. #include <sys/types.h>
  22. #include <sys/time.h>
  23. #include <starpu.h>
  24. #include <fftw3.h>
  25. #ifdef USE_CUDA
  26. #include <cufft.h>
  27. #endif
  28. //#define SAVE_RAW 1
  29. #define DEFAULTINPUTFILE "input.wav"
  30. #define DEFAULTOUTPUTFILE "output.wav"
  31. #define NSAMPLES (256*1024)
  32. #define SAMPLERATE 44100
  33. static unsigned nsamples = NSAMPLES;
  34. /* This is a band filter, we want to stop everything that is not between LOWFREQ and HIGHFREQ*/
  35. /* LOWFREQ < i * SAMPLERATE / NSAMPLE */
  36. #define LOWFREQ 500U
  37. #define HIFREQ 800U
  38. static const size_t headersize = 37+9;
  39. static FILE *infile, *outfile;
  40. static FILE *infile_raw, *outfile_raw;
  41. static char *inputfilename = DEFAULTINPUTFILE;
  42. static char *outputfilename = DEFAULTOUTPUTFILE;
  43. static unsigned use_pin = 0;
  44. unsigned length_data;
  45. /* buffer containing input WAV data */
  46. float *A;
  47. starpu_data_handle A_handle;
  48. /* For performance evaluation */
  49. static struct timeval start;
  50. static struct timeval end;
  51. static unsigned task_per_worker[STARPU_NMAXWORKERS] = {0};
  52. /*
  53. * Functions to Manipulate WAV files
  54. */
  55. unsigned get_wav_data_bytes_length(FILE *file)
  56. {
  57. /* this is clearly suboptimal !! */
  58. fseek(file, headersize, SEEK_SET);
  59. unsigned cnt = 0;
  60. while (fgetc(file) != EOF)
  61. cnt++;
  62. return cnt;
  63. }
  64. void copy_wav_header(FILE *srcfile, FILE *dstfile)
  65. {
  66. unsigned char buffer[128];
  67. fseek(srcfile, 0, SEEK_SET);
  68. fseek(dstfile, 0, SEEK_SET);
  69. fread(buffer, 1, headersize, infile);
  70. fwrite(buffer, 1, headersize, outfile);
  71. }
  72. void read_16bit_wav(FILE *infile, unsigned size, float *arrayout, FILE *save_file)
  73. {
  74. int v;
  75. #if SAVE_RAW
  76. unsigned currentpos = 0;
  77. #endif
  78. /* we skip the header to only keep the data */
  79. fseek(infile, headersize, SEEK_SET);
  80. for (v=0;v<size;v++) {
  81. signed char val = (signed char)fgetc(infile);
  82. signed char val2 = (signed char)fgetc(infile);
  83. arrayout[v] = 256*val2 + val;
  84. #if SAVE_RAW
  85. fprintf(save_file, "%d %f\n", currentpos++, arrayout[v]);
  86. #endif
  87. }
  88. }
  89. /* we only write the data, not the header !*/
  90. void write_16bit_wav(FILE *outfile, unsigned size, float *arrayin, FILE *save_file)
  91. {
  92. int v;
  93. #if SAVE_RAW
  94. unsigned currentpos = 0;
  95. #endif
  96. /* we assume that the header is copied using copy_wav_header */
  97. fseek(outfile, headersize, SEEK_SET);
  98. for (v=0;v<size;v++) {
  99. signed char val = ((int)arrayin[v]) % 256;
  100. signed char val2 = ((int)arrayin[v]) / 256;
  101. fputc(val, outfile);
  102. fputc(val2, outfile);
  103. #if SAVE_RAW
  104. if (save_file)
  105. fprintf(save_file, "%d %f\n", currentpos++, arrayin[v]);
  106. #endif
  107. }
  108. }
  109. /*
  110. *
  111. * The actual kernels
  112. *
  113. */
  114. /* we don't reinitialize the CUFFT plan for every kernel, so we "cache" it */
  115. typedef struct {
  116. unsigned is_initialized;
  117. #ifdef USE_CUDA
  118. cufftHandle plan;
  119. cufftHandle inv_plan;
  120. cufftComplex *localout;
  121. #endif
  122. fftwf_complex *localout_cpu;
  123. float *Acopy;
  124. fftwf_plan plan_cpu;
  125. fftwf_plan inv_plan_cpu;
  126. } fft_plan_cache;
  127. static fft_plan_cache plans[STARPU_NMAXWORKERS];
  128. #ifdef USE_CUDA
  129. static void band_filter_kernel_gpu(void *descr[], __attribute__((unused)) void *arg)
  130. {
  131. cufftResult cures;
  132. float *localA = (float *)GET_VECTOR_PTR(descr[0]);
  133. cufftComplex *localout;
  134. int workerid = starpu_get_worker_id();
  135. /* initialize the plane only during the first iteration */
  136. if (!plans[workerid].is_initialized)
  137. {
  138. cures = cufftPlan1d(&plans[workerid].plan, nsamples, CUFFT_R2C, 1);
  139. STARPU_ASSERT(cures == CUFFT_SUCCESS);
  140. cures = cufftPlan1d(&plans[workerid].inv_plan, nsamples, CUFFT_C2R, 1);
  141. STARPU_ASSERT(cures == CUFFT_SUCCESS);
  142. cudaMalloc((void **)&plans[workerid].localout,
  143. nsamples*sizeof(cufftComplex));
  144. STARPU_ASSERT(plans[workerid].localout);
  145. plans[workerid].is_initialized = 1;
  146. }
  147. localout = plans[workerid].localout;
  148. /* FFT */
  149. cures = cufftExecR2C(plans[workerid].plan, localA, localout);
  150. STARPU_ASSERT(cures == CUFFT_SUCCESS);
  151. /* filter low freqs */
  152. unsigned lowfreq_index = (LOWFREQ*nsamples)/SAMPLERATE;
  153. cudaMemset(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex));
  154. /* filter high freqs */
  155. unsigned hifreq_index = (HIFREQ*nsamples)/SAMPLERATE;
  156. cudaMemset(&localout[hifreq_index], nsamples/2, (nsamples/2 - hifreq_index)*sizeof(fftwf_complex));
  157. /* inverse FFT */
  158. cures = cufftExecC2R(plans[workerid].inv_plan, localout, localA);
  159. STARPU_ASSERT(cures == CUFFT_SUCCESS);
  160. /* FFTW does not normalize its output ! */
  161. cublasSscal (nsamples, 1.0f/nsamples, localA, 1);
  162. }
  163. #endif
  164. static pthread_mutex_t fftw_mutex = PTHREAD_MUTEX_INITIALIZER;
  165. static void band_filter_kernel_cpu(void *descr[], __attribute__((unused)) void *arg)
  166. {
  167. float *localA = (float *)GET_VECTOR_PTR(descr[0]);
  168. int workerid = starpu_get_worker_id();
  169. /* initialize the plane only during the first iteration */
  170. if (!plans[workerid].is_initialized)
  171. {
  172. plans[workerid].localout_cpu = malloc(nsamples*sizeof(fftwf_complex));
  173. plans[workerid].Acopy = malloc(nsamples*sizeof(float));
  174. /* create plans, only "fftwf_execute" is thread safe in FFTW ... */
  175. pthread_mutex_lock(&fftw_mutex);
  176. plans[workerid].plan_cpu = fftwf_plan_dft_r2c_1d(nsamples,
  177. plans[workerid].Acopy,
  178. plans[workerid].localout_cpu,
  179. FFTW_ESTIMATE);
  180. plans[workerid].inv_plan_cpu = fftwf_plan_dft_c2r_1d(nsamples,
  181. plans[workerid].localout_cpu,
  182. plans[workerid].Acopy,
  183. FFTW_ESTIMATE);
  184. pthread_mutex_unlock(&fftw_mutex);
  185. plans[workerid].is_initialized = 1;
  186. }
  187. fftwf_complex *localout = plans[workerid].localout_cpu;
  188. /* copy data into the temporary buffer */
  189. memcpy(plans[workerid].Acopy, localA, nsamples*sizeof(float));
  190. /* FFT */
  191. fftwf_execute(plans[workerid].plan_cpu);
  192. /* filter low freqs */
  193. unsigned lowfreq_index = (LOWFREQ*nsamples)/SAMPLERATE;
  194. memset(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex));
  195. /* filter high freqs */
  196. unsigned hifreq_index = (HIFREQ*nsamples)/SAMPLERATE;
  197. memset(&localout[hifreq_index], nsamples/2, (nsamples/2 - hifreq_index)*sizeof(fftwf_complex));
  198. /* inverse FFT */
  199. fftwf_execute(plans[workerid].inv_plan_cpu);
  200. /* copy data into the temporary buffer */
  201. memcpy(localA, plans[workerid].Acopy, nsamples*sizeof(float));
  202. /* FFTW does not normalize its output ! */
  203. /* TODO use BLAS ?*/
  204. int i;
  205. for (i = 0; i < nsamples; i++)
  206. localA[i] /= nsamples;
  207. }
  208. struct starpu_perfmodel_t band_filter_model = {
  209. .type = HISTORY_BASED,
  210. .symbol = "FFT_band_filter"
  211. };
  212. static starpu_codelet band_filter_cl = {
  213. .where = CORE|CUDA,
  214. #ifdef USE_CUDA
  215. .cuda_func = band_filter_kernel_gpu,
  216. #endif
  217. .core_func = band_filter_kernel_cpu,
  218. .model = &band_filter_model,
  219. .nbuffers = 1
  220. };
  221. void callback(void *arg)
  222. {
  223. /* do some accounting */
  224. int id = starpu_get_worker_id();
  225. task_per_worker[id]++;
  226. }
  227. void create_starpu_task(unsigned iter)
  228. {
  229. struct starpu_task *task = starpu_task_create();
  230. task->cl = &band_filter_cl;
  231. task->buffers[0].handle = get_sub_data(A_handle, 1, iter);
  232. task->buffers[0].mode = STARPU_RW;
  233. task->callback_func = callback;
  234. task->callback_arg = NULL;
  235. starpu_submit_task(task);
  236. }
  237. static void init_problem(void)
  238. {
  239. infile = fopen(inputfilename, "r");
  240. if (outputfilename)
  241. outfile = fopen(outputfilename, "w+");
  242. #if SAVE_RAW
  243. infile_raw = fopen("input.raw", "w");
  244. outfile_raw = fopen("output.raw", "w");
  245. #endif
  246. /* copy input's header into output WAV */
  247. if (outputfilename)
  248. copy_wav_header(infile, outfile);
  249. /* read length of input WAV's data */
  250. /* each element is 2 bytes long (16bits)*/
  251. length_data = get_wav_data_bytes_length(infile)/2;
  252. /* allocate a buffer to store the content of input file */
  253. if (use_pin)
  254. {
  255. starpu_malloc_pinned_if_possible((void **)&A, length_data*sizeof(float));
  256. }
  257. else {
  258. A = malloc(length_data*sizeof(float));
  259. }
  260. /* allocate working buffer (this could be done online, but we'll keep it simple) */
  261. //starpu_malloc_pinned_if_possible((void **)&outdata, length_data*sizeof(fftwf_complex));
  262. /* read input data into buffer "A" */
  263. read_16bit_wav(infile, length_data, A, infile_raw);
  264. }
  265. static void parse_args(int argc, char **argv)
  266. {
  267. int i;
  268. for (i = 1; i < argc; i++) {
  269. if (strcmp(argv[i], "-h") == 0) {
  270. fprintf(stderr, "Usage: %s [-pin] [-nsamples block_size] [-i input.wav] [-o output.wav | -no-output] [-h]\n", argv[0]);
  271. exit(-1);
  272. }
  273. if (strcmp(argv[i], "-i") == 0) {
  274. inputfilename = argv[++i];;
  275. }
  276. if (strcmp(argv[i], "-o") == 0) {
  277. outputfilename = argv[++i];;
  278. }
  279. if (strcmp(argv[i], "-no-output") == 0) {
  280. outputfilename = NULL;;
  281. }
  282. /* block size */
  283. if (strcmp(argv[i], "-nsamples") == 0) {
  284. char *argptr;
  285. nsamples = strtol(argv[++i], &argptr, 10);
  286. }
  287. if (strcmp(argv[i], "-pin") == 0) {
  288. use_pin = 1;
  289. }
  290. }
  291. }
  292. int main(int argc, char **argv)
  293. {
  294. unsigned iter;
  295. parse_args(argc, argv);
  296. fprintf(stderr, "Reading input data\n");
  297. init_problem();
  298. unsigned niter = length_data/nsamples;
  299. fprintf(stderr, "input: %s\noutput: %s\n#chunks %d\n", inputfilename, outputfilename, niter);
  300. /* launch StarPU */
  301. starpu_init(NULL);
  302. starpu_register_vector_data(&A_handle, 0, (uintptr_t)A, niter*nsamples, sizeof(float));
  303. starpu_filter f =
  304. {
  305. .filter_func = starpu_block_filter_func_vector,
  306. .filter_arg = niter
  307. };
  308. starpu_partition_data(A_handle, &f);
  309. for (iter = 0; iter < niter; iter++)
  310. starpu_data_set_wb_mask(get_sub_data(A_handle, 1, iter), 1<<0);
  311. gettimeofday(&start, NULL);
  312. for (iter = 0; iter < niter; iter++)
  313. {
  314. create_starpu_task(iter);
  315. }
  316. starpu_wait_all_tasks();
  317. gettimeofday(&end, NULL);
  318. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  319. fprintf(stderr, "Computation took %2.2f ms\n", timing/1000);
  320. int worker;
  321. for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
  322. {
  323. if (task_per_worker[worker])
  324. {
  325. char name[32];
  326. starpu_get_worker_name(worker, name, 32);
  327. unsigned long bytes = nsamples*sizeof(float)*task_per_worker[worker];
  328. fprintf(stderr, "\t%s -> %2.2f MB\t%2.2f\tMB/s\t%2.2f %%\n", name, (1.0*bytes)/(1024*1024), bytes/timing, (100.0*task_per_worker[worker])/niter);
  329. }
  330. }
  331. if (outputfilename)
  332. fprintf(stderr, "Writing output data\n");
  333. /* make sure that the output is in RAM before quitting StarPU */
  334. starpu_unpartition_data(A_handle, 0);
  335. starpu_delete_data(A_handle);
  336. /* we are done ! */
  337. starpu_shutdown();
  338. fclose(infile);
  339. if (outputfilename)
  340. {
  341. write_16bit_wav(outfile, length_data, A, outfile_raw);
  342. fclose(outfile);
  343. }
  344. #if SAVE_RAW
  345. fclose(infile_raw);
  346. fclose(outfile_raw);
  347. #endif
  348. return 0;
  349. }