apps.c 23 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855
  1. #include "apps.h"
  2. #include "scc_signals.h"
  3. #include "libfunctions.h"
  4. #include "my_rtrm.h"
  5. #include <time.h>
  6. #include <math.h>
  /*
   * SWAP(a, b): exchange the values of two float lvalues.
   *
   * Written as a do { } while (0) statement so that "SWAP(x, y);" behaves like
   * a single statement everywhere, including as the unbraced body of an if that
   * is followed by else. Both arguments are parenthesised and each is evaluated
   * exactly twice (one read, one write), so they must not have side effects.
   */
  #define SWAP(a, b) do { float swap_tmp_ = (a); (a) = (b); (b) = swap_tmp_; } while (0)

  /* Upper bound that execute_workload_fft compares against to detect a request
     covering the whole FFT range. */
  #define FFT_MAX 136192

  /* Extra bytes added to the large FFT buffer allocations in app_init. */
  #define PAGE_SIZE 4096

  /* Length of one round of the artificial workload, in seconds and in
     nanoseconds (500 ms). */
  #define ARTIFICIAL_ROUND_DURATION_SEC 0.5
  #define ARTIFICIAL_ROUND_DURATION_NSEC 500000000 /* 500 ms */
  12. static float **svm_vectors, *svm_coef;
  13. static int *vector, **matrix;
  14. static float input_vector[D_sv];
  15. //static float matr_speedup[NUM_OF_MATRICES][MAX_WORKERS_COUNT];
  16. //static int matr_times[NUM_OF_MATRICES][MAX_WORKERS_COUNT];
  17. static float Exec_Speedup[MAX_WORKERS_COUNT];
  18. static int Exec_Latencies[MAX_WORKERS_COUNT];
  19. //static float **vectors, *coef;
  20. //2*(N+rootN*pad_length)*sizeof(float)+PAGE_SIZE);
  21. static int P = 1; /* DEFAULT_P = 1 */
  22. static int M = 16; /* DEFAULT_M = 10 */
  23. static int N = 65536; /* N = 2^M */
  24. static int rootN = 256; /* rootN = sqrt(N) */
  25. static int num_cache_lines = 65536;
  26. #define PADLENGTH 2
  27. static float *x_local; /* x is the original time-domain data */
  28. static float *trans; /* trans is used as scratch space */
  29. static float *umain; /* umain is roots of unity for 1D FFTs */
  30. static float *umain2; /* umain2 is entire roots of unity matrix*/
  31. static float *upriv;
  32. void execute_workload_svm (int lower_bound, int upper_bound);
  33. void execute_workload_matrix (int lower_bound, int upper_bound);
  34. void matrix_transpose(int n1, float *src, float *dest, int node_id, int myFirst, int myLast, int pad_length);
  35. void FFT1D(int direction, int M, int N, float *x, float *scratch, float *upriv, float *umain2, int node_id, int myFirst, int myLast, int pad_length, int P);
  36. void copyColumn(int n1, float *src, float *dest);
  37. void single_FFT1D(int direction, int M, int N, float *u, float *x);
  38. void twiddle_Col(int direction, int n1, int N, int j, float *u, float *x, int pad_length);
  39. void reverse(int N, int M, float *x);
  40. int reverse_bit(int M, int k);
  41. void execute_workload_svm (int lower_bound, int upper_bound) {
  42. int i = 0, j = 0;
  43. float diff = 0, norma = 0, local_sum[N_sv];
  44. /* int vector_id = 0; Removed 16.02. Only one test vector */
  45. if (base_offset == -1) {
  46. base_offset = cur_agent.my_agent * N_sv;
  47. //fprintf(log_file, "My agent is %d. Calculated base_offset is %d\n",cur_agent.my_agent,base_offset);
  48. }
  49. for (i = lower_bound; i <= upper_bound; i++) {
  50. local_sum[i] = 0;
  51. scc_signals_check();
  52. for (j = 0; j < D_sv; j++){
  53. diff = svm_vectors[i][j] - input_vector[j];
  54. norma += diff*diff;
  55. }
  56. local_sum[i] += (float) (exp((double) (-gamma*norma))*svm_coef[i]);
  57. norma = 0;
  58. }
  59. for (i=lower_bound; i<=upper_bound; i++)
  60. manager_result_out[base_offset+i] = (int) local_sum[i];
  61. }
  62. void execute_workload_matrix (int lower_bound, int upper_bound) {
  63. int i, j, local_sum[MAX_ARRAY_SIZE];
  64. if (base_offset == -1) {
  65. //matrix_out = (int*) shmat (cur_agent.segment_id, NULL, 0);
  66. base_offset = cur_agent.my_agent * MAX_ARRAY_SIZE;
  67. }
  68. for (i=lower_bound; i<=upper_bound; i++) {
  69. local_sum[i] = 0;
  70. scc_signals_check();
  71. //signals_enable();
  72. for (j=0; j<cur_agent.array_size; j++)
  73. local_sum[i] += matrix[i][j] * vector[j];
  74. //signals_disable();
  75. }
  76. for (i=lower_bound; i<=upper_bound; i++)
  77. manager_result_out[base_offset+i] = local_sum[i];
  78. }
  79. void execute_workload_fft (int lower_bound, int upper_bound) {
  80. int work_id = 0, pad_length = PADLENGTH;
  81. if ((lower_bound == 0) && (upper_bound == FFT_MAX)) {
  82. P = 1;
  83. } else {
  84. P = 2;
  85. }
  86. /* FIXME works only because fft is restricted to two workers */
  87. if (lower_bound > 0) {
  88. work_id = 1;
  89. }
  90. FFT1D(1, M, N, x_local, trans, upriv, umain2, work_id, lower_bound, upper_bound, pad_length, P); //HACK node_id - 1 important!!
  91. }
  92. void execute_workload_artificial (int lower_bound, int upper_bound) {
  93. int AppSpeedup = upper_bound - lower_bound;
  94. struct timespec ts;
  95. //if (base_offset == -1) {
  96. // base_offset = cur_agent.my_agent * MAX_ARRAY_SIZE; /* FIXME Why is it always MAX_ARRAY_SIZE */
  97. //}
  98. ts.tv_sec = 0;
  99. ts.tv_nsec = ARTIFICIAL_ROUND_DURATION_NSEC / AppSpeedup;
  100. nanosleep(&ts, NULL);
  101. /*
  102. for (i=lower_bound; i<=upper_bound; i++) {
  103. sleep(ARTIFICIAL_ROUND_DURATION_SEC);
  104. }
  105. */
  106. /*
  107. for (i=lower_bound; i<=upper_bound; i++)
  108. manager_result_out[base_offset+i] = -1;
  109. */
  110. }
  111. void execute_workload (int lower_bound, int upper_bound) {
  112. if (executed_app == MATRIX_MUL) {
  113. execute_workload_matrix (lower_bound, upper_bound);
  114. } else if (executed_app == SVM) {
  115. execute_workload_svm (lower_bound, upper_bound);
  116. } else if (executed_app == FFT) {
  117. execute_workload_fft (lower_bound, upper_bound);
  118. } else if (executed_app == ARTIFICIAL) {
  119. execute_workload_artificial (lower_bound, upper_bound);
  120. }
  121. }
  122. void init_speedup_structs (void) {
  123. if (executed_app == MATRIX_MUL) {
  124. if (MATRIX_ARRAY_SIZE == 1024) {
  125. #ifdef PLAT_SCC
  126. Exec_Speedup[0] = 1.0;
  127. Exec_Speedup[1] = 1.188;
  128. Exec_Speedup[2] = 2.264;
  129. Exec_Speedup[3] = 3.0;
  130. Exec_Speedup[4] = 3.429;
  131. Exec_Speedup[5] = 4.0;
  132. Exec_Speedup[6] = 8.0;
  133. Exec_Speedup[7] = 0.0;
  134. Exec_Latencies[0] = 120;//29352;
  135. Exec_Latencies[1] = 101;//15112;
  136. Exec_Latencies[2] = 53;//11194;
  137. Exec_Latencies[3] = 40;//10313;
  138. Exec_Latencies[4] = 35;//8645;
  139. Exec_Latencies[5] = 30;//7871;
  140. Exec_Latencies[6] = 15;//6715;
  141. #else
  142. Exec_Speedup[0] = 1.0;
  143. Exec_Speedup[1] = 1.065;
  144. Exec_Speedup[2] = 1.270;
  145. Exec_Speedup[3] = 0.0;
  146. Exec_Speedup[4] = 0.0;
  147. Exec_Speedup[5] = 0.0;
  148. Exec_Speedup[6] = 0.0;
  149. Exec_Speedup[7] = 0.0;
  150. Exec_Latencies[0] = 100000000;//29352;
  151. Exec_Latencies[1] = 31;//15112;
  152. Exec_Latencies[2] = 29;//11194;
  153. Exec_Latencies[3] = 24;//10313;
  154. Exec_Latencies[4] = 0;//8645;
  155. Exec_Latencies[5] = 0;//7871;
  156. Exec_Latencies[6] = 0;//6715;
  157. Exec_Latencies[7] = 0;//7014;
  158. #endif
  159. } else if (MATRIX_ARRAY_SIZE == 2048) {
  160. #ifdef PLAT_SCC
  161. Exec_Speedup[0] = 1.0;
  162. Exec_Speedup[1] = 1.091;
  163. Exec_Speedup[2] = 1.2;
  164. Exec_Speedup[3] = 1.491;
  165. Exec_Speedup[4] = 1.791;
  166. Exec_Speedup[5] = 2.824;
  167. Exec_Speedup[6] = 3.0;
  168. Exec_Latencies[0] = 240;//112276;
  169. Exec_Latencies[1] = 220;//58880;
  170. Exec_Latencies[2] = 200;//40305;
  171. Exec_Latencies[3] = 161;//31705;
  172. Exec_Latencies[4] = 134;//28309;
  173. Exec_Latencies[5] = 85;//24512;
  174. Exec_Latencies[6] = 80;//22239;
  175. //matr_times[1][7] = 23;//20332;
  176. #else
  177. Exec_Speedup[0] = 1.0;
  178. Exec_Speedup[1] = 1.331;
  179. Exec_Speedup[2] = 2.009;
  180. Exec_Speedup[3] = 2.315;
  181. Exec_Speedup[4] = 2.572;
  182. Exec_Speedup[5] = 0.0;
  183. Exec_Speedup[6] = 0.0;
  184. Exec_Speedup[7] = 0.0;//5.522;
  185. Exec_Latencies[0] = 100000000;//112276;
  186. Exec_Latencies[1] = 116;//58880;
  187. Exec_Latencies[2] = 87;//40305;
  188. Exec_Latencies[3] = 58;//31705;
  189. Exec_Latencies[4] = 50;//28309;
  190. Exec_Latencies[5] = 45;//24512;
  191. Exec_Latencies[6] = 0;//22239;
  192. Exec_Latencies[7] = 0;//20332;
  193. #endif
  194. } else if (MATRIX_ARRAY_SIZE == 4096) {
  195. #ifdef PLAT_SCC
  196. Exec_Speedup[0] = 1.0;
  197. Exec_Speedup[1] = 2.001;
  198. Exec_Speedup[2] = 2.976;
  199. Exec_Speedup[3] = 4.032;
  200. Exec_Speedup[4] = 5.034;
  201. Exec_Speedup[5] = 6.25;
  202. Exec_Speedup[6] = 6.678;
  203. Exec_Speedup[7] = 6.819;
  204. Exec_Latencies[0] = 750;//384005;
  205. Exec_Latencies[1] = 374;//231583;
  206. Exec_Latencies[2] = 252;//157966;
  207. Exec_Latencies[3] = 186;//121222;
  208. Exec_Latencies[4] = 149;//101208;
  209. Exec_Latencies[5] = 120;//87852;
  210. Exec_Latencies[6] = 110;//78093;
  211. #else
  212. Exec_Speedup[0] = 1.0;
  213. Exec_Speedup[1] = 1.517;
  214. Exec_Speedup[2] = 1.958;
  215. Exec_Speedup[3] = 2.112;
  216. Exec_Speedup[4] = 2.878;
  217. Exec_Speedup[5] = 3.338;
  218. Exec_Speedup[6] = 4.241;
  219. Exec_Speedup[7] = 0.0;//5.073;
  220. Exec_Latencies[0] = 100000000;//384005;
  221. Exec_Latencies[1] = 431;//231583;
  222. Exec_Latencies[2] = 284;//157966;
  223. Exec_Latencies[3] = 220;//121222;
  224. Exec_Latencies[4] = 204;//101208;
  225. Exec_Latencies[5] = 150;//87852;
  226. Exec_Latencies[6] = 129;//78093;
  227. Exec_Latencies[7] = 102;//75690;
  228. #endif
  229. } else {
  230. printf("Unknown array size\n");
  231. exit(0);
  232. }
  233. } else if (executed_app == SVM) {
  234. /* N_sv 4096 D_sv 4096 */
  235. Exec_Speedup[0] = 1.0; /* 1 worker */
  236. Exec_Speedup[1] = 1.959;
  237. Exec_Speedup[2] = 2.919;
  238. Exec_Speedup[3] = 3.853;
  239. Exec_Speedup[4] = 4.777;
  240. Exec_Speedup[5] = 5.723;
  241. Exec_Speedup[6] = 6.644;
  242. Exec_Speedup[7] = 0.0;
  243. Exec_Latencies[0] = 578;
  244. Exec_Latencies[1] = 295;
  245. Exec_Latencies[2] = 198;
  246. Exec_Latencies[3] = 150;
  247. Exec_Latencies[4] = 121;
  248. Exec_Latencies[5] = 101;
  249. Exec_Latencies[6] = 87;
  250. Exec_Latencies[7] = 6; /* Irrelevant */
  251. } else if (executed_app == FFT) {
  252. Exec_Speedup[0] = 1.0; /* 1 worker */
  253. Exec_Speedup[1] = 1.55;
  254. Exec_Speedup[2] = 0;
  255. Exec_Speedup[3] = 0;
  256. Exec_Speedup[4] = 0;
  257. Exec_Speedup[5] = 0;
  258. Exec_Speedup[6] = 0;
  259. Exec_Speedup[7] = 0;
  260. Exec_Latencies[0] = 772;
  261. Exec_Latencies[1] = 498;
  262. Exec_Latencies[2] = 0;
  263. Exec_Latencies[3] = 0;
  264. Exec_Latencies[4] = 0;
  265. Exec_Latencies[5] = 0;
  266. Exec_Latencies[6] = 0;
  267. Exec_Latencies[7] = 0;
  268. } if (executed_app == ARTIFICIAL) {
  269. }
  270. }
  271. void app_init (char scen_directory[SCEN_DIR_SIZE], char scen_num[SCEN_NUM_SIZE]) {
  272. int i, j, pad_length = PADLENGTH;
  273. char buf[MAX_STR_NAME_SIZE], *buffer;
  274. FILE *matrix_input, *support_vectors_file, *coef_file, *test_vector_file, *umain_file, *umain2_file, *x_local_file;
  275. size_t bufsize = 32;
  276. if (executed_app == MATRIX_MUL) {
  277. cur_agent.array_size = MATRIX_ARRAY_SIZE;
  278. matrix = (int **) malloc(cur_agent.array_size * sizeof(int *));
  279. #ifdef PLAT_SCC
  280. strcpy(buf, "/shared/herc/");
  281. #else
  282. strcpy(buf, "../");
  283. #endif
  284. strcat(buf, scen_directory);
  285. strcat(buf, "/MATRIX-inputs/");
  286. strcat(buf, itoa(cur_agent.array_size));
  287. fprintf(log_file,"matrix file path = %s\n",buf);
  288. if ((matrix_input = fopen(buf, "r")) == NULL){
  289. printf("Cannot open input file with file path = %s ",buf);
  290. perror("open matrix_input");
  291. }
  292. for (i=0; i<cur_agent.array_size; i++) {
  293. matrix[i] = (int *) malloc(cur_agent.array_size * sizeof(int));
  294. for (j=0; j<cur_agent.array_size; j++)
  295. fscanf(matrix_input,"%d",&matrix[i][j]);
  296. }
  297. vector = (int *) malloc(cur_agent.array_size * sizeof(int));
  298. for (j=0; j<cur_agent.array_size; j++)
  299. fscanf(matrix_input,"%d",&vector[j]);
  300. fclose(matrix_input);
  301. } else if (executed_app == SVM) {
  302. #ifdef PLAT_SCC
  303. strcpy(buf, "/shared/herc/");
  304. #else
  305. strcpy(buf, "../");
  306. #endif
  307. strcat(buf,scen_directory);
  308. //strcat(buf,"/");
  309. //strcat(buf,scen_num);
  310. strcat(buf,"/SVM-inputs/support_vectors_N_sv_");
  311. strcat(buf,itoa(N_sv));
  312. strcat(buf,"_D_sv_");
  313. strcat(buf,itoa(D_sv));
  314. strcat(buf,".dat");
  315. fprintf(log_file,"svm file path = %s\n",buf);
  316. if ((support_vectors_file = fopen(buf,"r")) == NULL){
  317. printf("Cannot open input file with file path = %s ",buf);
  318. perror("open svm_input");
  319. }
  320. #ifdef PLAT_SCC
  321. strcpy(buf, "/shared/herc/");
  322. #else
  323. strcpy(buf, "../");
  324. #endif
  325. strcat(buf,scen_directory);
  326. //strcat(buf,"/");
  327. //strcat(buf,scen_num);
  328. strcat(buf,"/SVM-inputs/sv_coef_N_sv_");
  329. strcat(buf,itoa(N_sv));
  330. strcat(buf,"_D_sv_");
  331. strcat(buf,itoa(D_sv));
  332. strcat(buf,".dat");
  333. fprintf(log_file,"svm_coef file path = %s\n",buf);
  334. if ((coef_file = fopen(buf,"r")) == NULL){
  335. printf("Cannot open input file with file path = %s ",buf);
  336. perror("open svm_input");
  337. }
  338. #ifdef PLAT_SCC
  339. strcpy(buf, "/shared/herc/");
  340. #else
  341. strcpy(buf, "../");
  342. #endif
  343. strcat(buf,scen_directory);
  344. //strcat(buf,"/");
  345. //strcat(buf,scen_num);
  346. strcat(buf,"/SVM-inputs/test_vector_D_sv_");
  347. strcat(buf,itoa(D_sv));
  348. strcat(buf,".dat");
  349. fprintf(log_file,"test_vector file path = %s\n",buf);
  350. if ((test_vector_file = fopen(buf,"r")) == NULL){
  351. printf("Cannot open input file with file path = %s ",buf);
  352. perror("open svm_input");
  353. }
  354. svm_vectors = (float **)malloc(N_sv*sizeof(float *));
  355. if (svm_vectors == NULL){
  356. printf("--%d-- svm_vectors malloc fail!!\n", node_id);
  357. perror("malloc error");
  358. }
  359. svm_coef = (float *)malloc(N_sv*sizeof(float));
  360. if (svm_coef == NULL){
  361. printf("--%d-- svm_coef malloc fail!!\n", node_id);
  362. perror("malloc error");
  363. }
  364. buffer = (char *)malloc(bufsize * sizeof(char));
  365. for (i = 0; i < N_sv; i++) {
  366. svm_vectors[i] = (float *)malloc(D_sv*sizeof(float));
  367. if (svm_vectors[i] == NULL) {
  368. printf("--%d-- svm_vectors[%d] malloc fail!!\n", node_id, i);
  369. perror("malloc error");
  370. } else {
  371. for (j = 0; j < D_sv; j++) {
  372. /* Read support svm_vectors */
  373. if (j < D_sv){
  374. fscanf(support_vectors_file,"%f",&svm_vectors[i][j]);
  375. fgetc(support_vectors_file);
  376. }else{
  377. getline(&buffer,&bufsize,support_vectors_file);
  378. }
  379. }
  380. }
  381. }
  382. for (j = 0; j < N_sv; j++) {
  383. /* Read coefficients */
  384. fscanf(coef_file,"%f",&svm_coef[j]);
  385. fgetc(coef_file);
  386. }
  387. for (j = 0; j < D_sv; j++) {
  388. /* Read coefficients */
  389. fscanf(test_vector_file,"%f",&input_vector[j]);
  390. }
  391. cur_agent.array_size = -1;
  392. fclose(support_vectors_file);
  393. fclose(coef_file);
  394. fclose(test_vector_file);
  395. free(buffer);
  396. } else if (executed_app == FFT) {
  397. fprintf(log_file,"Initializing FFT application\n");
  398. x_local = (float *)malloc(2*(N+rootN*pad_length)*sizeof(float)+PAGE_SIZE);
  399. if (x_local == NULL){
  400. printf("Malloc error for x_local\n");
  401. perror("malloc error");
  402. exit(-1);
  403. }
  404. trans = (float *)malloc(2*(N+rootN*pad_length)*sizeof(float)+PAGE_SIZE);
  405. if (trans == NULL){
  406. printf("Malloc error for trans\n");
  407. perror("malloc error");
  408. exit(-1);
  409. }
  410. umain = (float *)malloc(2*rootN*sizeof(float));
  411. if (umain == NULL){
  412. printf("Malloc error for umain\n");
  413. perror("malloc error");
  414. exit(-1);
  415. }
  416. umain2 = (float *)malloc(2*(N+rootN*pad_length)*sizeof(float)+PAGE_SIZE);
  417. if (umain2 == NULL){
  418. printf("Malloc error for umain2\n");
  419. perror("malloc error");
  420. exit(-1);
  421. }
  422. upriv = (float *)malloc(2*(rootN-1)*sizeof(float));
  423. if (upriv == NULL){
  424. printf("--%d-- Malloc error for upriv\n", node_id);
  425. perror("malloc error");
  426. exit(-1);
  427. }
  428. #ifdef PLAT_SCC
  429. strcpy(buf, "/shared/herc/");
  430. #else
  431. strcpy(buf, "../");
  432. #endif
  433. strcat(buf,scen_directory);
  434. //strcat(buf,"/");
  435. //strcat(buf,scen_num);
  436. strcat(buf,"/FFT-inputs/umain_file");
  437. fprintf(log_file,"umain_file file path = %s\n",buf);
  438. if ((umain_file = fopen(buf,"r")) == NULL){
  439. printf("Cannot open input file with file path = %s ",buf);
  440. perror("open fft_input");
  441. }
  442. for (i=0; i<2*rootN; i++) {
  443. fscanf(umain_file,"%f",&umain[i]);
  444. }
  445. fclose(umain_file);
  446. #ifdef PLAT_SCC
  447. strcpy(buf, "/shared/herc/");
  448. #else
  449. strcpy(buf, "../");
  450. #endif
  451. strcat(buf,scen_directory);
  452. //strcat(buf,"/");
  453. //strcat(buf,scen_num);
  454. strcat(buf,"/FFT-inputs/umain2_file");
  455. fprintf(log_file,"umain2_file file path = %s\n",buf);
  456. if ((umain2_file = fopen(buf,"r")) == NULL){
  457. printf("Cannot open input file with file path = %s ",buf);
  458. perror("open umain_file");
  459. }
  460. //for (i=0; i<2*(N+rootN*pad_length)+PAGE_SIZE; i++) {
  461. for (i=0; i<2*(N+rootN*pad_length); i++) {
  462. fscanf(umain2_file,"%f",&umain2[i]);
  463. }
  464. fclose(umain2_file);
  465. #ifdef PLAT_SCC
  466. strcpy(buf, "/shared/herc/");
  467. #else
  468. strcpy(buf, "../");
  469. #endif
  470. strcat(buf,scen_directory);
  471. //strcat(buf,"/");
  472. //strcat(buf,scen_num);
  473. strcat(buf,"/FFT-inputs/x_local_file");
  474. fprintf(log_file,"x_local_file file path = %s\n",buf);
  475. if ((x_local_file = fopen(buf,"r")) == NULL){
  476. printf("Cannot open input file with file path = %s ",buf);
  477. perror("open x_local_file");
  478. }
  479. //for (i=0;i<2*(N+rootN*pad_length)+PAGE_SIZE;i++) {
  480. for (i=0; i<2*(N+rootN*pad_length); i++) {
  481. fscanf(x_local_file,"%f",&x_local[i]);
  482. }
  483. fclose(x_local_file);
  484. for (i = 0; i < 2*(rootN-1); i++){
  485. upriv[i] = umain[i];
  486. }
  487. } else if (executed_app == MATRIX_MUL) {
  488. }
  489. }
  490. int get_max_cores_count(app cur_app){
  491. #ifdef SINGLE_WORKER
  492. return 2;
  493. #elif ARTIFICIAL_APPS_SIM
  494. int tmp_max_cores=MAX_WORKERS_COUNT; /* FIXME 31.10.2017 unable to send more than 8 workers via MPB */
  495. if (cur_app.var < 1.0) {
  496. tmp_max_cores = (int) ceilf(2.0*cur_app.A - 1);
  497. } else {
  498. tmp_max_cores = (int) ceilf(cur_app.A + (cur_app.A*cur_app.var) - cur_app.var);
  499. }
  500. if (tmp_max_cores < MAX_WORKERS_COUNT) {
  501. return tmp_max_cores;
  502. } else {
  503. return MAX_WORKERS_COUNT;
  504. }
  505. #else
  506. if (executed_app == FFT) {
  507. return 3;
  508. } else {
  509. return MAX_WORKERS_COUNT;
  510. }
  511. #endif
  512. }
  #ifdef ARTIFICIAL_APPS_SIM
  /*
   * Speedup of an artificial application on num_of_cores cores, computed from
   * the application's A and var fields.
   *
   * Returns 0 for a non-positive core count. For var < 1.0 the result is 1
   * for a single core, one of two formulas for counts below A and below
   * 2*A - 1 respectively, and A beyond that. For var >= 1.0 a single formula
   * applies up to A + A*var - var cores and A beyond that.
   */
  float Speedup_Artificial_App(app cur_app, int num_of_cores) {
      if (!(num_of_cores > 0)) {
          return 0;
      }

      if (cur_app.var < 1.0) {
          if (num_of_cores == 1) {
              return 1;
          }
          if ((num_of_cores > 1) && (num_of_cores < cur_app.A)) {
              return (num_of_cores*cur_app.A) / (cur_app.A + (cur_app.var / 2.0*(num_of_cores-1)));
          }
          if ((num_of_cores >= cur_app.A) && (num_of_cores < 2.0*cur_app.A - 1)) {
              return (num_of_cores*cur_app.A) / (cur_app.var*(cur_app.A -0.5) + num_of_cores*(1.0 - 0.5*cur_app.var));
          }
          return cur_app.A;
      }

      /* var >= 1.0; for a single core the formula below evaluates to 1. */
      if ((num_of_cores >= 1) && (num_of_cores <= (cur_app.A + cur_app.A*cur_app.var - cur_app.var))) {
          return (num_of_cores*cur_app.A*(cur_app.var + 1)) / (cur_app.A + cur_app.var*(num_of_cores-1 + cur_app.A));
      }
      return cur_app.A;
  }
  #endif
  538. float Speedup(app cur_app, int num_of_cores) {
  539. if ((num_of_cores < 2) || (num_of_cores > get_max_cores_count(cur_app))) {
  540. return 0;
  541. } else {
  542. #ifndef ARTIFICIAL_APPS_SIM
  543. return Exec_Speedup[num_of_cores-2];
  544. #else
  545. return Speedup_Artificial_App(cur_app, num_of_cores-1);
  546. #endif
  547. }
  548. }
  549. int get_times(app cur_app, int num_of_cores) {
  550. #ifndef ARTIFICIAL_APPS_SIM
  551. return (cur_app.workld * Exec_Latencies[num_of_cores-2]);
  552. #else
  553. return cur_app.workld * (ARTIFICIAL_ROUND_DURATION_SEC / ((int) Speedup_Artificial_App(cur_app, num_of_cores+1))); /* FIXME cutting off floating points -- +1 is because in Speedup calc it is -1*/
  554. #endif
  555. }
  556. void matrix_transpose(int n1, float *src, float *dest, int node_id, int myFirst, int myLast, int pad_length){
  557. int i;
  558. int j;
  559. int k;
  560. int l;
  561. int m;
  562. int blksize;
  563. int numblks;
  564. int firstfirst;
  565. int h_off;
  566. int v_off;
  567. int v;
  568. int h;
  569. int n1p;
  570. int row_count;
  571. //fprintf(log_file,"I am inside matrix_transpose-0 node_id is %d n1 %d\n",node_id,n1);
  572. blksize = myLast-myFirst;
  573. numblks = (2*blksize)/num_cache_lines;
  574. if (numblks * num_cache_lines != 2 * blksize) {
  575. numblks ++;
  576. }
  577. blksize = blksize / numblks;
  578. firstfirst = myFirst;
  579. row_count = n1/P;
  580. n1p = n1+pad_length;
  581. for (l=node_id+1;l<P;l++) {
  582. v_off = l*row_count;
  583. for (k=0; k<numblks; k++) {
  584. h_off = firstfirst;
  585. for (m=0; m<numblks; m++) {
  586. for (i=0; i<blksize; i++) {
  587. v = v_off + i;
  588. for (j=0; j<blksize; j++) {
  589. h = h_off + j;
  590. //fprintf(log_file,"Index dest is %d\n",2*(h*n1p+v));
  591. //fprintf(log_file,"Index src is %d\n",2*(v*n1p+h));
  592. //fprintf(log_file,"src = %f\n",src[2*(v*n1p+h)]);
  593. //fprintf(log_file,"src + 1 = %f\n",src[2*(v*n1p+h)+1]);
  594. //fprintf(log_file,"dest = %f\n",dest[2*(h*n1p+v)]);
  595. //fprintf(log_file,"dest + 1 = %f\n",dest[2*(h*n1p+v)+1]);
  596. dest[2*(h*n1p+v)] = src[2*(v*n1p+h)];
  597. dest[2*(h*n1p+v)+1] = src[2*(v*n1p+h)+1];
  598. //fprintf(log_file,"yolo\n");
  599. }
  600. }
  601. h_off += blksize;
  602. }
  603. v_off+=blksize;
  604. }
  605. }
  606. //fprintf(log_file,"I am inside matrix_transpose-A\n");
  607. for (l=0;l<node_id;l++) {
  608. v_off = l*row_count;
  609. for (k=0; k<numblks; k++) {
  610. h_off = firstfirst;
  611. for (m=0; m<numblks; m++) {
  612. for (i=0; i<blksize; i++) {
  613. v = v_off + i;
  614. for (j=0; j<blksize; j++) {
  615. h = h_off + j;
  616. dest[2*(h*n1p+v)] = src[2*(v*n1p+h)];
  617. dest[2*(h*n1p+v)+1] = src[2*(v*n1p+h)+1];
  618. }
  619. }
  620. h_off += blksize;
  621. }
  622. v_off+=blksize;
  623. }
  624. }
  625. //fprintf(log_file,"I am inside matrix_transpose-B\n");
  626. v_off = node_id*row_count;
  627. for (k=0; k<numblks; k++) {
  628. h_off = firstfirst;
  629. for (m=0; m<numblks; m++) {
  630. for (i=0; i<blksize; i++) {
  631. v = v_off + i;
  632. for (j=0; j<blksize; j++) {
  633. h = h_off + j;
  634. dest[2*(h*n1p+v)] = src[2*(v*n1p+h)];
  635. dest[2*(h*n1p+v)+1] = src[2*(v*n1p+h)+1];
  636. }
  637. }
  638. h_off += blksize;
  639. }
  640. v_off+=blksize;
  641. }
  642. //fprintf(log_file,"I am inside matrix_transpose-C\n");
  643. }
  644. //FFT1D(1, M, N, x_local, trans, upriv, umain2, work_id, lower_bound, upper_bound, pad_length, P);
  /*
   * Runs the FFT over columns myFirst..myLast-1 of a matrix whose side is
   * 2^(M/2), stored in x with rows n1 + pad_length complex values apart.
   *
   * Steps, in order:
   *   1. transpose x into scratch;
   *   2. for every owned column of scratch: single_FFT1D, then twiddle_Col
   *      with the factors in umain2;
   *   3. transpose scratch back into x;
   *   4. single_FFT1D on every owned column of x;
   *   5. transpose x into scratch.
   *
   * The parameter P is accepted but not used here; matrix_transpose reads the
   * file-level P instead.
   */
  void FFT1D(int direction, int M, int N, float *x, float *scratch, float *upriv, float *umain2, int node_id, int myFirst, int myLast, int pad_length, int P){
      const int m1 = M/2;
      const int n1 = 1 << m1;
      int col;

      matrix_transpose(n1, x, scratch, node_id, myFirst, myLast, pad_length);

      /* do n1 1D FFTs on columns */
      col = myFirst;
      while (col < myLast) {
          float *column = &scratch[2*col*(n1+pad_length)];

          single_FFT1D(direction, m1, n1, upriv, column);
          twiddle_Col(direction, n1, N, col, umain2, column, pad_length);
          col++;
      }

      matrix_transpose(n1, scratch, x, node_id, myFirst, myLast, pad_length);

      /* do n1 1D FFTs on columns again */
      col = myFirst;
      while (col < myLast) {
          single_FFT1D(direction, m1, n1, upriv, &x[2*col*(n1+pad_length)]);
          col++;
      }

      matrix_transpose(n1, x, scratch, node_id, myFirst, myLast, pad_length);
  }
  /*
   * Copies n1 complex values (2*n1 consecutive floats) from src to dest,
   * front to back. Does nothing when n1 is zero or negative.
   */
  void copyColumn(int n1, float *src, float *dest){
      int pairs_left = n1;

      while (pairs_left > 0) {
          *dest++ = *src++;   /* real part */
          *dest++ = *src++;   /* imaginary part */
          pairs_left--;
      }
  }
  /*
   * In-place FFT of the N = 2^M complex values in x (interleaved
   * real/imaginary floats).
   *
   * x is first permuted by reverse(). Then M passes follow. Pass q works on
   * groups of 2^q values and combines element j of the lower half of each
   * group with element j of the upper half, using the factor stored in u at
   * complex index (2^q / 2 - 1) + j. The imaginary part of every factor is
   * multiplied by `direction`.
   */
  void single_FFT1D(int direction, int M, int N, float *u, float *x){
      int stage;

      reverse(N, M, x);

      for (stage = 1; stage <= M; stage++) {
          const int span = 1 << stage;
          const int groups = N/span;
          const int half = span/2;
          const float *factors = &u[2*(half-1)];
          int g;

          for (g = 0; g < groups; g++) {
              float *lower = &x[2*(g*span)];
              float *upper = &x[2*(g*span+half)];
              int t;

              for (t = 0; t < half; t++) {
                  const float w_re = factors[2*t];
                  const float w_im = direction*factors[2*t+1];
                  const float up_re = upper[2*t];
                  const float up_im = upper[2*t+1];
                  const float rot_re = w_re*up_re - w_im*up_im;
                  const float rot_im = w_re*up_im + w_im*up_re;
                  const float lo_re = lower[2*t];
                  const float lo_im = lower[2*t+1];

                  upper[2*t] = lo_re - rot_re;
                  upper[2*t+1] = lo_im - rot_im;
                  lower[2*t] = lo_re + rot_re;
                  lower[2*t+1] = lo_im + rot_im;
              }
          }
      }
  }
  /*
   * Multiplies each of the n1 complex values in x by the matching complex
   * factor from row j of u. Rows of u are n1 + pad_length complex values
   * apart. The imaginary part of every factor is multiplied by `direction`
   * first. N is not used.
   */
  void twiddle_Col(int direction, int n1, int N, int j, float *u, float *x, int pad_length){
      int idx = 0;

      while (idx < n1) {
          const float w_re = u[2*(j*(n1+pad_length)+idx)];
          const float w_im = direction*u[2*(j*(n1+pad_length)+idx)+1];
          const float re = x[2*idx];
          const float im = x[2*idx+1];

          x[2*idx] = w_re*re - w_im*im;
          x[2*idx+1] = w_re*im + w_im*re;
          idx++;
      }
  }
  /*
   * Reorders the N complex values in x (interleaved real/imaginary floats) so
   * that the value at index k ends up at index reverse_bit(M, k). Each pair
   * of positions is exchanged once, when the partner index is the larger one.
   */
  void reverse(int N, int M, float *x){
      int k;

      for (k = 0; k < N; k++) {
          const int partner = reverse_bit(M, k);
          float held;

          if (partner <= k) {
              continue;
          }

          held = x[2*partner];
          x[2*partner] = x[2*k];
          x[2*k] = held;

          held = x[2*partner+1];
          x[2*partner+1] = x[2*k+1];
          x[2*k+1] = held;
      }
  }
  /*
   * Returns the value formed by the lowest M bits of k in reversed order
   * (bit 0 of k becomes bit M-1 of the result). Returns 0 when M is zero or
   * negative.
   */
  int reverse_bit(int M, int k){
      int mirrored = 0;
      int bits_left;

      for (bits_left = M; bits_left > 0; bits_left--) {
          mirrored = 2*mirrored + (k & 0x1);
          k = k >> 1;
      }
      return mirrored;
  }