/* apps.c */
  1. #include "apps.h"
  2. #include "scc_signals.h"
  3. #include "libfunctions.h"
  4. #include "my_rtrm.h"
  5. #include <time.h>
  /*
   * Exchange two float lvalues.
   * The body is wrapped in do { } while (0) so that `SWAP(a, b);` is exactly one
   * statement and can be used in an unbraced if/else.  Arguments are
   * parenthesized; each one is still evaluated twice, so do not pass
   * expressions with side effects.
   */
  #define SWAP(a,b) do { float swap_tmp = (a); (a) = (b); (b) = swap_tmp; } while (0)
  /* Upper bound that marks "whole FFT range" in execute_workload_fft.
   * NOTE(review): numerically equal to 2*(65536 + 256*2) + 4096 - confirm the
   * intended derivation before changing the FFT size. */
  #define FFT_MAX 136192
  #define PAGE_SIZE 4096
  /* Duration of one artificial round, split into seconds and nanoseconds. */
  #define ARTIFICIAL_ROUND_DURATION_SEC 1
  #define ARTIFICIAL_ROUND_DURATION_NSEC 500000000 /* 500 ms */
  11. static float **svm_vectors, *svm_coef;
  12. static int *vector, **matrix;
  13. static float input_vector[D_sv];
  14. //static float matr_speedup[NUM_OF_MATRICES][MAX_WORKERS_COUNT];
  15. //static int matr_times[NUM_OF_MATRICES][MAX_WORKERS_COUNT];
  16. static float Exec_Speedup[MAX_WORKERS_COUNT];
  17. static int Exec_Latencies[MAX_WORKERS_COUNT];
  18. //static float **vectors, *coef;
  19. //2*(N+rootN*pad_length)*sizeof(float)+PAGE_SIZE);
  20. static int P = 1; /* DEFAULT_P = 1 */
  21. static int M = 16; /* DEFAULT_M = 10 */
  22. static int N = 65536; /* N = 2^M */
  23. static int rootN = 256; /* rootN = sqrt(N) */
  24. static int num_cache_lines = 65536;
  25. #define PADLENGTH 2
  26. static float *x_local; /* x is the original time-domain data */
  27. static float *trans; /* trans is used as scratch space */
  28. static float *umain; /* umain is roots of unity for 1D FFTs */
  29. static float *umain2; /* umain2 is entire roots of unity matrix*/
  30. static float *upriv;
  31. void execute_workload_svm (int lower_bound, int upper_bound);
  32. void execute_workload_matrix (int lower_bound, int upper_bound);
  33. void matrix_transpose(int n1, float *src, float *dest, int node_id, int myFirst, int myLast, int pad_length);
  34. void FFT1D(int direction, int M, int N, float *x, float *scratch, float *upriv, float *umain2, int node_id, int myFirst, int myLast, int pad_length, int P);
  35. void copyColumn(int n1, float *src, float *dest);
  36. void single_FFT1D(int direction, int M, int N, float *u, float *x);
  37. void twiddle_Col(int direction, int n1, int N, int j, float *u, float *x, int pad_length);
  38. void reverse(int N, int M, float *x);
  39. int reverse_bit(int M, int k);
  40. void execute_workload_svm (int lower_bound, int upper_bound) {
  41. int i = 0, j = 0;
  42. float diff = 0, norma = 0, local_sum[N_sv];
  43. /* int vector_id = 0; Removed 16.02. Only one test vector */
  44. if (base_offset == -1) {
  45. base_offset = cur_agent.my_agent * N_sv;
  46. //fprintf(log_file, "My agent is %d. Calculated base_offset is %d\n",cur_agent.my_agent,base_offset);
  47. }
  48. for (i = lower_bound; i <= upper_bound; i++) {
  49. local_sum[i] = 0;
  50. scc_signals_check();
  51. for (j = 0; j < D_sv; j++){
  52. diff = svm_vectors[i][j] - input_vector[j];
  53. norma += diff*diff;
  54. }
  55. local_sum[i] += (float) (exp((double) (-gamma*norma))*svm_coef[i]);
  56. norma = 0;
  57. }
  58. for (i=lower_bound; i<=upper_bound; i++)
  59. manager_result_out[base_offset+i] = (int) local_sum[i];
  60. }
  61. void execute_workload_matrix (int lower_bound, int upper_bound) {
  62. int i, j, local_sum[MAX_ARRAY_SIZE];
  63. if (base_offset == -1) {
  64. //matrix_out = (int*) shmat (cur_agent.segment_id, NULL, 0);
  65. base_offset = cur_agent.my_agent * MAX_ARRAY_SIZE;
  66. }
  67. for (i=lower_bound; i<=upper_bound; i++) {
  68. local_sum[i] = 0;
  69. scc_signals_check();
  70. //signals_enable();
  71. for (j=0; j<cur_agent.array_size; j++)
  72. local_sum[i] += matrix[i][j] * vector[j];
  73. //signals_disable();
  74. }
  75. for (i=lower_bound; i<=upper_bound; i++)
  76. manager_result_out[base_offset+i] = local_sum[i];
  77. }
  78. void execute_workload_fft (int lower_bound, int upper_bound) {
  79. int work_id = 0, pad_length = PADLENGTH;
  80. if ((lower_bound == 0) && (upper_bound == FFT_MAX)) {
  81. P = 1;
  82. } else {
  83. P = 2;
  84. }
  85. /* FIXME works only because fft is restricted to two workers */
  86. if (lower_bound > 0) {
  87. work_id = 1;
  88. }
  89. FFT1D(1, M, N, x_local, trans, upriv, umain2, work_id, lower_bound, upper_bound, pad_length, P); //HACK node_id - 1 important!!
  90. }
  91. void execute_workload_artificial (int lower_bound, int upper_bound) {
  92. int AppSpeedup = upper_bound - lower_bound;
  93. struct timespec ts;
  94. //if (base_offset == -1) {
  95. // base_offset = cur_agent.my_agent * MAX_ARRAY_SIZE; /* FIXME Why is it always MAX_ARRAY_SIZE */
  96. //}
  97. ts.tv_sec = 0;
  98. ts.tv_nsec = ARTIFICIAL_ROUND_DURATION_NSEC / AppSpeedup;
  99. nanosleep(&ts, NULL);
  100. /*
  101. for (i=lower_bound; i<=upper_bound; i++) {
  102. sleep(ARTIFICIAL_ROUND_DURATION_SEC);
  103. }
  104. */
  105. /*
  106. for (i=lower_bound; i<=upper_bound; i++)
  107. manager_result_out[base_offset+i] = -1;
  108. */
  109. }
  110. void execute_workload (int lower_bound, int upper_bound) {
  111. if (executed_app == MATRIX_MUL) {
  112. execute_workload_matrix (lower_bound, upper_bound);
  113. } else if (executed_app == SVM) {
  114. execute_workload_svm (lower_bound, upper_bound);
  115. } else if (executed_app == FFT) {
  116. execute_workload_fft (lower_bound, upper_bound);
  117. } else if (executed_app == ARTIFICIAL) {
  118. execute_workload_artificial (lower_bound, upper_bound);
  119. }
  120. }
  121. void init_speedup_structs (void) {
  122. if (executed_app == MATRIX_MUL) {
  123. if (MATRIX_ARRAY_SIZE == 1024) {
  124. #ifdef PLAT_SCC
  125. Exec_Speedup[0] = 1.0;
  126. Exec_Speedup[1] = 1.188;
  127. Exec_Speedup[2] = 2.264;
  128. Exec_Speedup[3] = 3.0;
  129. Exec_Speedup[4] = 3.429;
  130. Exec_Speedup[5] = 4.0;
  131. Exec_Speedup[6] = 8.0;
  132. Exec_Speedup[7] = 0.0;
  133. Exec_Latencies[0] = 120;//29352;
  134. Exec_Latencies[1] = 101;//15112;
  135. Exec_Latencies[2] = 53;//11194;
  136. Exec_Latencies[3] = 40;//10313;
  137. Exec_Latencies[4] = 35;//8645;
  138. Exec_Latencies[5] = 30;//7871;
  139. Exec_Latencies[6] = 15;//6715;
  140. #else
  141. Exec_Speedup[0] = 1.0;
  142. Exec_Speedup[1] = 1.065;
  143. Exec_Speedup[2] = 1.270;
  144. Exec_Speedup[3] = 0.0;
  145. Exec_Speedup[4] = 0.0;
  146. Exec_Speedup[5] = 0.0;
  147. Exec_Speedup[6] = 0.0;
  148. Exec_Speedup[7] = 0.0;
  149. Exec_Latencies[0] = 100000000;//29352;
  150. Exec_Latencies[1] = 31;//15112;
  151. Exec_Latencies[2] = 29;//11194;
  152. Exec_Latencies[3] = 24;//10313;
  153. Exec_Latencies[4] = 0;//8645;
  154. Exec_Latencies[5] = 0;//7871;
  155. Exec_Latencies[6] = 0;//6715;
  156. Exec_Latencies[7] = 0;//7014;
  157. #endif
  158. } else if (MATRIX_ARRAY_SIZE == 2048) {
  159. #ifdef PLAT_SCC
  160. Exec_Speedup[0] = 1.0;
  161. Exec_Speedup[1] = 1.091;
  162. Exec_Speedup[2] = 1.2;
  163. Exec_Speedup[3] = 1.491;
  164. Exec_Speedup[4] = 1.791;
  165. Exec_Speedup[5] = 2.824;
  166. Exec_Speedup[6] = 3.0;
  167. Exec_Latencies[0] = 240;//112276;
  168. Exec_Latencies[1] = 220;//58880;
  169. Exec_Latencies[2] = 200;//40305;
  170. Exec_Latencies[3] = 161;//31705;
  171. Exec_Latencies[4] = 134;//28309;
  172. Exec_Latencies[5] = 85;//24512;
  173. Exec_Latencies[6] = 80;//22239;
  174. //matr_times[1][7] = 23;//20332;
  175. #else
  176. Exec_Speedup[0] = 1.0;
  177. Exec_Speedup[1] = 1.331;
  178. Exec_Speedup[2] = 2.009;
  179. Exec_Speedup[3] = 2.315;
  180. Exec_Speedup[4] = 2.572;
  181. Exec_Speedup[5] = 0.0;
  182. Exec_Speedup[6] = 0.0;
  183. Exec_Speedup[7] = 0.0;//5.522;
  184. Exec_Latencies[0] = 100000000;//112276;
  185. Exec_Latencies[1] = 116;//58880;
  186. Exec_Latencies[2] = 87;//40305;
  187. Exec_Latencies[3] = 58;//31705;
  188. Exec_Latencies[4] = 50;//28309;
  189. Exec_Latencies[5] = 45;//24512;
  190. Exec_Latencies[6] = 0;//22239;
  191. Exec_Latencies[7] = 0;//20332;
  192. #endif
  193. } else if (MATRIX_ARRAY_SIZE == 4096) {
  194. #ifdef PLAT_SCC
  195. Exec_Speedup[0] = 1.0;
  196. Exec_Speedup[1] = 2.001;
  197. Exec_Speedup[2] = 2.976;
  198. Exec_Speedup[3] = 4.032;
  199. Exec_Speedup[4] = 5.034;
  200. Exec_Speedup[5] = 6.25;
  201. Exec_Speedup[6] = 6.678;
  202. Exec_Speedup[7] = 6.819;
  203. Exec_Latencies[0] = 750;//384005;
  204. Exec_Latencies[1] = 374;//231583;
  205. Exec_Latencies[2] = 252;//157966;
  206. Exec_Latencies[3] = 186;//121222;
  207. Exec_Latencies[4] = 149;//101208;
  208. Exec_Latencies[5] = 120;//87852;
  209. Exec_Latencies[6] = 110;//78093;
  210. #else
  211. Exec_Speedup[0] = 1.0;
  212. Exec_Speedup[1] = 1.517;
  213. Exec_Speedup[2] = 1.958;
  214. Exec_Speedup[3] = 2.112;
  215. Exec_Speedup[4] = 2.878;
  216. Exec_Speedup[5] = 3.338;
  217. Exec_Speedup[6] = 4.241;
  218. Exec_Speedup[7] = 0.0;//5.073;
  219. Exec_Latencies[0] = 100000000;//384005;
  220. Exec_Latencies[1] = 431;//231583;
  221. Exec_Latencies[2] = 284;//157966;
  222. Exec_Latencies[3] = 220;//121222;
  223. Exec_Latencies[4] = 204;//101208;
  224. Exec_Latencies[5] = 150;//87852;
  225. Exec_Latencies[6] = 129;//78093;
  226. Exec_Latencies[7] = 102;//75690;
  227. #endif
  228. } else {
  229. printf("Unknown array size\n");
  230. exit(0);
  231. }
  232. } else if (executed_app == SVM) {
  233. /* N_sv 4096 D_sv 4096 */
  234. Exec_Speedup[0] = 1.0; /* 1 worker */
  235. Exec_Speedup[1] = 1.959;
  236. Exec_Speedup[2] = 2.919;
  237. Exec_Speedup[3] = 3.853;
  238. Exec_Speedup[4] = 4.777;
  239. Exec_Speedup[5] = 5.723;
  240. Exec_Speedup[6] = 6.644;
  241. Exec_Speedup[7] = 0.0;
  242. Exec_Latencies[0] = 578;
  243. Exec_Latencies[1] = 295;
  244. Exec_Latencies[2] = 198;
  245. Exec_Latencies[3] = 150;
  246. Exec_Latencies[4] = 121;
  247. Exec_Latencies[5] = 101;
  248. Exec_Latencies[6] = 87;
  249. Exec_Latencies[7] = 6; /* Irrelevant */
  250. } else if (executed_app == FFT) {
  251. Exec_Speedup[0] = 1.0; /* 1 worker */
  252. Exec_Speedup[1] = 1.55;
  253. Exec_Speedup[2] = 0;
  254. Exec_Speedup[3] = 0;
  255. Exec_Speedup[4] = 0;
  256. Exec_Speedup[5] = 0;
  257. Exec_Speedup[6] = 0;
  258. Exec_Speedup[7] = 0;
  259. Exec_Latencies[0] = 772;
  260. Exec_Latencies[1] = 498;
  261. Exec_Latencies[2] = 0;
  262. Exec_Latencies[3] = 0;
  263. Exec_Latencies[4] = 0;
  264. Exec_Latencies[5] = 0;
  265. Exec_Latencies[6] = 0;
  266. Exec_Latencies[7] = 0;
  267. } if (executed_app == ARTIFICIAL) {
  268. }
  269. }
  270. void app_init (char scen_directory[SCEN_DIR_SIZE], char scen_num[SCEN_NUM_SIZE]) {
  271. int i, j, pad_length = PADLENGTH;
  272. char buf[MAX_STR_NAME_SIZE], *buffer;
  273. FILE *matrix_input, *support_vectors_file, *coef_file, *test_vector_file, *umain_file, *umain2_file, *x_local_file;
  274. size_t bufsize = 32;
  275. if (executed_app == MATRIX_MUL) {
  276. cur_agent.array_size = MATRIX_ARRAY_SIZE;
  277. matrix = (int **) malloc(cur_agent.array_size * sizeof(int *));
  278. #ifdef PLAT_SCC
  279. strcpy(buf, "/shared/herc/");
  280. #else
  281. strcpy(buf, "../");
  282. #endif
  283. strcat(buf, scen_directory);
  284. strcat(buf, "/MATRIX-inputs/");
  285. strcat(buf, itoa(cur_agent.array_size));
  286. fprintf(log_file,"matrix file path = %s\n",buf);
  287. if ((matrix_input = fopen(buf, "r")) == NULL){
  288. printf("Cannot open input file with file path = %s ",buf);
  289. perror("open matrix_input");
  290. }
  291. for (i=0; i<cur_agent.array_size; i++) {
  292. matrix[i] = (int *) malloc(cur_agent.array_size * sizeof(int));
  293. for (j=0; j<cur_agent.array_size; j++)
  294. fscanf(matrix_input,"%d",&matrix[i][j]);
  295. }
  296. vector = (int *) malloc(cur_agent.array_size * sizeof(int));
  297. for (j=0; j<cur_agent.array_size; j++)
  298. fscanf(matrix_input,"%d",&vector[j]);
  299. fclose(matrix_input);
  300. } else if (executed_app == SVM) {
  301. #ifdef PLAT_SCC
  302. strcpy(buf, "/shared/herc/");
  303. #else
  304. strcpy(buf, "../");
  305. #endif
  306. strcat(buf,scen_directory);
  307. //strcat(buf,"/");
  308. //strcat(buf,scen_num);
  309. strcat(buf,"/SVM-inputs/support_vectors_N_sv_");
  310. strcat(buf,itoa(N_sv));
  311. strcat(buf,"_D_sv_");
  312. strcat(buf,itoa(D_sv));
  313. strcat(buf,".dat");
  314. fprintf(log_file,"svm file path = %s\n",buf);
  315. if ((support_vectors_file = fopen(buf,"r")) == NULL){
  316. printf("Cannot open input file with file path = %s ",buf);
  317. perror("open svm_input");
  318. }
  319. #ifdef PLAT_SCC
  320. strcpy(buf, "/shared/herc/");
  321. #else
  322. strcpy(buf, "../");
  323. #endif
  324. strcat(buf,scen_directory);
  325. //strcat(buf,"/");
  326. //strcat(buf,scen_num);
  327. strcat(buf,"/SVM-inputs/sv_coef_N_sv_");
  328. strcat(buf,itoa(N_sv));
  329. strcat(buf,"_D_sv_");
  330. strcat(buf,itoa(D_sv));
  331. strcat(buf,".dat");
  332. fprintf(log_file,"svm_coef file path = %s\n",buf);
  333. if ((coef_file = fopen(buf,"r")) == NULL){
  334. printf("Cannot open input file with file path = %s ",buf);
  335. perror("open svm_input");
  336. }
  337. #ifdef PLAT_SCC
  338. strcpy(buf, "/shared/herc/");
  339. #else
  340. strcpy(buf, "../");
  341. #endif
  342. strcat(buf,scen_directory);
  343. //strcat(buf,"/");
  344. //strcat(buf,scen_num);
  345. strcat(buf,"/SVM-inputs/test_vector_D_sv_");
  346. strcat(buf,itoa(D_sv));
  347. strcat(buf,".dat");
  348. fprintf(log_file,"test_vector file path = %s\n",buf);
  349. if ((test_vector_file = fopen(buf,"r")) == NULL){
  350. printf("Cannot open input file with file path = %s ",buf);
  351. perror("open svm_input");
  352. }
  353. svm_vectors = (float **)malloc(N_sv*sizeof(float *));
  354. if (svm_vectors == NULL){
  355. printf("--%d-- svm_vectors malloc fail!!\n", node_id);
  356. perror("malloc error");
  357. }
  358. svm_coef = (float *)malloc(N_sv*sizeof(float));
  359. if (svm_coef == NULL){
  360. printf("--%d-- svm_coef malloc fail!!\n", node_id);
  361. perror("malloc error");
  362. }
  363. buffer = (char *)malloc(bufsize * sizeof(char));
  364. for (i = 0; i < N_sv; i++) {
  365. svm_vectors[i] = (float *)malloc(D_sv*sizeof(float));
  366. if (svm_vectors[i] == NULL) {
  367. printf("--%d-- svm_vectors[%d] malloc fail!!\n", node_id, i);
  368. perror("malloc error");
  369. } else {
  370. for (j = 0; j < D_sv; j++) {
  371. /* Read support svm_vectors */
  372. if (j < D_sv){
  373. fscanf(support_vectors_file,"%f",&svm_vectors[i][j]);
  374. fgetc(support_vectors_file);
  375. }else{
  376. getline(&buffer,&bufsize,support_vectors_file);
  377. }
  378. }
  379. }
  380. }
  381. for (j = 0; j < N_sv; j++) {
  382. /* Read coefficients */
  383. fscanf(coef_file,"%f",&svm_coef[j]);
  384. fgetc(coef_file);
  385. }
  386. for (j = 0; j < D_sv; j++) {
  387. /* Read coefficients */
  388. fscanf(test_vector_file,"%f",&input_vector[j]);
  389. }
  390. cur_agent.array_size = -1;
  391. fclose(support_vectors_file);
  392. fclose(coef_file);
  393. fclose(test_vector_file);
  394. free(buffer);
  395. } else if (executed_app == FFT) {
  396. fprintf(log_file,"Initializing FFT application\n");
  397. x_local = (float *)malloc(2*(N+rootN*pad_length)*sizeof(float)+PAGE_SIZE);
  398. if (x_local == NULL){
  399. printf("Malloc error for x_local\n");
  400. perror("malloc error");
  401. exit(-1);
  402. }
  403. trans = (float *)malloc(2*(N+rootN*pad_length)*sizeof(float)+PAGE_SIZE);
  404. if (trans == NULL){
  405. printf("Malloc error for trans\n");
  406. perror("malloc error");
  407. exit(-1);
  408. }
  409. umain = (float *)malloc(2*rootN*sizeof(float));
  410. if (umain == NULL){
  411. printf("Malloc error for umain\n");
  412. perror("malloc error");
  413. exit(-1);
  414. }
  415. umain2 = (float *)malloc(2*(N+rootN*pad_length)*sizeof(float)+PAGE_SIZE);
  416. if (umain2 == NULL){
  417. printf("Malloc error for umain2\n");
  418. perror("malloc error");
  419. exit(-1);
  420. }
  421. upriv = (float *)malloc(2*(rootN-1)*sizeof(float));
  422. if (upriv == NULL){
  423. printf("--%d-- Malloc error for upriv\n", node_id);
  424. perror("malloc error");
  425. exit(-1);
  426. }
  427. #ifdef PLAT_SCC
  428. strcpy(buf, "/shared/herc/");
  429. #else
  430. strcpy(buf, "../");
  431. #endif
  432. strcat(buf,scen_directory);
  433. //strcat(buf,"/");
  434. //strcat(buf,scen_num);
  435. strcat(buf,"/FFT-inputs/umain_file");
  436. fprintf(log_file,"umain_file file path = %s\n",buf);
  437. if ((umain_file = fopen(buf,"r")) == NULL){
  438. printf("Cannot open input file with file path = %s ",buf);
  439. perror("open fft_input");
  440. }
  441. for (i=0; i<2*rootN; i++) {
  442. fscanf(umain_file,"%f",&umain[i]);
  443. }
  444. fclose(umain_file);
  445. #ifdef PLAT_SCC
  446. strcpy(buf, "/shared/herc/");
  447. #else
  448. strcpy(buf, "../");
  449. #endif
  450. strcat(buf,scen_directory);
  451. //strcat(buf,"/");
  452. //strcat(buf,scen_num);
  453. strcat(buf,"/FFT-inputs/umain2_file");
  454. fprintf(log_file,"umain2_file file path = %s\n",buf);
  455. if ((umain2_file = fopen(buf,"r")) == NULL){
  456. printf("Cannot open input file with file path = %s ",buf);
  457. perror("open umain_file");
  458. }
  459. //for (i=0; i<2*(N+rootN*pad_length)+PAGE_SIZE; i++) {
  460. for (i=0; i<2*(N+rootN*pad_length); i++) {
  461. fscanf(umain2_file,"%f",&umain2[i]);
  462. }
  463. fclose(umain2_file);
  464. #ifdef PLAT_SCC
  465. strcpy(buf, "/shared/herc/");
  466. #else
  467. strcpy(buf, "../");
  468. #endif
  469. strcat(buf,scen_directory);
  470. //strcat(buf,"/");
  471. //strcat(buf,scen_num);
  472. strcat(buf,"/FFT-inputs/x_local_file");
  473. fprintf(log_file,"x_local_file file path = %s\n",buf);
  474. if ((x_local_file = fopen(buf,"r")) == NULL){
  475. printf("Cannot open input file with file path = %s ",buf);
  476. perror("open x_local_file");
  477. }
  478. //for (i=0;i<2*(N+rootN*pad_length)+PAGE_SIZE;i++) {
  479. for (i=0; i<2*(N+rootN*pad_length); i++) {
  480. fscanf(x_local_file,"%f",&x_local[i]);
  481. }
  482. fclose(x_local_file);
  483. for (i = 0; i < 2*(rootN-1); i++){
  484. upriv[i] = umain[i];
  485. }
  486. } else if (executed_app == MATRIX_MUL) {
  487. }
  488. }
  489. int get_max_cores_count(app cur_app){
  490. /*if (cur_app.var < 1.0)
  491. return (int) ceilf(2.0*cur_app.A - 1);
  492. else
  493. return (int) ceilf(cur_app.A + cur_app.A*cur_app.var - cur_app.var);*/
  494. #ifdef SINGLE_WORKER
  495. return 2;
  496. #else
  497. if (executed_app == FFT) {
  498. return 3;
  499. } else {
  500. return MAX_WORKERS_COUNT;
  501. }
  502. #endif
  503. }
  504. float Speedup_Artificial_App(app cur_app, int num_of_cores) {
  505. float res=0;
  506. if (num_of_cores > 0) {
  507. if (cur_app.var < 1.0) {
  508. if (num_of_cores == 1) {
  509. res = 1;
  510. } else if ((num_of_cores > 1) && (num_of_cores < cur_app.A)) {
  511. res = (num_of_cores*cur_app.A) / (cur_app.A + (cur_app.var / 2.0*(num_of_cores-1)));
  512. } else if ((num_of_cores >= cur_app.A) && (num_of_cores < 2.0*cur_app.A - 1)) {
  513. res = (num_of_cores*cur_app.A) / (cur_app.var*(cur_app.A -0.5) + num_of_cores*(1.0 - 0.5*cur_app.var));
  514. } else {
  515. res = cur_app.A;
  516. }
  517. } else {
  518. if ((num_of_cores >= 1) && (num_of_cores <= (cur_app.A + cur_app.A*cur_app.var - cur_app.var))) {
  519. res = (num_of_cores*cur_app.A*(cur_app.var + 1)) / (cur_app.A + cur_app.var*(num_of_cores-1 + cur_app.A));
  520. } else {
  521. res = cur_app.A;
  522. }
  523. }
  524. }
  525. return res;
  526. }
  527. float Speedup(app cur_app, int num_of_cores) {
  528. if ((num_of_cores < 2) || (num_of_cores > get_max_cores_count(cur_app))) {
  529. return 0;
  530. } else {
  531. #ifndef ARTIFICIAL_APPS_SIM
  532. return Exec_Speedup[num_of_cores-2];
  533. #else
  534. return Speedup_Artificial_App(cur_app, num_of_cores-1);
  535. #endif
  536. }
  537. }
  538. int get_times(app cur_app, int num_of_cores) {
  539. /*
  540. int type;
  541. if (cur_app.array_size == 1024) type = 0;
  542. else if (cur_app.array_size == 2048) type = 1;
  543. else if (cur_app.array_size == 4096) type = 2;
  544. else {
  545. fprintf(log_file, "Unknown array size = %d\n",cur_app.array_size);
  546. fflush(log_file);
  547. return 0.0;
  548. }
  549. return (cur_app.workld * matr_times[type][num_of_cores-2]);
  550. */
  551. return (cur_app.workld * Exec_Latencies[num_of_cores-2]);
  552. }
  553. void matrix_transpose(int n1, float *src, float *dest, int node_id, int myFirst, int myLast, int pad_length){
  554. int i;
  555. int j;
  556. int k;
  557. int l;
  558. int m;
  559. int blksize;
  560. int numblks;
  561. int firstfirst;
  562. int h_off;
  563. int v_off;
  564. int v;
  565. int h;
  566. int n1p;
  567. int row_count;
  568. //fprintf(log_file,"I am inside matrix_transpose-0 node_id is %d n1 %d\n",node_id,n1);
  569. blksize = myLast-myFirst;
  570. numblks = (2*blksize)/num_cache_lines;
  571. if (numblks * num_cache_lines != 2 * blksize) {
  572. numblks ++;
  573. }
  574. blksize = blksize / numblks;
  575. firstfirst = myFirst;
  576. row_count = n1/P;
  577. n1p = n1+pad_length;
  578. for (l=node_id+1;l<P;l++) {
  579. v_off = l*row_count;
  580. for (k=0; k<numblks; k++) {
  581. h_off = firstfirst;
  582. for (m=0; m<numblks; m++) {
  583. for (i=0; i<blksize; i++) {
  584. v = v_off + i;
  585. for (j=0; j<blksize; j++) {
  586. h = h_off + j;
  587. //fprintf(log_file,"Index dest is %d\n",2*(h*n1p+v));
  588. //fprintf(log_file,"Index src is %d\n",2*(v*n1p+h));
  589. //fprintf(log_file,"src = %f\n",src[2*(v*n1p+h)]);
  590. //fprintf(log_file,"src + 1 = %f\n",src[2*(v*n1p+h)+1]);
  591. //fprintf(log_file,"dest = %f\n",dest[2*(h*n1p+v)]);
  592. //fprintf(log_file,"dest + 1 = %f\n",dest[2*(h*n1p+v)+1]);
  593. //fflush(log_file);
  594. dest[2*(h*n1p+v)] = src[2*(v*n1p+h)];
  595. dest[2*(h*n1p+v)+1] = src[2*(v*n1p+h)+1];
  596. //fprintf(log_file,"yolo\n");
  597. }
  598. }
  599. h_off += blksize;
  600. }
  601. v_off+=blksize;
  602. }
  603. }
  604. //fprintf(log_file,"I am inside matrix_transpose-A\n");
  605. for (l=0;l<node_id;l++) {
  606. v_off = l*row_count;
  607. for (k=0; k<numblks; k++) {
  608. h_off = firstfirst;
  609. for (m=0; m<numblks; m++) {
  610. for (i=0; i<blksize; i++) {
  611. v = v_off + i;
  612. for (j=0; j<blksize; j++) {
  613. h = h_off + j;
  614. dest[2*(h*n1p+v)] = src[2*(v*n1p+h)];
  615. dest[2*(h*n1p+v)+1] = src[2*(v*n1p+h)+1];
  616. }
  617. }
  618. h_off += blksize;
  619. }
  620. v_off+=blksize;
  621. }
  622. }
  623. //fprintf(log_file,"I am inside matrix_transpose-B\n");
  624. v_off = node_id*row_count;
  625. for (k=0; k<numblks; k++) {
  626. h_off = firstfirst;
  627. for (m=0; m<numblks; m++) {
  628. for (i=0; i<blksize; i++) {
  629. v = v_off + i;
  630. for (j=0; j<blksize; j++) {
  631. h = h_off + j;
  632. dest[2*(h*n1p+v)] = src[2*(v*n1p+h)];
  633. dest[2*(h*n1p+v)+1] = src[2*(v*n1p+h)+1];
  634. }
  635. }
  636. h_off += blksize;
  637. }
  638. v_off+=blksize;
  639. }
  640. //fprintf(log_file,"I am inside matrix_transpose-C\n");
  641. }
  642. //FFT1D(1, M, N, x_local, trans, upriv, umain2, work_id, lower_bound, upper_bound, pad_length, P);
  /*
   * Run the FFT over columns [myFirst, myLast) of x, using scratch as the
   * second buffer.  The data is handled as a 2^(M/2) square matrix of complex
   * values whose rows are pad_length elements longer than the matrix side.
   *
   * Steps: transpose x into scratch; for each owned column run a 1D FFT and
   * multiply by the umain2 factors; transpose scratch back into x; run the 1D
   * FFTs again; transpose x into scratch once more.  The parameter P is not
   * read here.
   */
  void FFT1D(int direction, int M, int N, float *x, float *scratch, float *upriv, float *umain2, int node_id, int myFirst, int myLast, int pad_length, int P){
      const int half_bits = M/2;
      const int side = 1 << half_bits;
      int col;

      matrix_transpose(side, x, scratch, node_id, myFirst, myLast, pad_length);

      /* First pass: 1D FFT plus twiddle multiplication on every owned column. */
      col = myFirst;
      while (col < myLast) {
          float *column = &scratch[2*col*(side+pad_length)];
          single_FFT1D(direction, half_bits, side, upriv, column);
          twiddle_Col(direction, side, N, col, umain2, column, pad_length);
          col++;
      }

      matrix_transpose(side, scratch, x, node_id, myFirst, myLast, pad_length);

      /* Second pass: 1D FFT only. */
      col = myFirst;
      while (col < myLast) {
          single_FFT1D(direction, half_bits, side, upriv, &x[2*col*(side+pad_length)]);
          col++;
      }

      matrix_transpose(side, x, scratch, node_id, myFirst, myLast, pad_length);
  }
  /*
   * Copy n1 complex values (2*n1 floats, interleaved real/imaginary) from src
   * to dest, in ascending index order.
   */
  void copyColumn(int n1, float *src, float *dest){
      int elem = 0;

      while (elem < n1) {
          const int re = 2*elem;
          const int im = re + 1;

          dest[re] = src[re];
          dest[im] = src[im];
          elem++;
      }
  }
  /*
   * In-place FFT of N = 2^M complex values stored in x as interleaved
   * (real, imaginary) floats.
   *
   * x is first permuted with reverse().  Then, for each stage q = 1..M, blocks
   * of L = 2^q elements are combined pairwise: the upper half of each block is
   * multiplied by a factor from u and added to / subtracted from the lower
   * half.  The factors for a stage with half-length L/2 start at complex index
   * L/2 - 1 of u; their imaginary parts are multiplied by direction.
   */
  void single_FFT1D(int direction, int M, int N, float *u, float *x){
      int stage;

      reverse(N, M, x);

      for (stage = 1; stage <= M; stage++) {
          const int span = 1 << stage;
          const int groups = N / span;
          const int half = span / 2;
          float *factors = &u[2*(half-1)];
          int g;

          for (g = 0; g < groups; g++) {
              float *lower = &x[2*(g*span)];
              float *upper = &x[2*(g*span+half)];
              int t;

              for (t = 0; t < half; t++) {
                  const float w_re = factors[2*t];
                  const float w_im = direction*factors[2*t+1];
                  const float up_re = upper[2*t];
                  const float up_im = upper[2*t+1];
                  const float prod_re = w_re*up_re - w_im*up_im;
                  const float prod_im = w_re*up_im + w_im*up_re;
                  const float lo_re = lower[2*t];
                  const float lo_im = lower[2*t+1];

                  upper[2*t] = lo_re - prod_re;
                  upper[2*t+1] = lo_im - prod_im;
                  lower[2*t] = lo_re + prod_re;
                  lower[2*t+1] = lo_im + prod_im;
              }
          }
      }
  }
  /*
   * Multiply the n1 complex values in x, in place, by the factors of row j of u.
   * Row j of u starts at complex index j*(n1+pad_length); both arrays hold
   * interleaved (real, imaginary) floats.  The imaginary part of each factor
   * is multiplied by direction before use.  N is not read.
   */
  void twiddle_Col(int direction, int n1, int N, int j, float *u, float *x, int pad_length){
      const int row_start = j*(n1+pad_length);
      int elem;

      for (elem = 0; elem < n1; elem++) {
          const float w_re = u[2*(row_start+elem)];
          const float w_im = direction*u[2*(row_start+elem)+1];
          const float in_re = x[2*elem];
          const float in_im = x[2*elem+1];

          x[2*elem] = w_re*in_re - w_im*in_im;
          x[2*elem+1] = w_re*in_im + w_im*in_re;
      }
  }
  /*
   * Permute the N complex values in x (interleaved real/imaginary floats) so
   * that element k trades places with element reverse_bit(M, k).  Each pair is
   * exchanged once, when the partner index is the larger of the two.
   */
  void reverse(int N, int M, float *x){
      int k;

      for (k = 0; k < N; k++){
          const int partner = reverse_bit(M, k);
          float held;

          if (partner <= k) {
              continue;
          }

          held = x[2*partner];
          x[2*partner] = x[2*k];
          x[2*k] = held;

          held = x[2*partner+1];
          x[2*partner+1] = x[2*k+1];
          x[2*k+1] = held;
      }
  }
  /*
   * Return the value formed by the low M bits of k in reversed order.
   * Returns 0 when M is zero or negative.
   */
  int reverse_bit(int M, int k){
      int mirrored = 0;
      int remaining = k;
      int bits_left;

      for (bits_left = M; bits_left > 0; bits_left--) {
          /* Shift the result up and drop in the current lowest bit of k. */
          mirrored = (mirrored << 1) | (remaining & 0x1);
          remaining >>= 1;
      }
      return mirrored;
  }