#include "apps.h"
#include "scc_signals.h"
#include "libfunctions.h"
#include "my_rtrm.h"
/* NOTE(review): the names of the next two headers are missing in SOURCE (only
 * the bare "#include" keywords survived). <math.h> and <time.h> are what the
 * visible code needs (exp(), nanosleep()/struct timespec) -- confirm against
 * the original file. */
#include <math.h>
#include <time.h>

#define SWAP(a,b) {float tmp; tmp=a; a=b; b=tmp;}
#define FFT_MAX 136192
#define PAGE_SIZE 4096
#define ARTIFICIAL_ROUND_DURATION_SEC 0.5
#define ARTIFICIAL_ROUND_DURATION_NSEC 500000000 /* 500 ms */

static float **svm_vectors, *svm_coef;
static int *vector, **matrix;
static float input_vector[D_sv];
//static float matr_speedup[NUM_OF_MATRICES][MAX_WORKERS_COUNT];
//static int matr_times[NUM_OF_MATRICES][MAX_WORKERS_COUNT];
/* Per-application tables filled by init_speedup_structs().  Speedup() and
 * get_times() read slot (num_of_cores - 2). */
static float Exec_Speedup[MAX_WORKERS_COUNT];
static int Exec_Latencies[MAX_WORKERS_COUNT];
//static float **vectors, *coef;
//2*(N+rootN*pad_length)*sizeof(float)+PAGE_SIZE);

static int P = 1; /* DEFAULT_P = 1 */
static int M = 16; /* DEFAULT_M = 10 */
static int N = 65536; /* N = 2^M */
static int rootN = 256; /* rootN = sqrt(N) */
static int num_cache_lines = 65536;
#define PADLENGTH 2
static float *x_local; /* x is the original time-domain data */
static float *trans; /* trans is used as scratch space */
static float *umain; /* umain is roots of unity for 1D FFTs */
static float *umain2; /* umain2 is entire roots of unity matrix*/
static float *upriv;

void execute_workload_svm (int lower_bound, int upper_bound);
void execute_workload_matrix (int lower_bound, int upper_bound);
void matrix_transpose(int n1, float *src, float *dest, int node_id, int myFirst, int myLast, int pad_length);
void FFT1D(int direction, int M, int N, float *x, float *scratch, float *upriv, float *umain2, int node_id, int myFirst, int myLast, int pad_length, int P);
void copyColumn(int n1, float *src, float *dest);
void single_FFT1D(int direction, int M, int N, float *u, float *x);
void twiddle_Col(int direction, int n1, int N, int j, float *u, float *x, int pad_length);
void reverse(int N, int M, float *x);
int reverse_bit(int M, int k);

/*
 * SVM workload for rows lower_bound..upper_bound (both inclusive).
 *
 * For every row i the squared distance between svm_vectors[i] and
 * input_vector is accumulated over D_sv components, then
 * exp(-gamma * distance) * svm_coef[i] is computed.  Once all rows are done
 * the values are truncated to int and stored in
 * manager_result_out[base_offset + i].
 *
 * base_offset is initialised on the first call (while it is still -1) to
 * cur_agent.my_agent * N_sv.
 */
void execute_workload_svm (int lower_bound, int upper_bound)
{
    int i = 0, j = 0;
    float local_sum[N_sv];
    /* int vector_id = 0; Removed 16.02. Only one test vector */

    if (base_offset == -1) {
        base_offset = cur_agent.my_agent * N_sv;
        //fprintf(log_file, "My agent is %d. Calculated base_offset is %d\n",cur_agent.my_agent,base_offset);
    }

    for (i = lower_bound; i <= upper_bound; i++) {
        float norma = 0;    /* squared distance for this row only */
        float diff = 0;

        local_sum[i] = 0;
        scc_signals_check();

        for (j = 0; j < D_sv; j++) {
            diff = svm_vectors[i][j] - input_vector[j];
            norma += diff*diff;
        }

        local_sum[i] += (float) (exp((double) (-gamma*norma))*svm_coef[i]);
    }

    /* Results are published only after the whole range has been computed. */
    for (i = lower_bound; i <= upper_bound; i++) {
        manager_result_out[base_offset+i] = (int) local_sum[i];
    }
}

/*
 * Matrix workload for rows lower_bound..upper_bound.
 *
 * NOTE(review): only the beginning of this function is present in SOURCE; see
 * the gap marker inside the inner loop header.
 */
void execute_workload_matrix (int lower_bound, int upper_bound)
{
    int i, j, local_sum[MAX_ARRAY_SIZE];

    if (base_offset == -1) {
        //matrix_out = (int*) shmat (cur_agent.segment_id, NULL, 0);
        base_offset = cur_agent.my_agent * MAX_ARRAY_SIZE;
    }

    for (i=lower_bound; i<=upper_bound; i++) {
        local_sum[i] = 0;
        scc_signals_check();
        //signals_enable();
        for (j=0; j /* NOTE(review): SOURCE text is missing at this point (it looks as if
                       everything from a '<' up to a later '>' was dropped).  The rest of
                       this loop, the end of execute_workload_matrix and the beginning of
                       the function that calls FFT1D below (execute_workload() refers to
                       it as execute_workload_fft) are not available.  The tokens on both
                       sides of the gap are reproduced exactly as found. */ 0) {
        work_id = 1;
    }

    FFT1D(1, M, N, x_local, trans, upriv, umain2, work_id, lower_bound, upper_bound, pad_length, P); //HACK node_id - 1 important!!
}

/*
 * Artificial workload: sleeps for one round instead of computing anything.
 *
 * The sleep lasts ARTIFICIAL_ROUND_DURATION_NSEC divided by
 * (upper_bound - lower_bound).  A span of zero would be an integer division
 * by zero, so it is treated as a speedup of 1 (one full round).  Other values
 * are passed on unchanged, exactly as before.
 */
void execute_workload_artificial (int lower_bound, int upper_bound)
{
    int AppSpeedup = upper_bound - lower_bound;
    struct timespec ts;

    if (AppSpeedup == 0) {
        AppSpeedup = 1;     /* avoid dividing by zero below */
    }

    //if (base_offset == -1) {
    //    base_offset = cur_agent.my_agent * MAX_ARRAY_SIZE; /* FIXME Why is it always MAX_ARRAY_SIZE */
    //}

    ts.tv_sec = 0;
    ts.tv_nsec = ARTIFICIAL_ROUND_DURATION_NSEC / AppSpeedup;
    nanosleep(&ts, NULL);

    /* for (i=lower_bound; i<=upper_bound; i++) { sleep(ARTIFICIAL_ROUND_DURATION_SEC); } */
    /* for (i=lower_bound; i<=upper_bound; i++) manager_result_out[base_offset+i] = -1; */
}

/*
 * Dispatches to the workload routine selected by executed_app.
 * Nothing is executed for an application value that is not listed here.
 */
void execute_workload (int lower_bound, int upper_bound)
{
    if (executed_app == MATRIX_MUL) {
        execute_workload_matrix (lower_bound, upper_bound);
    } else if (executed_app == SVM) {
        execute_workload_svm (lower_bound, upper_bound);
    } else if (executed_app == FFT) {
        execute_workload_fft (lower_bound, upper_bound);
    } else if (executed_app == ARTIFICIAL) {
        execute_workload_artificial (lower_bound, upper_bound);
    }
}

/* Number of elements of a true array (not a pointer). */
#define EXEC_PROFILE_COUNT(a) ((int) (sizeof(a) / sizeof((a)[0])))

/* Loads a speedup table and a latency table, each with its own length. */
#define LOAD_EXEC_PROFILE(s, l) \
    load_exec_profile((s), EXEC_PROFILE_COUNT(s), (l), EXEC_PROFILE_COUNT(l))

/*
 * Copies speedup_count values into Exec_Speedup and latency_count values into
 * Exec_Latencies, starting at slot 0.  Slots beyond the given counts are left
 * untouched.
 */
static void load_exec_profile(const float *speedup, int speedup_count,
                              const int *latencies, int latency_count)
{
    int i;

    for (i = 0; i < speedup_count; i++) {
        Exec_Speedup[i] = speedup[i];
    }
    for (i = 0; i < latency_count; i++) {
        Exec_Latencies[i] = latencies[i];
    }
}

/*
 * Fills Exec_Speedup / Exec_Latencies for the application in executed_app.
 *
 * MATRIX_MUL has one table pair per MATRIX_ARRAY_SIZE (1024, 2048, 4096) and
 * per platform (PLAT_SCC defined or not); any other size prints a message and
 * terminates the process.  SVM and FFT have one table pair each.  ARTIFICIAL
 * fills nothing.  Some PLAT_SCC tables are shorter than eight entries; the
 * slots they do not cover are not written.
 *
 * The "trailing comments in the original" lines keep the numbers that the
 * original code carried as comments next to each assignment.
 */
void init_speedup_structs (void)
{
    if (executed_app == MATRIX_MUL) {
        if (MATRIX_ARRAY_SIZE == 1024) {
#ifdef PLAT_SCC
            static const float speedup[] = { 1.0, 1.188, 2.264, 3.0, 3.429, 4.0, 8.0, 0.0 };
            /* trailing comments in the original: 29352, 15112, 11194, 10313, 8645, 7871, 6715 */
            static const int latencies[] = { 120, 101, 53, 40, 35, 30, 15 };
#else
            static const float speedup[] = { 1.0, 1.065, 1.270, 0.0, 0.0, 0.0, 0.0, 0.0 };
            /* trailing comments in the original: 29352, 15112, 11194, 10313, 8645, 7871, 6715, 7014 */
            static const int latencies[] = { 100000000, 31, 29, 24, 0, 0, 0, 0 };
#endif
            LOAD_EXEC_PROFILE(speedup, latencies);
        } else if (MATRIX_ARRAY_SIZE == 2048) {
#ifdef PLAT_SCC
            static const float speedup[] = { 1.0, 1.091, 1.2, 1.491, 1.791, 2.824, 3.0 };
            /* trailing comments in the original: 112276, 58880, 40305, 31705, 28309, 24512, 22239 */
            static const int latencies[] = { 240, 220, 200, 161, 134, 85, 80 };
            //matr_times[1][7] = 23;//20332;
#else
            /* trailing comment in the original on slot 7: 5.522 */
            static const float speedup[] = { 1.0, 1.331, 2.009, 2.315, 2.572, 0.0, 0.0, 0.0 };
            /* trailing comments in the original: 112276, 58880, 40305, 31705, 28309, 24512, 22239, 20332 */
            static const int latencies[] = { 100000000, 116, 87, 58, 50, 45, 0, 0 };
#endif
            LOAD_EXEC_PROFILE(speedup, latencies);
        } else if (MATRIX_ARRAY_SIZE == 4096) {
#ifdef PLAT_SCC
            static const float speedup[] = { 1.0, 2.001, 2.976, 4.032, 5.034, 6.25, 6.678, 6.819 };
            /* trailing comments in the original: 384005, 231583, 157966, 121222, 101208, 87852, 78093 */
            static const int latencies[] = { 750, 374, 252, 186, 149, 120, 110 };
#else
            /* trailing comment in the original on slot 7: 5.073 */
            static const float speedup[] = { 1.0, 1.517, 1.958, 2.112, 2.878, 3.338, 4.241, 0.0 };
            /* trailing comments in the original: 384005, 231583, 157966, 121222, 101208, 87852, 78093, 75690 */
            static const int latencies[] = { 100000000, 431, 284, 220, 204, 150, 129, 102 };
#endif
            LOAD_EXEC_PROFILE(speedup, latencies);
        } else {
            printf("Unknown array size\n");
            exit(0);
        }
    } else if (executed_app == SVM) {
        /* N_sv 4096 D_sv 4096 */
        static const float speedup[] = {
            1.0, /* 1 worker */
            1.959, 2.919, 3.853, 4.777, 5.723, 6.644, 0.0
        };
        static const int latencies[] = {
            578, 295, 198, 150, 121, 101, 87,
            6 /* Irrelevant */
        };

        LOAD_EXEC_PROFILE(speedup, latencies);
    } else if (executed_app == FFT) {
        static const float speedup[] = {
            1.0, /* 1 worker */
            1.55, 0, 0, 0, 0, 0, 0
        };
        static const int latencies[] = { 772, 498, 0, 0, 0, 0, 0, 0 };

        LOAD_EXEC_PROFILE(speedup, latencies);
    }

    if (executed_app == ARTIFICIAL) {
        /* intentionally empty: no table is loaded for the artificial application */
    }
}

/*
 * Application set-up.  For MATRIX_MUL the visible part allocates the row
 * pointer array and builds the input path
 * <prefix><scen_directory>/MATRIX-inputs/<array_size>, where the prefix is
 * "/shared/herc/" with PLAT_SCC and "../" otherwise, then opens that file.
 *
 * NOTE(review): only the beginning of this function is present in SOURCE; see
 * the gap marker in the loop header below.  The visible code keeps going
 * after a failed fopen() and builds the path with unbounded
 * strcpy()/strcat(); both are left as found because the remainder of the
 * function cannot be seen.
 */
void app_init (char scen_directory[SCEN_DIR_SIZE], char scen_num[SCEN_NUM_SIZE])
{
    int i, j, pad_length = PADLENGTH;
    char buf[MAX_STR_NAME_SIZE], *buffer;
    FILE *matrix_input, *support_vectors_file, *coef_file, *test_vector_file, *umain_file, *umain2_file, *x_local_file;
    size_t bufsize = 32;

    if (executed_app == MATRIX_MUL) {
        cur_agent.array_size = MATRIX_ARRAY_SIZE;
        matrix = (int **) malloc(cur_agent.array_size * sizeof(int *));

#ifdef PLAT_SCC
        strcpy(buf, "/shared/herc/");
#else
        strcpy(buf, "../");
#endif
        strcat(buf, scen_directory);
        strcat(buf, "/MATRIX-inputs/");
        strcat(buf, itoa(cur_agent.array_size));
        fprintf(log_file,"matrix file path = %s\n",buf);

        if ((matrix_input = fopen(buf, "r")) == NULL){
            printf("Cannot open input file with file path = %s ",buf);
            perror("open matrix_input");
        }

        for (i=0; i /* NOTE(review): SOURCE text is missing at this point.  The rest of
                       app_init and the beginning of the next function are not available;
                       that function is called as Speedup_Artificial_App(cur_app, n)
                       further down, and the #endif after it implies a conditional
                       directive that opened somewhere in the missing text.  The tokens
                       on both sides of the gap are reproduced exactly as found. */ 0) {
        if (cur_app.var < 1.0) {
            if (num_of_cores == 1) {
                res = 1;
            } else if ((num_of_cores > 1) && (num_of_cores < cur_app.A)) {
                res = (num_of_cores*cur_app.A) / (cur_app.A + (cur_app.var / 2.0*(num_of_cores-1)));
            } else if ((num_of_cores >= cur_app.A) && (num_of_cores < 2.0*cur_app.A - 1)) {
                res = (num_of_cores*cur_app.A) / (cur_app.var*(cur_app.A -0.5) + num_of_cores*(1.0 - 0.5*cur_app.var));
            } else {
                res = cur_app.A;
            }
        } else {
            /* For n=1, result is 1*/
            if ((num_of_cores >= 1) && (num_of_cores <= (cur_app.A + cur_app.A*cur_app.var - cur_app.var))) {
                res = (num_of_cores*cur_app.A*(cur_app.var + 1)) / (cur_app.A + cur_app.var*(num_of_cores-1 + cur_app.A));
            } else {
                res = cur_app.A;
            }
        }
    }

    return res;
}
#endif

/*
 * Speedup of cur_app when it runs on num_of_cores cores.
 *
 * Returns 0 when num_of_cores is below 2 or above get_max_cores_count(cur_app).
 * Without ARTIFICIAL_APPS_SIM the value comes from Exec_Speedup slot
 * (num_of_cores - 2); a slot beyond the table also yields 0 instead of
 * reading past the array.  With ARTIFICIAL_APPS_SIM the value is
 * Speedup_Artificial_App(cur_app, num_of_cores - 1).
 */
float Speedup(app cur_app, int num_of_cores)
{
    if ((num_of_cores < 2) || (num_of_cores > get_max_cores_count(cur_app))) {
        return 0;
    }

#ifndef ARTIFICIAL_APPS_SIM
    if ((num_of_cores - 2) >= MAX_WORKERS_COUNT) {
        return 0;   /* no table entry for this many cores */
    }
    return Exec_Speedup[num_of_cores-2];
#else
    return Speedup_Artificial_App(cur_app, num_of_cores-1);
#endif
}

/*
 * Estimated time of cur_app on num_of_cores cores.
 *
 * Without ARTIFICIAL_APPS_SIM this is cur_app.workld multiplied by
 * Exec_Latencies slot (num_of_cores - 2).  A slot outside the table used to
 * be read out of bounds; it now yields 0.
 * NOTE(review): 0 mirrors what Speedup() returns for an unusable core count;
 * confirm that callers treat it as "no data".
 */
int get_times(app cur_app, int num_of_cores)
{
#ifndef ARTIFICIAL_APPS_SIM
    int slot = num_of_cores - 2;

    if ((slot < 0) || (slot >= MAX_WORKERS_COUNT)) {
        return 0;
    }
    return (cur_app.workld * Exec_Latencies[slot]);
#else
    return cur_app.workld * (ARTIFICIAL_ROUND_DURATION_SEC / ((int) Speedup_Artificial_App(cur_app, num_of_cores+1))); /* FIXME cutting off floating points -- +1 is because in Speedup calc it is -1*/
#endif
}

/*
 * NOTE(review): only the set-up part of this function is present in SOURCE;
 * see the gap marker in the loop header below.
 */
void matrix_transpose(int n1, float *src, float *dest, int node_id, int myFirst, int myLast, int pad_length){
    int i;
    int j;
    int k;
    int l;
    int m;
    int blksize;
    int numblks;
    int firstfirst;
    int h_off;
    int v_off;
    int v;
    int h;
    int n1p;
    int row_count;

    //fprintf(log_file,"I am inside matrix_transpose-0 node_id is %d n1 %d\n",node_id,n1);
    blksize = myLast-myFirst;
    numblks = (2*blksize)/num_cache_lines;
    if (numblks * num_cache_lines != 2 * blksize) {
        numblks ++;     /* round the block count up */
    }
    blksize = blksize / numblks;
    firstfirst = myFirst;
    row_count = n1/P;
    n1p = n1+pad_length;

    for (l=node_id+1;l /* NOTE(review): SOURCE text is missing at this point.  The rest of
                          matrix_transpose, every following definition up to this spot
                          (FFT1D, copyColumn, single_FFT1D and twiddle_Col are declared
                          at the top of the file but no body is visible) and the
                          beginning of the function that ends below are not available.
                          The x[] swaps and the reverse() prototype suggest that
                          function is reverse() -- confirm.  The tokens on both sides of
                          the gap are reproduced exactly as found. */ k){
            SWAP(x[2*j], x[2*k]);
            SWAP(x[2*j+1], x[2*k+1]);
        }
    }

    return;
}

/*
 * Returns the value formed by the lowest M bits of k in reversed order: bit 0
 * of k becomes bit M-1 of the result, bit 1 becomes bit M-2, and so on.
 */
int reverse_bit(int M, int k){
    int result = 0;
    int remaining = k;
    int bit;

    for (bit = 0; bit < M; bit++) {
        result = 2*result + (remaining&0x1);   /* append the current lowest bit */
        remaining = remaining >> 1;
    }

    return result;
}