/* RCCE_eco_q.c.svn-base (12 KB) */
  /* This synthetic application assumes a three-dimensional
     domain of nx*ny*nz points that is decomposed into chunks
     of different sizes, which require different amounts
     of computational work.
  */
  #include "RCCE.h"
  #include "RCCE_pwr_wq.h"
  #include <stdio.h>
  #include <stdlib.h>
  9. #define min(x,y) ( (x) < (y) ? (x) : (y) )
  10. #define max(x,y) ( (x) > (y) ? (x) : (y) )
  11. int power_change = 1;
  12. int BASE_F = 5; /* baseline clock divider (320 MHz) */
  13. int HIGH_F = 3; /* high CPU clock divider (533 MHz) */
  14. void read_and_prep_data(int, int, int, int, double*);
  15. void do_work(int, int, int, int, int, int, int,
  16. double*, double*, double*, double*, RCCE_REQUEST*);
  17. #define NX 200
  18. #define NY 200
  19. #define NZ 100
  20. #define NCOMP 5
  21. #define NITER 10
  22. #define XZONEJAGS 4
  23. #define YZONEJAGS 4
  24. #define STEP 3
  25. typedef struct {
  26. struct {
  27. int seq_number;
  28. } dynamic_part;
  29. int npx;
  30. int npy;
  31. int kstart;
  32. int kend;
  33. int kwidth;
  34. int left;
  35. int right;
  36. int *isize;
  37. int *jsize;
  38. int *ksize;
  39. RCCE_REQUEST *request;
  40. } WORK_ITEM;
  41. int RCCE_WI_size(void *work_item) {
  42. return(sizeof(((WORK_ITEM *)work_item)->dynamic_part));
  43. }
  44. int RCCE_WI_valid(void *work_item) {
  45. return(((WORK_ITEM *)work_item)->dynamic_part.seq_number>=0);
  46. }
  47. void *RCCE_WI_address(void *work_item) {
  48. return((void *)(&(((WORK_ITEM *)work_item)->dynamic_part)));
  49. }
  50. int RCCE_APP(int argc, char **argv){
  51. int *isize, *jsize, *ksize;
  52. int ID, NP;
  53. int npx, npy, ix, iy, kstart, kend, kwidth, nrounds;
  54. int i, j, k, mem, ue, iter, fdiv, vlevel;
  55. int *team_member, team_size, team_lead, size, local_rank,
  56. left, right, master, master_number, *master_list;
  57. QUEUE_PARMS wq_pars;
  58. WORK_ITEM work_item;
  59. RCCE_REQUEST request;
  60. RCCE_init(&argc, &argv);
  61. NP = wq_pars.NP = RCCE_num_ues();
  62. ID = wq_pars.ID = RCCE_ue();
  63. if (argc < 4) {
  64. if (ID==0) printf("Error: Need two parameters, x & y tiles, plus # rounds\n");
  65. return(1);
  66. }
  67. /* read the number of subdomains (x & y-direction) from the command line */
  68. npx = work_item.npx = atoi(*++argv);
  69. npy = work_item.npy = atoi(*++argv);
  70. /* test validity of the requested tiling; each tile must be large enough to
  71. divide the z-dimension amoung the members of the team */
  72. if (npx <= 0 || npy <= 0 || npx > NX || npy > NY) {
  73. if (ID==0) printf("Illegal tiling: %d, %d\n", npx, npy);
  74. RCCE_finalize();
  75. return(1);
  76. }
  77. nrounds = atoi(*++argv);
  78. if (nrounds <= 0) {power_change=0; nrounds = -nrounds;}
  79. RCCE_debug_set(RCCE_DEBUG_ALL);
  80. /* lower power req until we need it */
  81. if (power_change) RCCE_iset_power(BASE_F, &request, &fdiv, &vlevel);
  82. /* form teams; copy results to local variables */
  83. RCCE_setup_work_queue_teams(&wq_pars);
  84. master = wq_pars.master;
  85. team_lead = wq_pars.team_lead;
  86. local_rank = wq_pars.local_rank;
  87. team_size = wq_pars.team_size;
  88. team_member = wq_pars.team_member;
  89. master_list = wq_pars.master_list;
  90. if (team_size > NZ) {
  91. if (ID==0) printf("Error: NZ too small: %d\n", NZ);
  92. RCCE_finalize();
  93. return(1);
  94. }
  95. /* define left and right neighbors */
  96. if (local_rank>0) work_item.left = team_member[local_rank-1];
  97. else work_item.left = -1;
  98. if (local_rank<team_size-1) work_item.right = team_member[local_rank+1];
  99. else work_item.right = -1;
  100. if (ID != master) {
  101. /* allocate space for the sizes of the subdomains */
  102. isize = (int *) malloc(sizeof(int)*npx);
  103. jsize = (int *) malloc(sizeof(int)*npy);
  104. ksize = (int *) malloc(sizeof(int)*team_size);
  105. if (!isize || !jsize || !ksize) {
  106. printf("Could not allocate space for tile sizes\n");
  107. return(1);
  108. }
  109. for (k=0; k<team_size; k++) {
  110. ksize[k] = NZ/team_size;
  111. /* adjust for any leftover points */
  112. if (k<(NZ%team_size)) ksize[k]++;
  113. }
  114. for (kstart=0, k=0; k<local_rank; k++) kstart += ksize[k];
  115. kend = kstart + ksize[local_rank] -1;
  116. kwidth = work_item.kwidth = ksize[local_rank]+2;
  117. work_item.kstart = kstart;
  118. work_item.kend = kend;
  119. /* introduce load imbalance among subdomains by perturbing their sizes */
  120. for (i=0; i<npx-1; i++) isize[i] = NX/npx;
  121. isize[npx-1] = NX-(NX/npx)*(npx-1);
  122. for (iter=0; iter<XZONEJAGS; iter++)
  123. for (i=1; i<npx; i+=2) if (isize[i-1] > i) {
  124. isize[i-1] -= i;
  125. isize[i] += i;
  126. }
  127. for (j=0; j<npy-1; j++) jsize[j] = NY/npy;
  128. jsize[npy-1] = NY-(NY/npy)*(npy-1);
  129. for (iter=0; iter<YZONEJAGS; iter++)
  130. for (j=1; j<npy; j+=2) if (jsize[j-1] > j) {
  131. jsize[j-1] -= j;
  132. jsize[j] += j;
  133. }
  134. }
  135. work_item.dynamic_part.seq_number = 0;
  136. work_item.request = &request;
  137. work_item.isize = isize;
  138. work_item.jsize = jsize;
  139. work_item.ksize = ksize;
  140. WORK_ITEM *wi = &work_item;
  141. /* master goes into a loop, servicing work requests */
  142. if (ID==master) {
  143. int tasks_completed = 0;
  144. while (tasks_completed<nrounds) {
  145. tasks_completed += RCCE_queue_master_loop((void *)&work_item, &wq_pars);
  146. }
  147. /* master creates one more work loop to end all teams */
  148. work_item.dynamic_part.seq_number = -1;
  149. RCCE_queue_master_loop((void *)&work_item, &wq_pars);
  150. }
  151. /* teams go into an endless loop, executing tasks and asking for new
  152. ones when they are done */
  153. else {
  154. int error = 0;
  155. while (!error) {
  156. error=RCCE_queue_member_loop((void *)(&work_item), &wq_pars);
  157. }
  158. }
  159. RCCE_finalize();
  160. return (0);
  161. }
  162. int RCCE_execute_work_item(void *work_item, QUEUE_PARMS *wq_pars) {
  163. int ix, iy, words, fdiv, vlevel;
  164. double *data_frame, *flux_x, *flux_y, *flux_z;
  165. WORK_ITEM *wi;
  166. wi = (WORK_ITEM *)work_item;
  167. ix = (wi->dynamic_part.seq_number)%(wi->npx);
  168. iy = (wi->dynamic_part.seq_number)/(wi->npx);
  169. words = wi->isize[ix]*wi->jsize[iy]*(wi->kwidth)*NCOMP;
  170. data_frame = (double *) malloc(4*words*sizeof(double));
  171. if (!data_frame) {
  172. printf("Could not allocate %d words on UE %d\n", words, RCCE_ue());
  173. return(1);
  174. }
  175. flux_x = data_frame + 1*words;
  176. flux_y = data_frame + 2*words;
  177. flux_z = data_frame + 3*words;
  178. read_and_prep_data(wi->isize[ix], wi->jsize[iy], wi->kstart, wi->kend, data_frame);
  179. /* entering a high-cpu-intensity segment of the code */
  180. if (power_change) RCCE_wait_power(wi->request);
  181. if (power_change) RCCE_iset_power(HIGH_F, wi->request, &fdiv, &vlevel);
  182. do_work(wi->isize[ix], wi->jsize[iy], wi->kstart, wi->kend, wi->left, wi->right,
  183. wq_pars->local_rank, data_frame, flux_x, flux_y, flux_z, wi->request);
  184. free(data_frame);
  185. return(0);
  186. }
  187. #define FR(c,i,j,k) data_frame[(c)+NCOMP*((i)+in*((j)+(k-kstart+1)*jn))]
  188. void read_and_prep_data(int in, int jn, int kstart, int kend, double *data_frame) {
  189. int i, j, k, c;
  190. /* initialize with smooth data */
  191. for (k=kstart; k<=kend; k++) for (j=0; j<jn; j++) for (i=0; i<in; i++) {
  192. FR(0,i,j,k) = 1.0;
  193. FR(1,i,j,k) = (double)(k-j)+10.0;
  194. FR(2,i,j,k) = (double)(i-k)+20.0;
  195. FR(3,i,j,k) = (double)(j-i)+30.0;
  196. FR(4,i,j,k) = 100.0;
  197. }
  198. /* add jaggedness */
  199. for (k=kstart; k<=kend; k++) {
  200. for (j=0; j<jn; j+=2) {
  201. for (i=0; i<in; i+=2) for (c=0; c<NCOMP; c++) FR(c,i,j,k) -= 1.0;
  202. for (i=1; i<in; i+=2) for (c=0; c<NCOMP; c++) FR(c,i,j,k) += 1.0;
  203. }
  204. for (j=1; j<jn; j+=2) {
  205. for (i=0; i<in; i+=2) for (c=0; c<NCOMP; c++) FR(c,i,j,k) -= 1.0;
  206. for (i=1; i<in; i+=2) for (c=0; c<NCOMP; c++) FR(c,i,j,k) += 1.0;
  207. }
  208. }
  209. return;
  210. }
  211. #define FLUX_X(c,i,j,k) flux_x[(c)+NCOMP*((i)+in*((j)+(k-kstart+1)*jn))]
  212. #define FLUX_Y(c,i,j,k) flux_y[(c)+NCOMP*((i)+in*((j)+(k-kstart+1)*jn))]
  213. #define FLUX_Z(c,i,j,k) flux_z[(c)+NCOMP*((i)+in*((j)+(k-kstart+1)*jn))]
  214. void do_work(int in, int jn, int kstart, int kend, int left, int right, int rank,
  215. double *data_frame, double *flux_x, double *flux_y, double *flux_z,
  216. RCCE_REQUEST *request) {
  217. int i, j, k, c, iter, phase, fdiv, vlevel;
  218. double vx = 1.0, vy = 1.0, vz = 1.0;
  219. double dt = 0.0001;
  220. double mu = 1.0;
  221. for (iter=0; iter<NITER; iter++) {
  222. if (iter==2 && power_change) {
  223. RCCE_wait_power(request);
  224. }
  225. if (iter==NITER-2 & power_change) {
  226. RCCE_iset_power(BASE_F, request, &fdiv, &vlevel);
  227. }
  228. /* before each iteration we need to fill ghost points with neighbor data */
  229. for (phase=0; phase<2; phase++) {
  230. if (right != -1 && (rank+phase+1)%2) {
  231. RCCE_send((char *)(&FR(0,0,0,kend)),in*jn*NCOMP*sizeof(double), right);
  232. }
  233. if (left != -1 && (rank+phase)%2) {
  234. RCCE_recv((char *)(&FR(0,0,0,kstart-1)),in*jn*NCOMP*sizeof(double), left);
  235. }
  236. }
  237. for (phase=0; phase<2; phase++) {
  238. if (left != -1 && (rank+phase+1)%2)
  239. RCCE_send((char *)(&FR(0,0,0,kstart)),in*jn*NCOMP*sizeof(double), left);
  240. if (right != -1 && (rank+phase)%2)
  241. RCCE_recv((char *)(&FR(0,0,0,kend+1)),in*jn*NCOMP*sizeof(double), right);
  242. }
  243. for (k=max(kstart,1); k<=min(NZ-2,kend); k++) for (j=1; j<jn-1; j++)
  244. for (i=1; i<in-1; i++)
  245. for (c=0; c<NCOMP; c++){
  246. FLUX_X(c,i,j,k) =
  247. (3.0*FR(c,i+1,j+1,k ) - 4.0*FR(c,i,j+1,k ) + FR(c,i-1,j+1,k ))/16.0 +
  248. (3.0*FR(c,i+1,j ,k+1) - 4.0*FR(c,i,j, k+1) + FR(c,i-1,j, k+1))/16.0 +
  249. (3.0*FR(c,i+1,j+1,k+1) - 4.0*FR(c,i,j+1,k+1) + FR(c,i-1,j+1,k+1))/32.0 +
  250. (3.0*FR(c,i+1,j-1,k ) - 4.0*FR(c,i,j-1,k ) + FR(c,i-1,j-1,k ))/16.0 +
  251. (3.0*FR(c,i+1,j ,k-1) - 4.0*FR(c,i,j, k-1) + FR(c,i-1,j, k-1))/16.0 +
  252. (3.0*FR(c,i+1,j-1,k-1) - 4.0*FR(c,i,j-1,k-1) + FR(c,i-1,j-1,k-1))/32.0 +
  253. (3.0*FR(c,i+1,j-1,k+1) - 4.0*FR(c,i,j-1,k+1) + FR(c,i-1,j-1,k+1))/32.0 +
  254. (3.0*FR(c,i+1,j+1,k-1) - 4.0*FR(c,i,j+1,k-1) + FR(c,i-1,j+1,k-1))/32.0 +
  255. (3.0*FR(c,i+1,j ,k ) - 4.0*FR(c,i,j, k ) + FR(c,i-1,j, k ))/8.0;
  256. FLUX_Y(c,i,j,k) =
  257. (3.0*FR(c,i+1,j+1,k ) - 4.0*FR(c,i+1,j,k ) + FR(c,i+1,j-1,k ))/16.0 +
  258. (3.0*FR(c,i ,j+1,k+1) - 4.0*FR(c,i ,j,k+1) + FR(c,i ,j-1,k+1))/16.0 +
  259. (3.0*FR(c,i+1,j+1,k+1) - 4.0*FR(c,i+1,j,k+1) + FR(c,i+1,j-1,k+1))/32.0 +
  260. (3.0*FR(c,i-1,j+1,k ) - 4.0*FR(c,i-1,j,k ) + FR(c,i-1,j-1,k ))/16.0 +
  261. (3.0*FR(c,i ,j+1,k-1) - 4.0*FR(c,i ,j,k-1) + FR(c,i ,j-1,k-1))/16.0 +
  262. (3.0*FR(c,i-1,j+1,k-1) - 4.0*FR(c,i-1,j,k-1) + FR(c,i-1,j-1,k-1))/32.0 +
  263. (3.0*FR(c,i-1,j+1,k+1) - 4.0*FR(c,i-1,j,k+1) + FR(c,i-1,j-1,k+1))/32.0 +
  264. (3.0*FR(c,i+1,j+1,k-1) - 4.0*FR(c,i+1,j,k-1) + FR(c,i+1,j-1,k-1))/32.0 +
  265. (3.0*FR(c,i ,j+1,k ) - 4.0*FR(c,i ,j,k ) + FR(c,i ,j-1,k ))/8.0;
  266. FLUX_Y(c,i,j,k) =
  267. (3.0*FR(c,i+1,j ,k+1) - 4.0*FR(c,i+1,j ,k) + FR(c,i+1,j ,k-1))/16.0 +
  268. (3.0*FR(c,i ,j+1,k+1) - 4.0*FR(c,i ,j+1,k) + FR(c,i ,j+1,k-1))/16.0 +
  269. (3.0*FR(c,i+1,j+1,k+1) - 4.0*FR(c,i+1,j+1,k) + FR(c,i+1,j+1,k-1))/32.0 +
  270. (3.0*FR(c,i-1,j ,k+1) - 4.0*FR(c,i-1,j ,k) + FR(c,i-1,j ,k-1))/16.0 +
  271. (3.0*FR(c,i ,j-1,k+1) - 4.0*FR(c,i ,j-1,k) + FR(c,i ,j-1,k-1))/16.0 +
  272. (3.0*FR(c,i-1,j-1,k+1) - 4.0*FR(c,i-1,j-1,k) + FR(c,i-1,j-1,k-1))/32.0 +
  273. (3.0*FR(c,i-1,j+1,k+1) - 4.0*FR(c,i-1,j+1,k) + FR(c,i-1,j+1,k-1))/32.0 +
  274. (3.0*FR(c,i+1,j-1,k+1) - 4.0*FR(c,i+1,j-1,k) + FR(c,i+1,j-1,k-1))/32.0 +
  275. (3.0*FR(c,i ,j ,k+1) - 4.0*FR(c,i ,j ,k) + FR(c,i ,j ,k-1))/8.0;
  276. FR(c,i,j,k) += dt*(
  277. -1.0*(vx*FLUX_X(c,i,j,k) + vy*FLUX_Y(c,i,j,k) + vz*FLUX_Z(c,i,j,k)) +
  278. FR(c,i+1,j,k) -2.0*FR(c,i,j,k) + FR(c,i-1,j,k) +
  279. FR(c,i,j+1,k) -2.0*FR(c,i,j,k) + FR(c,i,j-1,k) +
  280. FR(c,i,j,k+1) -2.0*FR(c,i,j,k) + FR(c,i,j,k-1));
  281. }
  282. }
  283. return;
  284. }
  285. int RCCE_new_work_item(void *work_item, QUEUE_PARMS *wq_pars) {
  286. WORK_ITEM *wi = (WORK_ITEM *)work_item;
  287. wi->dynamic_part.seq_number = (wi->dynamic_part.seq_number+1)%(wi->npx*wi->npy);
  288. return(RCCE_SUCCESS);
  289. }