Parcourir la source

Added SCC libraries

Bill Tsou il y a 6 ans
Parent
commit
d80b6a89a3
100 fichiers modifiés avec 13161 ajouts et 1 suppressions
  1. 1 1
      Makefile
  2. 47 0
      RCCE_V2.0/.svn/all-wcprops
  3. 290 0
      RCCE_V2.0/.svn/entries
  4. 5 0
      RCCE_V2.0/.svn/prop-base/build_stress_test.svn-base
  5. 5 0
      RCCE_V2.0/.svn/prop-base/configure.svn-base
  6. 5 0
      RCCE_V2.0/.svn/prop-base/run_stress_test.svn-base
  7. 5 0
      RCCE_V2.0/.svn/prop-base/sourcing.svn-base
  8. 15 0
      RCCE_V2.0/.svn/text-base/COPYING.svn-base
  9. 116 0
      RCCE_V2.0/.svn/text-base/Makefile.svn-base
  10. 153 0
      RCCE_V2.0/.svn/text-base/README.svn-base
  11. 79 0
      RCCE_V2.0/.svn/text-base/build_stress_test.svn-base
  12. 91 0
      RCCE_V2.0/.svn/text-base/configure.svn-base
  13. 127 0
      RCCE_V2.0/.svn/text-base/run_stress_test.svn-base
  14. 2 0
      RCCE_V2.0/.svn/text-base/sourcing.svn-base
  15. 15 0
      RCCE_V2.0/COPYING
  16. 116 0
      RCCE_V2.0/Makefile
  17. 153 0
      RCCE_V2.0/README
  18. 17 0
      RCCE_V2.0/apps/.svn/all-wcprops
  19. 123 0
      RCCE_V2.0/apps/.svn/entries
  20. 5 0
      RCCE_V2.0/apps/.svn/prop-base/hpl.svn-base
  21. 1 0
      RCCE_V2.0/apps/.svn/text-base/README.svn-base
  22. 1 0
      RCCE_V2.0/apps/.svn/text-base/hpl.svn-base
  23. 35 0
      RCCE_V2.0/apps/ECOQ/.svn/all-wcprops
  24. 198 0
      RCCE_V2.0/apps/ECOQ/.svn/entries
  25. 24 0
      RCCE_V2.0/apps/ECOQ/.svn/text-base/Makefile.svn-base
  26. 332 0
      RCCE_V2.0/apps/ECOQ/.svn/text-base/RCCE_eco_q.c.svn-base
  27. 23 0
      RCCE_V2.0/apps/ECOQ/.svn/text-base/RCCE_pwr_wq.h.svn-base
  28. 156 0
      RCCE_V2.0/apps/ECOQ/.svn/text-base/RCCE_pwr_wq_framework.c.svn-base
  29. 2 0
      RCCE_V2.0/apps/ECOQ/.svn/text-base/README.svn-base
  30. 24 0
      RCCE_V2.0/apps/ECOQ/Makefile
  31. 332 0
      RCCE_V2.0/apps/ECOQ/RCCE_eco_q.c
  32. 23 0
      RCCE_V2.0/apps/ECOQ/RCCE_pwr_wq.h
  33. 156 0
      RCCE_V2.0/apps/ECOQ/RCCE_pwr_wq_framework.c
  34. 2 0
      RCCE_V2.0/apps/ECOQ/README
  35. 17 0
      RCCE_V2.0/apps/FLUSH/.svn/all-wcprops
  36. 96 0
      RCCE_V2.0/apps/FLUSH/.svn/entries
  37. 19 0
      RCCE_V2.0/apps/FLUSH/.svn/text-base/Makefile.svn-base
  38. 90 0
      RCCE_V2.0/apps/FLUSH/.svn/text-base/RCCE_test_cacheable.c.svn-base
  39. 19 0
      RCCE_V2.0/apps/FLUSH/Makefile
  40. 90 0
      RCCE_V2.0/apps/FLUSH/RCCE_test_cacheable.c
  41. 17 0
      RCCE_V2.0/apps/HELLO/.svn/all-wcprops
  42. 96 0
      RCCE_V2.0/apps/HELLO/.svn/entries
  43. 19 0
      RCCE_V2.0/apps/HELLO/.svn/text-base/Makefile.svn-base
  44. 37 0
      RCCE_V2.0/apps/HELLO/.svn/text-base/RCCE_hello.c.svn-base
  45. 19 0
      RCCE_V2.0/apps/HELLO/Makefile
  46. 37 0
      RCCE_V2.0/apps/HELLO/RCCE_hello.c
  47. BIN
      RCCE_V2.0/apps/HELLO/RCCE_hello.o
  48. 11 0
      RCCE_V2.0/apps/NPB/.svn/all-wcprops
  49. 77 0
      RCCE_V2.0/apps/NPB/.svn/entries
  50. 50 0
      RCCE_V2.0/apps/NPB/.svn/text-base/Makefile.svn-base
  51. 179 0
      RCCE_V2.0/apps/NPB/BT/.svn/all-wcprops
  52. 1014 0
      RCCE_V2.0/apps/NPB/BT/.svn/entries
  53. 65 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/Makefile.svn-base
  54. 44 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/add.c.svn-base
  55. 34 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/adi.c.svn-base
  56. 8 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/applu_macros.h.svn-base
  57. 38 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/applu_protos.h.svn-base
  58. 60 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/applu_share.h.svn-base
  59. 216 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/bt.c.svn-base
  60. 338 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/copy_faces.c.svn-base
  61. 78 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/define.c.svn-base
  62. 121 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/error.c.svn-base
  63. 375 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/exact_rhs.c.svn-base
  64. 43 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/exact_solution.c.svn-base
  65. 287 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/header.h.svn-base
  66. 321 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/initialize.c.svn-base
  67. 5 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/inputbt.data.sample.svn-base
  68. 222 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/make_set.c.svn-base
  69. 34 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/mpinpb.h.svn-base
  70. 104 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/print_results.c.svn-base
  71. 439 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/rhs.c.svn-base
  72. 220 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/set_constants.c.svn-base
  73. 60 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/setup_mpi.c.svn-base
  74. 647 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/solve_subs.c.svn-base
  75. 59 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/timers.c.svn-base
  76. 4 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/timers.h.svn-base
  77. 380 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/verify.c.svn-base
  78. 33 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/work_lhs.h.svn-base
  79. 632 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/x_solve.c.svn-base
  80. 646 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/y_solve.c.svn-base
  81. 689 0
      RCCE_V2.0/apps/NPB/BT/.svn/text-base/z_solve.c.svn-base
  82. 65 0
      RCCE_V2.0/apps/NPB/BT/Makefile
  83. 44 0
      RCCE_V2.0/apps/NPB/BT/add.c
  84. 34 0
      RCCE_V2.0/apps/NPB/BT/adi.c
  85. 8 0
      RCCE_V2.0/apps/NPB/BT/applu_macros.h
  86. 38 0
      RCCE_V2.0/apps/NPB/BT/applu_protos.h
  87. 60 0
      RCCE_V2.0/apps/NPB/BT/applu_share.h
  88. 216 0
      RCCE_V2.0/apps/NPB/BT/bt.c
  89. 338 0
      RCCE_V2.0/apps/NPB/BT/copy_faces.c
  90. 78 0
      RCCE_V2.0/apps/NPB/BT/define.c
  91. 121 0
      RCCE_V2.0/apps/NPB/BT/error.c
  92. 375 0
      RCCE_V2.0/apps/NPB/BT/exact_rhs.c
  93. 43 0
      RCCE_V2.0/apps/NPB/BT/exact_solution.c
  94. 287 0
      RCCE_V2.0/apps/NPB/BT/header.h
  95. 321 0
      RCCE_V2.0/apps/NPB/BT/initialize.c
  96. 5 0
      RCCE_V2.0/apps/NPB/BT/inputbt.data.sample
  97. 222 0
      RCCE_V2.0/apps/NPB/BT/make_set.c
  98. 34 0
      RCCE_V2.0/apps/NPB/BT/mpinpb.h
  99. 104 0
      RCCE_V2.0/apps/NPB/BT/print_results.c
  100. 0 0
      RCCE_V2.0/apps/NPB/BT/rhs.c

+ 1 - 1
Makefile

@@ -7,7 +7,7 @@
 ifeq ($(PLATFORM),SCC)
 	CFLAGS = -Wall -g
 	SHELL=sh
-	RCCEROOT=../../bRCCE_V2.0
+	RCCEROOT=./bRCCE_V2.0
 	include $(RCCEROOT)/common/symbols
 	PLATFORM_INCLUDES = $(RCCEINCLUDE)/RCCE.h
 	MY_FLAGS += -DPLAT_SCC

+ 47 - 0
RCCE_V2.0/.svn/all-wcprops

@@ -0,0 +1,47 @@
+K 25
+svn:wc:ra_dav:version-url
+V 43
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0
+END
+run_stress_test
+K 25
+svn:wc:ra_dav:version-url
+V 59
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/run_stress_test
+END
+COPYING
+K 25
+svn:wc:ra_dav:version-url
+V 51
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/COPYING
+END
+sourcing
+K 25
+svn:wc:ra_dav:version-url
+V 52
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/sourcing
+END
+Makefile
+K 25
+svn:wc:ra_dav:version-url
+V 52
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/Makefile
+END
+README
+K 25
+svn:wc:ra_dav:version-url
+V 50
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/README
+END
+build_stress_test
+K 25
+svn:wc:ra_dav:version-url
+V 61
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/build_stress_test
+END
+configure
+K 25
+svn:wc:ra_dav:version-url
+V 53
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/configure
+END

+ 290 - 0
RCCE_V2.0/.svn/entries

@@ -0,0 +1,290 @@
+10
+
+dir
+313
+http://marcbug.scc-dc.com/svn/repository/tags/RCCE_V2.0
+http://marcbug.scc-dc.com/svn/repository
+
+
+
+2012-01-10T18:47:23.474723Z
+297
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+c924d837-3317-4ba4-8fbd-5f2da8699d51
+
+COPYING
+file
+
+
+
+
+2012-10-27T13:42:44.532598Z
+cfbe8de91e3af34fbef42d72d9634772
+2010-06-25T23:28:47.346002Z
+7
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+632
+
+bin
+dir
+
+man
+dir
+
+Makefile
+file
+
+
+
+
+2012-10-27T13:42:44.532598Z
+6a9f5ce78890306001f18338b1c92b5b
+2011-08-03T21:54:30.632236Z
+242
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+4971
+
+configure
+file
+
+
+
+
+2012-10-27T13:42:44.532598Z
+b1ee980b729ce38a0efa700a2c11d334
+2012-01-07T00:22:43.621517Z
+295
+tekubasx
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+3167
+
+include
+dir
+
+src
+dir
+
+common
+dir
+
+sourcing
+file
+
+
+
+
+2012-10-27T13:42:44.532598Z
+3a02a4f19dbba66e5d8349b0998a5dc9
+2010-08-27T16:04:17.032086Z
+45
+tekubasx
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+84
+
+README
+file
+
+
+
+
+2012-10-27T13:42:44.532598Z
+3a4fad3d2b9fdf74aff8d883184cfb90
+2012-01-07T00:47:20.951110Z
+296
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+6682
+
+utils
+dir
+
+hosts
+dir
+
+build_stress_test
+file
+
+
+
+
+2012-10-27T13:42:44.532598Z
+5c2f69e9213cc23350557c9463ac070f
+2010-06-25T23:28:47.346002Z
+7
+tekubasx
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2649
+
+apps
+dir
+
+run_stress_test
+file
+
+
+
+
+2012-10-27T13:42:44.532598Z
+b1a6ad1e7238f3ed92ea363f4070d1f7
+2010-06-25T23:28:47.346002Z
+7
+tekubasx
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+4605
+

+ 5 - 0
RCCE_V2.0/.svn/prop-base/build_stress_test.svn-base

@@ -0,0 +1,5 @@
+K 14
+svn:executable
+V 0
+
+END

+ 5 - 0
RCCE_V2.0/.svn/prop-base/configure.svn-base

@@ -0,0 +1,5 @@
+K 14
+svn:executable
+V 0
+
+END

+ 5 - 0
RCCE_V2.0/.svn/prop-base/run_stress_test.svn-base

@@ -0,0 +1,5 @@
+K 14
+svn:executable
+V 0
+
+END

+ 5 - 0
RCCE_V2.0/.svn/prop-base/sourcing.svn-base

@@ -0,0 +1,5 @@
+K 14
+svn:executable
+V 1
+*
+END

+ 15 - 0
RCCE_V2.0/.svn/text-base/COPYING.svn-base

@@ -0,0 +1,15 @@
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 

+ 116 - 0
RCCE_V2.0/.svn/text-base/Makefile.svn-base

@@ -0,0 +1,116 @@
+include common/symbols
+
+ifeq ($(OMP_EMULATOR),0)
+  PLATFORMOBJS=SCC_API.o   
+else
+  PLATFORMOBJS=RCCE_emulator_driver.o
+endif
+
+ifeq ($(PWRMGMT),1)
+  POWEROBJS=RCCE_power_management.o
+endif
+
+ARCHIVEOBJS= RCCE_admin.o RCCE_comm.o   RCCE_malloc.o RCCE_qsort.o RCCE_synch.o RCCE_flags.o  \
+             RCCE_send.o  RCCE_recv.o   RCCE_debug.o  RCCE_get.o   RCCE_put.o   RCCE_reduce.o \
+             RCCE_bcast.o RCCE_shmalloc.o RCCE_DCMflush.o $(PLATFORMOBJS) $(POWEROBJS)
+
+ifeq ($(OMP_EMULATOR),0)
+	ARCHIVEOBJS += RCCE_memcpy.o
+endif
+
+$(ARCHIVE): $(ARCHIVEOBJS)
+	@echo Archive name = $(ARCHIVE) 
+	ar -r $(ARCHIVE) $(ARCHIVEOBJS) 
+	rm -f *.o
+
+usage:
+	@echo "         make [OMP_EMULATOR=0] [PWRMGMT=1] [API=gory]  [SINGLEBITFLAGS=1]"
+	@echo "         make [clean] [veryclean]" 
+	@echo "default: make  OMP_EMULATOR=1   PWRMGMT=0   API=nongory SINGLEBITFLAGS=0"
+
+RCCE_admin.o: $(RCCE_LIB_SRC)/RCCE_admin.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h \
+        $(RCCEINCLUDE)/RCCE_lib_pwr.h
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_admin.c  $(RCCE_FLAGS) 
+
+RCCE_power_management.o: $(RCCE_LIB_SRC)/RCCE_power_management.c $(RCCEINCLUDE)/RCCE.h \
+         $(RCCEINCLUDE)/RCCE_lib.h $(RCCEINCLUDE)/SCC_API.h $(RCCEINCLUDE)/RCCE_lib_pwr.h
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_power_management.c  $(RCCE_FLAGS) 
+
+RCCE_debug.o: $(RCCE_LIB_SRC)/RCCE_debug.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h \
+         $(RCCEINCLUDE)/RCCE_debug.h
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_debug.c  $(RCCE_FLAGS)
+
+RCCE_comm.o: $(RCCE_LIB_SRC)/RCCE_comm.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_comm.c  $(RCCE_FLAGS)
+
+RCCE_send.o: $(RCCE_LIB_SRC)/RCCE_send.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_send.c  $(RCCE_FLAGS)
+
+RCCE_recv.o: $(RCCE_LIB_SRC)/RCCE_recv.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_recv.c  $(RCCE_FLAGS)
+
+RCCE_memcpy.o: $(RCCE_LIB_SRC)/RCCE_memcpy.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_memcpy.c  $(RCCE_FLAGS)
+
+RCCE_get.o: $(RCCE_LIB_SRC)/RCCE_get.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_get.c  $(RCCE_FLAGS)
+
+RCCE_put.o: $(RCCE_LIB_SRC)/RCCE_put.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_put.c  $(RCCE_FLAGS)
+
+RCCE_reduce.o: $(RCCE_LIB_SRC)/RCCE_reduce.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_reduce.c  $(RCCE_FLAGS)
+
+RCCE_bcast.o: $(RCCE_LIB_SRC)/RCCE_bcast.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_bcast.c  $(RCCE_FLAGS)
+
+RCCE_malloc.o: $(RCCE_LIB_SRC)/RCCE_malloc.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_malloc.c  $(RCCE_FLAGS)
+
+RCCE_shmalloc.o: $(RCCE_LIB_SRC)/RCCE_shmalloc.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_shmalloc.c  $(RCCE_FLAGS)
+
+RCCE_qsort.o: $(RCCE_LIB_SRC)/RCCE_qsort.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_qsort.c  $(RCCE_FLAGS)
+
+RCCE_synch.o: $(RCCE_LIB_SRC)/RCCE_synch.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_synch.c  $(RCCE_FLAGS)
+
+RCCE_flags.o: $(RCCE_LIB_SRC)/RCCE_flags.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_flags.c  $(RCCE_FLAGS)
+
+RCCE_emulator_driver.o: $(RCCE_LIB_SRC)/RCCE_emulator_driver.c $(RCCEINCLUDE)/RCCE.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_emulator_driver.c $(RCCE_FLAGS) 
+
+SCC_API.o: $(RCCE_LIB_SRC)/SCC_API.c $(RCCEINCLUDE)/SCC_API.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/SCC_API.c
+
+RCCE_DCMflush.o: $(RCCE_LIB_SRC)/RCCE_DCMflush.c $(RCCEINCLUDE)/SCC_API.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_DCMflush.c
+
+mpb.o: $(RCCE_LIB_SRC)/mpb.c $(RCCEINCLUDE)/SCC_API.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/mpb.c
+
+mpb: mpb.o SCC_API.o
+	$(CCOMPILE) $(CFLAGS) mpb.o SCC_API.o -o $(RCCEROOT$)/bin/$(SUBDIR)/mpb
+	rm -f *.o
+
+clean:
+	rm -f $(ARCHIVE) $(ARCHIVEOBJS)
+	rm -f mpb.o $(RCCEROOT$)/bin/$(SUBDIR)/mpb
+	rm -f bin/*/*.a
+
+veryclean: 
+	rm -f $(ARCHIVE) $(ARCHIVEOBJS)
+	rm -f mpb.o $(RCCEROOT$)/bin/$(SUBDIR)/mpb
+	rm -f bin/*/*.a
+	cd apps/SHIFT;    make clean; cd -
+	cd apps/STENCIL;  make clean; cd -
+	cd apps/NPB;      make clean; cd -
+	cd apps/PINGPONG; make clean; cd -
+	cd apps/XHPL;     make veryclean; cd -
+	cd apps/SHARE;    make clean; cd -
+	rm -f common/symbols rccerun makeall
+	@echo --------------------------------------------------------------------
+	@echo RUN \"configure\" SCRIPT \(AGAIN\) BEFORE MAKING EXECUTABLES + LIBRARIES
+	@echo --------------------------------------------------------------------        

+ 153 - 0
RCCE_V2.0/.svn/text-base/README.svn-base

@@ -0,0 +1,153 @@
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//
+
+Welcome to RCCE, a communication environment for the SCC processor 
+------------------------------------------------------------------
+
+RCCE is designed to run on a variety of platforms including:
+
+  * Baremetal on the SCC chip, 
+  * Linux on the SCC chip, 
+  * A functional emulator running on top of OpenMP.  
+
+============ test line ============
+This particular release has been validated only for Linux and the 
+OpenMP emulator (a baremetal build option is available, but has
+not been tested). It may seem a bit cumbersome to work with, but 
+that's so we can replicate the "features" of the SCC chip ... i.e. 
+once a program runs on the emulator, it's likely to work on real 
+hardware.
+
+There are several versions of the RCCE library that can be 
+built with this release.  They expose different options ...
+
+   * The "gory" interface ... this is the low level interface. It 
+     makes the programmer responsible for declaring and managing
+     synchronization flags and for managing the on-chip message 
+     passing buffer. This mode gives access to the low level 
+     get/put routines,  as well as to the higher level two-sided 
+     send/receive interface.
+
+   * The "nongory" interface ... a higher level interface that
+     hides the particulars of the message passing buffers and 
+     inter-core synchronization from the programmer, including the 
+     management of synchronization flags. This interface does not 
+     give access to the low level put/get routines.
+
+   * Big Flags ... each flag used to coordinate interaction 
+     between units of execution (UE) takes up a byte in a single cacheline. 
+     This has lower latency but wastes memory.
+
+   * Small flags ... flags are stored in a single bit; many are
+     packed into a single cache line. A slight hit on latency but 
+     consumes less message passing buffer memory.
+
+   * With or without software controlled power management. POWER
+     MANAGEMENT IS AN EXPERIMENTAL FEATURE THAT HAS NOT BEEN TESTED 
+     AS THOROUGHLY AS THE REST OF THE LIBRARY. FOLLOW THE SPECIAL
+     INSTRUCTIONS BELOW TO CONFIGURE THE "makeall" SCRIPT TO BUILD 
+     VERSIONS OF THE LIBRARY THAT INCLUDE THE POWER MANAGEMENT API.
+
+
+You can build all versions of the library supported with this 
+release, as follows:
+
+1. Type "./configure <PLATFORM>". This creates file common/symbols 
+   from file common/symbols.in, inserting the proper root of the 
+   directory tree, and also inserting the proper platform (SCC_LINUX,
+   SCC_BAREMETAL, or emulator). Any existing file common/symbols 
+   will be overwritten, so do not update that file by hand. Instead,
+   specify details of your build environment in common/symbols.in.
+   The configure utility also specializes the rccerun command (see 
+   below) for the target platform. You may need to make the configure
+   script executable (type "chmod u+x configure").
+   To enable RCCE's power management API, you must specify the string
+   "ADD_POWER_API" as the second parameter on the command line when 
+   you execute the configure script. Because this is an experimental 
+   feature, it is not built by default. See above.
+
+2. Type "./makeall" to build all libraries. Alternatively you 
+   can build individual libraries by calling make directly.  
+   Type "make usage" to discover the libraries you can build.
+
+The libraries generated by this procedure will be put in the directory
+
+    bin/<PLATFORM>
+
+An easy way to test correct operation of the platform is to build and
+run a prepackaged RCCE stress test after building the RCCE library:
+   "./build_stress_test; ./run_stress_test <size>"
+where size is -S (small), -M (medium), or -L (large).
+
+A number of applications are included with this release in the
+"apps" directory.  These include:  
+
+  * PINGPONG:  bounces messages between a pair of UEs
+  * SHIFT:     passes messages around a logical ring of UEs
+  * STENCIL:   solves a simple PDE with a basic stencil code
+  * SHARE:     tests the off-chip shared memory access
+  * NPB:       NAS Parallel Benchmarks, LU and BT
+  * XHPL:      the Linpack benchmark
+
+To build an application,  go to the corresponding subdirectory of "apps" 
+and type "make".   It will return a list of options for building
+versions of an application. It may be necessary to edit the Makefile
+in an application directory if parts of the original RCCE code tree 
+got moved with respect to each other.
+
+We suggest that you start with PINGPONG, SHIFT, SHARE, and STENCIL and 
+save the more complex NPB and XHPL for later. The STENCIL directory 
+contains a few simple variations of the base code that exercise RCCE's 
+experimental power management API.
+See the apps/XHPL directory for instructions on how to build and run
+Linpack.
+
+To run an application, you must use the rccerun command.  This command
+is used to launch Linux jobs on SCC or on the emulator. To run application 
+APP with P cores, type
+
+"rccerun -nue P -f HOSTFILE APP [application parameters]"
+
+where HOSTFILE contains the list of physical core IDs to be used. 
+By default, the host file "./hosts/rc.hosts" should be used. 
+You can see an example of the use of rccerun in the run_stencil and 
+run_stencil_synch shell scripts in the "apps/STENCIL" directory. 
+
+If my_script is a shell script that contains the actual RCCE executable 
+RCCE_X (which may take application parameters), make sure to execute it as 
+follows inside the script: "/path_to_RCCE_X/RCCE_X $@". This is necessary 
+so that all the parameters to the program, including those added by
+rccerun, are supplied to the executable.
+
+Example: 
+Shell script my_script contains executable RCCE_X that expects two 
+parameters, m and n. Write the script as:
+----------start of my_script---------
+line 1
+line 2
+line ...
+./RCCE_X $@
+line ...
+----------end of my_script-----------
+To run the code on P cores of the SCC, type
+
+"rccerun -nue P -f HOSTFILE my_script m n"
+
+MANPAGES
+This release of RCCE has manpages. To access those manpages, add a path to 
+your MANPATH as follows:
+export MANPATH="<path to where you installed RCCE>/man:${MANPATH}"

+ 79 - 0
RCCE_V2.0/.svn/text-base/build_stress_test.svn-base

@@ -0,0 +1,79 @@
+#!/bin/bash
+#  
+#  Copyright 2010 Intel Corporation
+#  
+#     Licensed under the Apache License, Version 2.0 (the "License");
+#     you may not use this file except in compliance with the License.
+#     You may obtain a copy of the License at
+#  
+#         http://www.apache.org/licenses/LICENSE-2.0
+#  
+#     Unless required by applicable law or agreed to in writing, software
+#     distributed under the License is distributed on an "AS IS" BASIS,
+#     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#     See the License for the specific language governing permissions and
+#     limitations under the License.
+#  
+NUMPARS=$# 
+PWRMGMT=0
+
+SHARED=/shared/`whoami`
+BIN_STRESS=$SHARED/bin_stress
+PAR=1 
+while [ $PAR -le $NUMPARS ]; do 
+  eval OPT=\$$PAR 
+  case $OPT in 
+    -PWRMGMT  ) PWRMGMT=1                           ;; 
+    -CLEAN    ) rm -rf $BIN_STRESS           ; exit ;;
+     *        ) echo Error, wrong option $OPT; exit ;;
+  esac   
+  PAR=`expr $PAR + 1`                                                    
+done 
+
+if [ ! \( -d $SHARED \) ]; then 
+  echo Creating directory $SHARED
+  mkdir $SHARED 
+fi
+if [ ! \( -d $BIN_STRESS \) ]; then 
+  echo Creating directory $BIN_STRESS
+  mkdir $BIN_STRESS 
+fi
+
+cd apps/STENCIL
+  make stencil_synch;                  mv stencil_synch $BIN_STRESS/stencil
+  make SINGLEBITFLAGS=1 stencil_synch; mv stencil_synch $BIN_STRESS/stencil_1b
+cd -
+
+cd apps/PINGPONG; 
+  make pingpong; mv pingpong $BIN_STRESS
+cd -
+
+cd apps/NPB
+  make bt CLASS=S NPROCS=4;                   mv BT/bt.S.4  $BIN_STRESS/bt.S.4
+  make bt CLASS=W NPROCS=16;                  mv BT/bt.W.16 $BIN_STRESS/bt.W.16
+  make bt CLASS=W NPROCS=36;                  mv BT/bt.W.36 $BIN_STRESS/bt.W.36
+  make bt SINGLEBITFLAGS=1 CLASS=S NPROCS=4;  mv BT/bt.S.4  $BIN_STRESS/bt.S.4_1b
+  make bt SINGLEBITFLAGS=1 CLASS=W NPROCS=16; mv BT/bt.W.16 $BIN_STRESS/bt.W.16_1b
+  make bt SINGLEBITFLAGS=1 CLASS=W NPROCS=36; mv BT/bt.W.36 $BIN_STRESS/bt.W.36_1b
+cd -
+
+if [ $PWRMGMT -eq 1 ]; then
+  cd apps/STENCIL
+     make PWRMGMT=1 pstencil;    mv pstencil    $BIN_STRESS
+     make PWRMGMT=1 power_reset; mv power_reset $BIN_STRESS
+     make PWRMGMT=1 Fdiv;        mv Fdiv        $BIN_STRESS
+     make PWRMGMT=1 FV;          mv FV          $BIN_STRESS
+  cd -
+fi
+
+cp rccerun $BIN_STRESS
+cp hosts/rc.hosts $BIN_STRESS/allhosts
+cat hosts/rc.hosts | sort -r > $BIN_STRESS/allhosts_reverse
+echo 00 >  $BIN_STRESS/2hosts_1tile
+echo 01 >> $BIN_STRESS/2hosts_1tile
+echo 00 >  $BIN_STRESS/2hosts_nbr_tiles
+echo 02 >> $BIN_STRESS/2hosts_nbr_tiles
+echo 00 >  $BIN_STRESS/2hosts_faraway_tiles
+echo 47 >> $BIN_STRESS/2hosts_faraway_tiles
+
+

+ 91 - 0
RCCE_V2.0/.svn/text-base/configure.svn-base

@@ -0,0 +1,91 @@
+#!/bin/bash
+#
+#  Copyright 2010 Intel Corporation
+#  
+#     Licensed under the Apache License, Version 2.0 (the "License");
+#     you may not use this file except in compliance with the License.
+#     You may obtain a copy of the License at
+#  
+#         http://www.apache.org/licenses/LICENSE-2.0
+#  
+#     Unless required by applicable law or agreed to in writing, software
+#     distributed under the License is distributed on an "AS IS" BASIS,
+#     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#     See the License for the specific language governing permissions and
+#     limitations under the License.
+#
+PID=$$
+if [ $# -lt 1 ]; then
+  echo "Usage: $0 emulator"
+  echo "       $0 SCC_LINUX"
+  echo "       $0 SCC_LINUX ADD_POWER_API"
+  echo "       $0 SCC_BAREMETAL"
+  echo "See README for power management options"
+  exit
+fi
+if [ "$1" = "SCC_LINUX" ] || [ "$1" = "SCC_BAREMETAL" ]; then
+  if [ "$1" = "SCC_LINUX" ]; then BAREMETAL=0; else BAREMETAL=1; fi
+  OMP_EMULATOR=0
+  MAKE_MPB="make mpb"
+else
+  MAKE_MPB=""
+  PSSH_MPB_C=""
+  PSSH_MPB_CL=""
+  if [ "$1" = "emulator" ]; then
+    OMP_EMULATOR=1
+  else
+    echo Incorrect platform: $1
+    exit 1
+  fi
+fi
+
+POWERPARS="0"
+if [ $# -eq 2 ] && [ "$2" = "ADD_POWER_API" ]; then
+  POWERPARS="0 1"
+fi
+
+ROOT=`pwd`
+COMFILE=common/symbols
+echo "#########################################################" >  $COMFILE
+echo "# DO NOT EDIT BY HAND!! This file gets overwritten each #" >> $COMFILE
+echo "# time the configure script is run. Insert any changes  #" >> $COMFILE
+echo "# in file common/symbols.in instead.                    #" >> $COMFILE
+echo "#########################################################" >> $COMFILE
+echo ""                                                          >> $COMFILE
+
+#note: must use colon for sed separator; slash conflicts with symbol(s) in path
+cat $COMFILE.in | sed "s:_INSERT_BMVAL_INSERT_:${BAREMETAL}:" | \
+                  sed "s:_INSERT_ROOTDIR_INSERT_:${ROOT}:" | \
+                  sed "s:_INSERT_EMVAL_INSERT_:${OMP_EMULATOR}:" >> $COMFILE
+
+RUNFILE=utils/rccerun
+cat $RUNFILE.in | sed "s:_INSERT_ROOTDIR_INSERT_:${ROOT}:" | \
+                  sed "s:_INSERT_BINDIR_INSERT_:$1:"       | \
+                  sed "s:_INSERT_EMVAL_INSERT_:${OMP_EMULATOR}:" > $RUNFILE
+chmod u+x $RUNFILE
+mv $RUNFILE .
+
+MAKEALL=utils/makeall
+cat $MAKEALL.in | sed "s:_INSERT_POWERPARS_INSERT_:${POWERPARS}:" | \
+                  sed "s:_INSERT_MAKE_MPB_:${MAKE_MPB}:" > $MAKEALL
+chmod u+x $MAKEALL
+mv $MAKEALL .
+
+#create scripts for killing processes on the cores and the MCPC containing a user specified string
+#note: this only makes sense on the SCC platform itself, not the emulator
+if [ "$OMP_EMULATOR" -eq 0 ]; then
+  SHARED=/shared/`whoami`
+  if [ ! \( -d $SHARED \) ]; then 
+    echo Creating directory $SHARED
+    mkdir $SHARED 
+  fi
+  
+  KILLIT=utils/killit
+  cat $KILLIT.in | sed "s:_INSERT_SHAREDDIR_INSERT_:${SHARED}:" > $KILLIT
+  KILLCORE=utils/killcorePIDs
+  cat $KILLCORE.in | sed "s:_INSERT_SHAREDDIR_INSERT_:${SHARED}:" > $KILLCORE
+  chmod u+x $KILLIT $KILLCORE
+  ALLHOSTS=utils/allhosts
+  mv $KILLIT $KILLCORE $SHARED
+  cp  $ALLHOSTS $SHARED
+fi

+ 127 - 0
RCCE_V2.0/.svn/text-base/run_stress_test.svn-base

@@ -0,0 +1,127 @@
+#!/bin/bash
+#  
+#  Copyright 2010 Intel Corporation
+#  
+#     Licensed under the Apache License, Version 2.0 (the "License");
+#     you may not use this file except in compliance with the License.
+#     You may obtain a copy of the License at
+#  
+#         http://www.apache.org/licenses/LICENSE-2.0
+#  
+#     Unless required by applicable law or agreed to in writing, software
+#     distributed under the License is distributed on an "AS IS" BASIS,
+#     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#     See the License for the specific language governing permissions and
+#     limitations under the License.
+#  
+NUMPARS=$# 
+PID=$$
+LOG=`pwd`/log.$PID
+SUM=`pwd`/summary.$PID
+SCRATCH=.scratch.$PID
+SIZE="UNKNOWN"
+PWRMGMT=0
+# change the tile clock frequency if using a tile clock divider other than 3
+GHZ=0.533
+
+PAR=1 
+while [ $PAR -le $NUMPARS ]; do 
+  eval OPT=\$$PAR 
+  case $OPT in 
+    -S        ) if [ $SIZE = "UNKNOWN" ]; then SIZE=SMALL;  fi ;; 
+    -M        ) if [ $SIZE = "UNKNOWN" ]; then SIZE=MEDIUM; fi ;; 
+    -L        ) if [ $SIZE = "UNKNOWN" ]; then SIZE=LARGE;  fi ;; 
+    -PWRMGMT  ) PWRMGMT=1     ;; 
+     *        ) echo Error, wrong option $OPT | tee -a $LOG      | tee -a $SUM 
+                exit ;;
+  esac                                                       
+  PAR=`expr $PAR + 1`
+done 
+
+if [ $SIZE = "UNKNOWN" ]; then 
+  echo ERROR: No size specified \(-S, -M, or -L\)  | tee -a $LOG | tee -a $SUM
+  exit
+else
+  echo Executing RCCE stress test of size $SIZE    | tee -a $LOG | tee -a $SUM
+fi
+
+BIN_STRESS=/shared/`whoami`/bin_stress
+if [ ! \( -d $BIN_STRESS \) ]; then
+  echo ERROR: No stress test directory  | tee -a $LOG | tee -a $SUM 
+  echo Please create and populate it by invoking \"build_stress_test\" \
+        | tee -a $LOG | tee -a $SUM
+  exit
+fi
+
+cd $BIN_STRESS
+
+for HXT in 1tile nbr_tiles faraway_tiles; do
+  case $SIZE in
+    SMALL  ) ITERS=10;;
+    MEDIUM ) ITERS=1000;;
+    LARGE  ) ITERS=100000;;
+  esac
+  echo ./rccerun -nue 2 -f 2hosts_$HXT -clock $GHZ pingpong $ITERS \
+             | tee -a $LOG | tee -a $SUM
+       ./rccerun -nue 2 -f 2hosts_$HXT -clock $GHZ pingpong $ITERS \
+             | tee -a $LOG | tee  $SCRATCH 
+  grep -i latency $SCRATCH >> $SUM 
+done
+
+for EXT in "" "_1b"; do
+  if [ "$EXT" = "_1b" ]; then
+    echo Using single bit flags | tee -a $LOG | tee -a $SUM
+  fi
+  for HOSTS in allhosts allhosts_reverse; do
+    case $SIZE in
+      SMALL  ) BTCORES=4;  CLASS=S; STCORES=4;  STITERS=10;;
+      MEDIUM ) BTCORES=16; CLASS=W; STCORES=16; STITERS=100;;
+      LARGE  ) BTCORES=36; CLASS=W; STCORES=48; STITERS=1000;;
+    esac
+
+    echo ./rccerun -nue $STCORES -f $HOSTS -clock $GHZ stencil$EXT $STITERS    \
+               | tee -a $LOG | tee -a $SUM
+         ./rccerun -nue $STCORES -f $HOSTS -clock $GHZ stencil$EXT $STITERS    \
+               | tee -a $LOG | tee  $SCRATCH 
+    grep Verification $SCRATCH | grep SUCCESSFUL >> $SUM 
+    echo ./rccerun -nue $BTCORES -f $HOSTS -clock $GHZ bt.$CLASS.$BTCORES$EXT  \
+               | tee -a $LOG | tee -a $SUM
+         ./rccerun -nue $BTCORES -f $HOSTS -clock $GHZ bt.$CLASS.$BTCORES$EXT  \
+               | tee -a $LOG | tee  $SCRATCH 
+    grep Verification $SCRATCH | grep SUCCESSFUL >> $SUM 
+  done
+  if [ "$EXT" = "_1b" ]; then
+    echo End using single bit flags | tee -a $LOG | tee -a $SUM
+  fi
+done
+
+if [ $PWRMGMT -eq 1 ]; then
+  case $SIZE in
+    SMALL  ) NC=1;;
+    MEDIUM ) NC=8 ;;
+    LARGE  ) NC=48 ;;
+  esac
+  echo ./rccerun -nue $NC -f allhosts Fdiv 4                        \
+             | tee -a $LOG | tee -a $SUM
+       ./rccerun -nue $NC -f allhosts Fdiv 4                        \
+             | tee -a $LOG | tee  $SCRATCH 
+  grep Verification $SCRATCH | grep SUCCESSFUL >> $SUM 
+  echo ./rccerun -nue $NC -f allhosts FV 3                          \
+             | tee -a $LOG | tee -a $SUM
+       ./rccerun -nue $NC -f allhosts FV 3                          \
+             | tee -a $LOG | tee  $SCRATCH 
+  grep Verification $SCRATCH | grep SUCCESSFUL >> $SUM 
+  echo ./rccerun -nue $NC -f allhosts power_reset                   \
+             | tee -a $LOG | tee -a $SUM
+       ./rccerun -nue $NC -f allhosts power_reset                   \
+             | tee -a $LOG | tee  $SCRATCH 
+  grep Verification $SCRATCH | grep SUCCESSFUL >> $SUM 
+  echo ./rccerun -nue $NC -f allhosts pstencil                      \
+             | tee -a $LOG | tee -a $SUM
+       ./rccerun -nue $NC -f allhosts pstencil                      \
+             | tee -a $LOG | tee  $SCRATCH 
+  grep Verification $SCRATCH | grep SUCCESSFUL >> $SUM 
+fi
+
+rm $SCRATCH
+

+ 2 - 0
RCCE_V2.0/.svn/text-base/sourcing.svn-base

@@ -0,0 +1,2 @@
+source /shared/icc-8.1.038/bin/iccvars.sh
+source /shared/crosstool/crosstoolvars.sh

+ 15 - 0
RCCE_V2.0/COPYING

@@ -0,0 +1,15 @@
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 

+ 116 - 0
RCCE_V2.0/Makefile

@@ -0,0 +1,116 @@
+include common/symbols
+
+ifeq ($(OMP_EMULATOR),0)
+  PLATFORMOBJS=SCC_API.o   
+else
+  PLATFORMOBJS=RCCE_emulator_driver.o
+endif
+
+ifeq ($(PWRMGMT),1)
+  POWEROBJS=RCCE_power_management.o
+endif
+
+ARCHIVEOBJS= RCCE_admin.o RCCE_comm.o   RCCE_malloc.o RCCE_qsort.o RCCE_synch.o RCCE_flags.o  \
+             RCCE_send.o  RCCE_recv.o   RCCE_debug.o  RCCE_get.o   RCCE_put.o   RCCE_reduce.o \
+             RCCE_bcast.o RCCE_shmalloc.o RCCE_DCMflush.o $(PLATFORMOBJS) $(POWEROBJS)
+
+ifeq ($(OMP_EMULATOR),0)
+	ARCHIVEOBJS += RCCE_memcpy.o
+endif
+
+$(ARCHIVE): $(ARCHIVEOBJS)
+	@echo Archive name = $(ARCHIVE) 
+	ar -r $(ARCHIVE) $(ARCHIVEOBJS) 
+	rm -f *.o
+
+usage:
+	@echo "         make [OMP_EMULATOR=0] [PWRMGMT=1] [API=gory]  [SINGLEBITFLAGS=1]"
+	@echo "         make [clean] [veryclean]" 
+	@echo "default: make  OMP_EMULATOR=1   PWRMGMT=0   API=nongory SINGLEBITFLAGS=0"
+
+RCCE_admin.o: $(RCCE_LIB_SRC)/RCCE_admin.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h \
+        $(RCCEINCLUDE)/RCCE_lib_pwr.h
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_admin.c  $(RCCE_FLAGS) 
+
+RCCE_power_management.o: $(RCCE_LIB_SRC)/RCCE_power_management.c $(RCCEINCLUDE)/RCCE.h \
+         $(RCCEINCLUDE)/RCCE_lib.h $(RCCEINCLUDE)/SCC_API.h $(RCCEINCLUDE)/RCCE_lib_pwr.h
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_power_management.c  $(RCCE_FLAGS) 
+
+RCCE_debug.o: $(RCCE_LIB_SRC)/RCCE_debug.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h \
+         $(RCCEINCLUDE)/RCCE_debug.h
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_debug.c  $(RCCE_FLAGS)
+
+RCCE_comm.o: $(RCCE_LIB_SRC)/RCCE_comm.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_comm.c  $(RCCE_FLAGS)
+
+RCCE_send.o: $(RCCE_LIB_SRC)/RCCE_send.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_send.c  $(RCCE_FLAGS)
+
+RCCE_recv.o: $(RCCE_LIB_SRC)/RCCE_recv.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_recv.c  $(RCCE_FLAGS)
+
+RCCE_memcpy.o: $(RCCE_LIB_SRC)/RCCE_memcpy.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_memcpy.c  $(RCCE_FLAGS)
+
+RCCE_get.o: $(RCCE_LIB_SRC)/RCCE_get.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_get.c  $(RCCE_FLAGS)
+
+RCCE_put.o: $(RCCE_LIB_SRC)/RCCE_put.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_put.c  $(RCCE_FLAGS)
+
+RCCE_reduce.o: $(RCCE_LIB_SRC)/RCCE_reduce.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_reduce.c  $(RCCE_FLAGS)
+
+RCCE_bcast.o: $(RCCE_LIB_SRC)/RCCE_bcast.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_bcast.c  $(RCCE_FLAGS)
+
+RCCE_malloc.o: $(RCCE_LIB_SRC)/RCCE_malloc.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_malloc.c  $(RCCE_FLAGS)
+
+RCCE_shmalloc.o: $(RCCE_LIB_SRC)/RCCE_shmalloc.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_shmalloc.c  $(RCCE_FLAGS)
+
+RCCE_qsort.o: $(RCCE_LIB_SRC)/RCCE_qsort.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_qsort.c  $(RCCE_FLAGS)
+
+RCCE_synch.o: $(RCCE_LIB_SRC)/RCCE_synch.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_synch.c  $(RCCE_FLAGS)
+
+RCCE_flags.o: $(RCCE_LIB_SRC)/RCCE_flags.c $(RCCEINCLUDE)/RCCE.h $(RCCEINCLUDE)/RCCE_lib.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_flags.c  $(RCCE_FLAGS)
+
+RCCE_emulator_driver.o: $(RCCE_LIB_SRC)/RCCE_emulator_driver.c $(RCCEINCLUDE)/RCCE.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_emulator_driver.c $(RCCE_FLAGS) 
+
+SCC_API.o: $(RCCE_LIB_SRC)/SCC_API.c $(RCCEINCLUDE)/SCC_API.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/SCC_API.c
+
+RCCE_DCMflush.o: $(RCCE_LIB_SRC)/RCCE_DCMflush.c $(RCCEINCLUDE)/SCC_API.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/RCCE_DCMflush.c
+
+mpb.o: $(RCCE_LIB_SRC)/mpb.c $(RCCEINCLUDE)/SCC_API.h 
+	$(CCOMPILE) -c $(CFLAGS) $(RCCE_LIB_SRC)/mpb.c
+
+# Link the stand-alone mpb tool from mpb.o and SCC_API.o into
+# $(RCCEROOT)/bin/$(SUBDIR), then remove all object files from this directory.
+# (The original wrote "$(RCCEROOT$)", a malformed variable reference.)
+mpb: mpb.o SCC_API.o
+	$(CCOMPILE) $(CFLAGS) mpb.o SCC_API.o -o $(RCCEROOT)/bin/$(SUBDIR)/mpb
+	rm -f *.o
+
+# Remove the library archive, its object files, the mpb tool and every
+# archive under bin/. Declared phony so a stray file named "clean" cannot
+# mask the target.
+.PHONY: clean
+clean:
+	rm -f $(ARCHIVE) $(ARCHIVEOBJS)
+	rm -f mpb.o $(RCCEROOT)/bin/$(SUBDIR)/mpb
+	rm -f bin/*/*.a
+
+# Everything "clean" removes, plus the build products of the bundled
+# applications and the files generated by the configure script
+# (common/symbols, rccerun, makeall). configure must be run again afterwards.
+.PHONY: veryclean
+veryclean: 
+	rm -f $(ARCHIVE) $(ARCHIVEOBJS)
+	rm -f mpb.o $(RCCEROOT)/bin/$(SUBDIR)/mpb
+	rm -f bin/*/*.a
+	cd apps/SHIFT;    make clean; cd -
+	cd apps/STENCIL;  make clean; cd -
+	cd apps/NPB;      make clean; cd -
+	cd apps/PINGPONG; make clean; cd -
+	cd apps/XHPL;     make veryclean; cd -
+	cd apps/SHARE;    make clean; cd -
+	rm -f common/symbols rccerun makeall
+	@echo --------------------------------------------------------------------
+	@echo RUN \"configure\" SCRIPT \(AGAIN\) BEFORE MAKING EXECUTABLES + LIBRARIES
+	@echo --------------------------------------------------------------------

+ 153 - 0
RCCE_V2.0/README

@@ -0,0 +1,153 @@
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//
+
+Welcome to RCCE, a communication environment for the SCC processor 
+------------------------------------------------------------------
+
+RCCE is designed to run on a variety of platforms including:
+
+  * Baremetal on the SCC chip, 
+  * Linux on the SCC chip, 
+  * A functional emulator running on top of OpenMP.  
+
+============ test line ============
+This particular release has been validated only for Linux and the 
+OpenMP emulator (a baremetal build option is available, but has
+not been tested). It may seem a bit cumbersome to work with, but 
+that's so we can replicate the "features" of the SCC chip ... i.e. 
+once a program runs on the emulator, it's likely to work on real 
+hardware.
+
+There are several versions of the RCCE library that can be 
+built with this release.  They expose different options ...
+
+   * The "gory" interface ... this is the low level interface. It 
+     makes the programmer responsible for declaring and managing
+     synchronization flags and for managing the on-chip message 
+     passing buffer. This mode gives access to the low level 
+     get/put routines,  as well as to the higher level two-sided 
+     send/receive interface.
+
+   * The "nongory" interface ... a higher level interface that
+     hides the particulars of the message passing buffers and 
+     inter-core synchronization from the programmer, including the 
+     management of synchronization flags. This interface does not 
+     give access to the low level put/get routines.
+
+   * Big Flags ... each flag used to coordinate interaction 
+     between units of execution (UE) takes up a byte in a single cacheline. 
+     This has lower latency but wastes memory.
+
+   * Small flags ... flags are stored in a single bit; many are
+     packed into a single cache line. A slight hit on latency but 
+     consumes less message passing buffer memory.
+
+   * With or without software controlled power management. POWER
+     MANAGEMENT IS AN EXPERIMENTAL FEATURE THAT HAS NOT BEEN TESTED 
+     AS THOROUGHLY AS THE REST OF THE LIBRARY. FOLLOW THE SPECIAL
+     INSTRUCTIONS BELOW TO CONFIGURE THE "makeall" SCRIPT TO BUILD 
+     VERSIONS OF THE LIBRARY THAT INCLUDE THE POWER MANAGEMENT API.
+
+
+You can build all versions of the library supported with this 
+release, as follows:
+
+1. Type "./configure <PLATFORM>". This creates file common/symbols 
+   from file common/symbols.in, inserting the proper root of the 
+   directory tree, and also inserting the proper platform (SCC_LINUX,
+   SCC_BAREMETAL, or emulator). Any existing file common/symbols 
+   will be overwritten, so do not update that file by hand. Instead,
+   specify details of your build environment in common/symbols.in
+   The configure utility also specializes the rccerun command (see 
+   below) for the target platform. You may need to make the configure
+   script executable (type "chmod u+x configure").
+   To enable RCCE's power management API, you must specify the string
+   "ADD_POWER_API" as the second parameter on the command line when 
+   you execute the configure script. Because this is an experimental 
+   feature, it is not built by default. See above.
+
+2. Type "./makeall" to build all libraries. Alternatively you 
+   can build individual libraries by calling make directly.  
+   Type "make usage" to discover the libraries you can build.
+
+The libraries generated by this procedure will be put in the directory
+
+    bin/<PLATFORM>
+
+An easy way to test correct operation of the platform is to build and
+run a prepackaged RCCE stress test after building the RCCE library:
+   "./build_stress_test; ./run_stress_test <size>"
+where size is -S (small), -M (medium), or -L (large).
+
+A number of applications are included with this release in the
+"apps" directory.  These include:  
+
+  * PINGPONG:  bounces messages between a pair of UEs
+  * SHIFT:     passes messages around a logical ring of UEs
+  * STENCIL:   solves a simple PDE with a basic stencil code
+  * SHARE:     tests the off-chip shared memory access
+  * NPB:       NAS Parallel Benchmarks, LU and BT
+  * XHPL:      the Linpack benchmark
+
+To build an application,  go to the corresponding subdirectory of "apps" 
+and type "make".   It will return a list of options for building
+versions of an application. It may be necessary to edit the Makefile
+in an application directory if parts of the original RCCE code tree 
+got moved with respect to each other.
+
+We suggest that you start with PINGPONG, SHIFT, SHARE, and STENCIL and 
+save the more complex NPB and XHPL for later. The STENCIL directory 
+contains a few simple variations of the base code that exercise RCCE's 
+experimental power management API.
+See the apps/XHPL directory for instructions on how to build and run
+Linpack.
+
+To run an application, you must use the rccerun command.  This command
+is used to launch Linux jobs on SCC or on the emulator. To run application 
+APP with P cores, type
+
+"rccerun -nue P -f HOSTFILE APP [application parameters]"
+
+where HOSTFILE contains the list of physical core IDs to be used. 
+By default, the host file "./hosts/rc.hosts" should be used. 
+You can see an example of the use of rccerun in the run_stencil and 
+run_stencil_synch shell scripts in the "apps/STENCIL" directory. 
+
+If my_script is a shell script that contains the actual RCCE executable 
+RCCE_X (which may take application parameters), make sure to execute it as 
+follows inside the script: "/path_to_RCCE_X/RCCE_X $@" This is necessary 
+so that all the parameters to the program, including those added by
+rccerun, are supplied to the executable.
+
+Example: 
+Shell script my_script contains executable RCCE_X that expects two 
+parameters, n and m. Write the script as:
+----------start of my_script---------
+line 1
+line 2
+line ...
+./RCCE_X $@
+line ...
+----------end of my_script-----------
+To run the code on P cores of the SCC, type
+
+"rccerun -nue P -f HOSTFILE my_script m n"
+
+MANPAGES
+This release of RCCE has manpages. To access those manpages, add a path to 
+your MANPATH as follows:
+export MANPATH="<path to where you installed RCCE>/man:${MANPATH}"

+ 17 - 0
RCCE_V2.0/apps/.svn/all-wcprops

@@ -0,0 +1,17 @@
+K 25
+svn:wc:ra_dav:version-url
+V 48
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps
+END
+hpl
+K 25
+svn:wc:ra_dav:version-url
+V 52
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/hpl
+END
+README
+K 25
+svn:wc:ra_dav:version-url
+V 55
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/README
+END

+ 123 - 0
RCCE_V2.0/apps/.svn/entries

@@ -0,0 +1,123 @@
+10
+
+dir
+313
+http://marcbug.scc-dc.com/svn/repository/tags/RCCE_V2.0/apps
+http://marcbug.scc-dc.com/svn/repository
+
+
+
+2011-04-11T21:00:28.037293Z
+188
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+c924d837-3317-4ba4-8fbd-5f2da8699d51
+
+NPB
+dir
+
+FLUSH
+dir
+
+PINGPONG
+dir
+
+STENCIL
+dir
+
+SHARE
+dir
+
+HELLO
+dir
+
+hpl
+file
+
+
+
+
+2012-10-27T13:42:43.852598Z
+65c154fb251179d50086a65103a70c47
+2010-07-16T00:05:27.319040Z
+32
+tekubasx
+has-props
+
+
+svn:special
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+4
+
+ECOQ
+dir
+
+XHPL
+dir
+
+SHIFT
+dir
+
+README
+file
+
+
+
+
+2012-10-27T13:42:43.852598Z
+1854a34c919d87c38c13f947b88353ca
+2010-12-27T18:51:02.240775Z
+131
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+62
+

+ 5 - 0
RCCE_V2.0/apps/.svn/prop-base/hpl.svn-base

@@ -0,0 +1,5 @@
+K 11
+svn:special
+V 1
+*
+END

+ 1 - 0
RCCE_V2.0/apps/.svn/text-base/README.svn-base

@@ -0,0 +1 @@
+keep link hpl->XHPL intact, it is necessary to build LINPACK.

+ 1 - 0
RCCE_V2.0/apps/.svn/text-base/hpl.svn-base

@@ -0,0 +1 @@
+link XHPL

+ 35 - 0
RCCE_V2.0/apps/ECOQ/.svn/all-wcprops

@@ -0,0 +1,35 @@
+K 25
+svn:wc:ra_dav:version-url
+V 53
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/ECOQ
+END
+RCCE_pwr_wq_framework.c
+K 25
+svn:wc:ra_dav:version-url
+V 77
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/ECOQ/RCCE_pwr_wq_framework.c
+END
+Makefile
+K 25
+svn:wc:ra_dav:version-url
+V 62
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/ECOQ/Makefile
+END
+README
+K 25
+svn:wc:ra_dav:version-url
+V 60
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/ECOQ/README
+END
+RCCE_pwr_wq.h
+K 25
+svn:wc:ra_dav:version-url
+V 67
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/ECOQ/RCCE_pwr_wq.h
+END
+RCCE_eco_q.c
+K 25
+svn:wc:ra_dav:version-url
+V 66
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/ECOQ/RCCE_eco_q.c
+END

+ 198 - 0
RCCE_V2.0/apps/ECOQ/.svn/entries

@@ -0,0 +1,198 @@
+10
+
+dir
+313
+http://marcbug.scc-dc.com/svn/repository/tags/RCCE_V2.0/apps/ECOQ
+http://marcbug.scc-dc.com/svn/repository
+
+
+
+2011-03-01T00:12:36.950662Z
+165
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+c924d837-3317-4ba4-8fbd-5f2da8699d51
+
+RCCE_pwr_wq.h
+file
+
+
+
+
+2012-10-27T13:42:39.092598Z
+6a2d46e1d1c182bf4bd5d38b38e62987
+2011-03-01T00:11:03.930222Z
+164
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+572
+
+RCCE_eco_q.c
+file
+
+
+
+
+2012-10-27T13:42:39.092598Z
+dbdce8ea42af7845ecd31f6cd6457698
+2011-03-01T00:11:03.930222Z
+164
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+12193
+
+RCCE_pwr_wq_framework.c
+file
+
+
+
+
+2012-10-27T13:42:39.092598Z
+caee278009153ed29bd317577efadb6e
+2011-03-01T00:11:03.930222Z
+164
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+6008
+
+Makefile
+file
+
+
+
+
+2012-10-27T13:42:39.092598Z
+6b3e73f6d2478f93649bed6388cf7605
+2011-03-01T00:11:03.930222Z
+164
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+521
+
+README
+file
+
+
+
+
+2012-10-27T13:42:39.092598Z
+8a3d52ef6996ffed15a25ad4b5d13f02
+2011-03-01T00:12:36.950662Z
+165
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+71
+

+ 24 - 0
RCCE_V2.0/apps/ECOQ/.svn/text-base/Makefile.svn-base

@@ -0,0 +1,24 @@
+SHELL=sh
+
+RCCEROOT=../..
+include $(RCCEROOT)/common/symbols
+
+ECOQOBJS=RCCE_eco_q.o RCCE_pwr_wq_framework.o $(ARCHIVE)
+
+default:
+	@echo Usage: make PWRMGMT=1 eco_q [clean]
+
+eco_q: $(ECOQOBJS)
+	$(CCOMPILE) -o eco_q $(ECOQOBJS) $(CFLAGS)
+
+RCCE_eco_q.o: RCCE_eco_q.c $(RCCEINCLUDE)/RCCE.h RCCE_pwr_wq.h 
+	$(CCOMPILE) -c $(CFLAGS) RCCE_eco_q.c  
+
+RCCE_pwr_wq_framework.o: RCCE_pwr_wq_framework.c $(RCCEINCLUDE)/RCCE.h RCCE_pwr_wq.h
+	$(CCOMPILE) -c $(CFLAGS) RCCE_pwr_wq_framework.c  
+
+clean:
+	@ rm -f	*.o wq eco_q FV_reset
+
+
+

+ 332 - 0
RCCE_V2.0/apps/ECOQ/.svn/text-base/RCCE_eco_q.c.svn-base

@@ -0,0 +1,332 @@
+/* this synthetic application assumes a three-dimensional
+   domain of  nx*ny*nz points that is decomposed into chunks
+   of different size, and that require different amounts
+   of computational work.
+*/
+ 
+#include "RCCE.h"
+#include "RCCE_pwr_wq.h"
+#include <stdio.h>
+#include <stdlib.h>
+ 
+#define min(x,y) ( (x) < (y) ? (x) : (y) )
+#define max(x,y) ( (x) > (y) ? (x) : (y) )
+ 
+int power_change = 1;
+int BASE_F = 5; /* baseline clock divider (320 MHz) */
+int HIGH_F = 3; /* high CPU clock divider (533 MHz) */
+void read_and_prep_data(int, int, int, int, double*);
+void do_work(int, int, int, int, int, int, int, 
+             double*, double*, double*, double*, RCCE_REQUEST*);
+ 
+#define NX     200
+#define NY     200
+#define NZ     100
+#define NCOMP  5
+#define NITER  10
+#define XZONEJAGS 4
+#define YZONEJAGS 4
+#define STEP 3
+ 
+/* Description of one unit of work (one tile of the domain) as seen by a UE.
+   Only dynamic_part is exposed through RCCE_WI_size()/RCCE_WI_address();
+   the remaining fields are filled in locally by RCCE_APP.                */
+typedef struct {
+  struct {
+    int seq_number;        /* tile sequence number; ix = seq_number % npx,
+                              iy = seq_number / npx; a negative value makes
+                              RCCE_WI_valid() report the item as invalid   */
+  } dynamic_part;
+  int npx;                 /* number of tiles in x (command line)          */
+  int npy;                 /* number of tiles in y (command line)          */
+  int kstart;              /* first k-plane owned by this UE               */
+  int kend;                /* last k-plane owned by this UE                */
+  int kwidth;              /* owned k-planes plus 2 (ksize[local_rank]+2)  */
+  int left;                /* UE ID of the previous team member, or -1     */
+  int right;               /* UE ID of the next team member, or -1         */
+  int *isize;              /* x extent of each tile column (npx entries)   */
+  int *jsize;              /* y extent of each tile row (npy entries)      */
+  int *ksize;              /* k-planes per team member (team_size entries) */
+  RCCE_REQUEST *request;   /* handle passed to the RCCE power calls        */
+} WORK_ITEM;
+ 
+/* Size in bytes of the dynamic part of a work item.                       */
+int RCCE_WI_size(void *work_item) {
+  WORK_ITEM *item = (WORK_ITEM *)work_item;
+  return (int)sizeof(item->dynamic_part);
+}
+ 
+/* Non-zero when the work item carries a non-negative sequence number.     */
+int RCCE_WI_valid(void *work_item) {
+  WORK_ITEM *item = (WORK_ITEM *)work_item;
+  int seq = item->dynamic_part.seq_number;
+  return (seq >= 0);
+}
+ 
+/* Address of the dynamic part of a work item.                             */
+void *RCCE_WI_address(void *work_item) {
+  WORK_ITEM *item = (WORK_ITEM *)work_item;
+  return (void *)&item->dynamic_part;
+}
+/* Application entry point.
+   Command line: <npx> <npy> <nrounds>
+     npx, npy : number of tiles in x and y (1..NX and 1..NY)
+     nrounds  : number of rounds; a non-positive value switches power
+                changes off and uses its absolute value instead
+   The UE whose ID equals wq_pars.master services work requests; every other
+   UE sets up its share of the tile and executes work items until the member
+   loop reports a non-zero status.
+   Returns 0 on success, 1 on a usage, tiling, team-size or allocation error.  */
+int RCCE_APP(int argc, char **argv){
+ 
+  /* the tile-size tables are only allocated on non-master UEs; keep them NULL
+     elsewhere so that copying them into the work item is well defined          */
+  int       *isize = NULL, *jsize = NULL, *ksize = NULL; 
+  int       ID, NP;
+  int       npx, npy, kstart, kend, kwidth, nrounds;
+  int       i, j, k, iter, fdiv, vlevel;
+  int       *team_member, team_size, team_lead, local_rank, 
+      master, *master_list;
+  QUEUE_PARMS wq_pars;
+  WORK_ITEM work_item;
+  RCCE_REQUEST request;
+ 
+  RCCE_init(&argc, &argv);
+  NP = wq_pars.NP = RCCE_num_ues();
+  ID = wq_pars.ID = RCCE_ue();
+ 
+  if (argc < 4) {
+    if (ID==0) printf("Error: Need two parameters, x & y tiles, plus # rounds\n");
+    return(1);
+  }
+ 
+/* read the number of subdomains (x & y-direction) from the command line        */
+  npx = work_item.npx = atoi(*++argv);
+  npy = work_item.npy =  atoi(*++argv);
+ 
+/* test validity of the requested tiling; each tile must be large enough to
+   divide the z-dimension among the members of the team                        */
+  if (npx <= 0 || npy <= 0 || npx > NX || npy > NY) {
+    if (ID==0) printf("Illegal tiling: %d, %d\n", npx, npy);
+    RCCE_finalize();
+    return(1);
+  }
+  nrounds = atoi(*++argv);
+  if (nrounds <= 0) {power_change=0; nrounds = -nrounds;}
+  
+  RCCE_debug_set(RCCE_DEBUG_ALL);
+  /* lower power req until we need it                                           */
+  if (power_change) RCCE_iset_power(BASE_F, &request, &fdiv, &vlevel);
+ 
+  /* form teams; copy results to local variables                                */
+  RCCE_setup_work_queue_teams(&wq_pars); 
+  master      = wq_pars.master;
+  team_lead   = wq_pars.team_lead;
+  local_rank  = wq_pars.local_rank;
+  team_size   = wq_pars.team_size;
+  team_member = wq_pars.team_member;
+  master_list = wq_pars.master_list;  
+ 
+  if (team_size > NZ) {
+    if (ID==0) printf("Error: NZ too small: %d\n", NZ);
+    RCCE_finalize();
+    return(1);
+  }
+ 
+  /* define left and right neighbors                                            */
+  if (local_rank>0)           work_item.left  = team_member[local_rank-1];
+  else                        work_item.left  = -1;
+  if (local_rank<team_size-1) work_item.right = team_member[local_rank+1];
+  else                        work_item.right = -1;
+ 
+  if (ID != master) {
+    /* allocate space for the sizes of the subdomains                           */
+    isize = (int *) malloc(sizeof(int)*npx);
+    jsize = (int *) malloc(sizeof(int)*npy);  
+    ksize = (int *) malloc(sizeof(int)*team_size);
+    if (!isize || !jsize || !ksize) {
+      printf("Could not allocate space for tile sizes\n");
+      return(1);
+    }
+ 
+    for (k=0; k<team_size; k++) {
+      ksize[k] = NZ/team_size;
+      /* adjust for any leftover points                                         */
+      if (k<(NZ%team_size)) ksize[k]++;
+    }
+    for (kstart=0, k=0; k<local_rank; k++) kstart += ksize[k];
+    kend = kstart + ksize[local_rank] -1;
+    /* two extra planes hold the ghost data exchanged with the neighbors        */
+    kwidth = work_item.kwidth = ksize[local_rank]+2;
+    work_item.kstart = kstart;
+    work_item.kend   = kend;
+ 
+    /* introduce load imbalance among subdomains by perturbing their sizes      */
+    for (i=0; i<npx-1; i++) isize[i] = NX/npx;
+    isize[npx-1] = NX-(NX/npx)*(npx-1);
+    for (iter=0; iter<XZONEJAGS; iter++) 
+    for (i=1; i<npx; i+=2) if (isize[i-1] > i) {
+      isize[i-1] -= i;
+      isize[i]   += i;
+    }
+    for (j=0; j<npy-1; j++) jsize[j] = NY/npy;
+    jsize[npy-1] = NY-(NY/npy)*(npy-1);
+    for (iter=0; iter<YZONEJAGS; iter++) 
+    for (j=1; j<npy; j+=2) if (jsize[j-1] > j) {
+      jsize[j-1] -= j;
+      jsize[j]   += j;
+    }
+  }
+ 
+  work_item.dynamic_part.seq_number = 0;
+  work_item.request = &request;
+  work_item.isize = isize;
+  work_item.jsize = jsize;
+  work_item.ksize = ksize;
+ 
+/* master goes into a loop, servicing work requests                             */
+  if (ID==master) {
+    int tasks_completed = 0;
+    while (tasks_completed<nrounds) {
+      tasks_completed += RCCE_queue_master_loop((void *)&work_item, &wq_pars);
+    }
+    /* master creates one more work loop to end all teams                       */
+    work_item.dynamic_part.seq_number = -1;
+    RCCE_queue_master_loop((void *)&work_item, &wq_pars);
+  }
+ 
+/* teams go into an endless loop, executing tasks and asking for new 
+   ones when they are done                                                      */
+ 
+  else {
+    int error = 0;
+    while (!error) {
+      error=RCCE_queue_member_loop((void *)(&work_item), &wq_pars);
+    }
+  }
+  
+  RCCE_finalize();
+  return (0);
+}
+ 
+/* Execute one work item: allocate a frame holding the data plus three flux
+   arrays for the tile selected by the item's sequence number, initialize it,
+   request the high clock setting (when power changes are enabled) and run
+   the solver. Returns 0 on success, 1 if the frame cannot be allocated.   */
+int RCCE_execute_work_item(void *work_item, QUEUE_PARMS *wq_pars) {
+ 
+  WORK_ITEM *item = (WORK_ITEM *)work_item;
+  int seq    = item->dynamic_part.seq_number;
+  int tile_x = seq%(item->npx);
+  int tile_y = seq/(item->npx);
+  int nwords = item->isize[tile_x]*item->jsize[tile_y]*(item->kwidth)*NCOMP;
+  int fdiv, vlevel;
+  double *frame, *fx, *fy, *fz;
+ 
+  /* one allocation, split into four equally sized sections                */
+  frame = (double *) malloc(4*nwords*sizeof(double));
+  if (frame == NULL) {
+    printf("Could not allocate %d words on UE %d\n", nwords, RCCE_ue());
+    return(1);
+  }
+  fx = frame + 1*nwords;
+  fy = frame + 2*nwords;
+  fz = frame + 3*nwords;
+ 
+  read_and_prep_data(item->isize[tile_x], item->jsize[tile_y], 
+                     item->kstart, item->kend, frame);
+ 
+  /* the compute-heavy part follows: finish the pending power request,
+     then ask for the high clock setting                                   */
+  if (power_change) RCCE_wait_power(item->request);
+  if (power_change) RCCE_iset_power(HIGH_F, item->request, &fdiv, &vlevel);
+ 
+  do_work(item->isize[tile_x], item->jsize[tile_y], item->kstart, item->kend, 
+          item->left, item->right, wq_pars->local_rank, 
+          frame, fx, fy, fz, item->request);
+ 
+  free(frame);
+  return(0);
+}
+ 
+ 
+#define FR(c,i,j,k) data_frame[(c)+NCOMP*((i)+in*((j)+(k-kstart+1)*jn))]
+ 
+/* Fill planes kstart..kend of an in x jn tile with smooth synthetic data and
+   then perturb it: every component is lowered by 1.0 at even i and raised by
+   1.0 at odd i. The ghost planes addressed by FR are not written here.     */
+void read_and_prep_data(int in, int jn, int kstart, int kend, double *data_frame) {
+  int i, j, k, c;
+ 
+  /* pass 1: smooth initial values */
+  for (k=kstart; k<=kend; k++) {
+    for (j=0; j<jn; j++) {
+      for (i=0; i<in; i++) {
+        FR(0,i,j,k) = 1.0;
+        FR(1,i,j,k) = (double)(k-j)+10.0;
+        FR(2,i,j,k) = (double)(i-k)+20.0;
+        FR(3,i,j,k) = (double)(j-i)+30.0;
+        FR(4,i,j,k) = 100.0;
+      }
+    }
+  }
+ 
+  /* pass 2: jaggedness; even and odd rows receive the same treatment, so a
+     single sweep over all j covers both                                    */
+  for (k=kstart; k<=kend; k++) {
+    for (j=0; j<jn; j++) {
+      for (i=0; i<in; i++) {
+        for (c=0; c<NCOMP; c++) {
+          if (i%2 == 0) FR(c,i,j,k) -= 1.0;
+          else          FR(c,i,j,k) += 1.0;
+        }
+      }
+    }
+  }
+}
+ 
+#define FLUX_X(c,i,j,k) flux_x[(c)+NCOMP*((i)+in*((j)+(k-kstart+1)*jn))]
+#define FLUX_Y(c,i,j,k) flux_y[(c)+NCOMP*((i)+in*((j)+(k-kstart+1)*jn))]
+#define FLUX_Z(c,i,j,k) flux_z[(c)+NCOMP*((i)+in*((j)+(k-kstart+1)*jn))]
+ 
+/* Advance this UE's slab of one tile through NITER update sweeps.
+     in, jn       : tile extent in x and y
+     kstart, kend : first and last k-plane owned by this UE
+     left, right  : UE IDs of the neighbors in k, or -1 if there is none
+     rank         : rank whose parity orders the pairwise exchanges
+     data_frame   : field values, addressed through the FR macro
+     flux_x/y/z   : scratch arrays for the three directional flux terms
+     request      : handle for the RCCE power calls
+   When power changes are enabled, the pending power request is completed at
+   iteration 2 and a return to BASE_F is requested at iteration NITER-2.     */
+void do_work(int in, int jn, int kstart, int kend, int left, int right, int rank,
+             double *data_frame, double *flux_x, double *flux_y, double *flux_z,
+             RCCE_REQUEST *request) {
+ 
+  int i, j, k, c, iter, phase, fdiv, vlevel;
+  double vx = 1.0, vy = 1.0, vz = 1.0;
+  double dt = 0.0001;
+ 
+  for (iter=0; iter<NITER; iter++) {
+ 
+    if (iter==2 && power_change) {
+      RCCE_wait_power(request);
+    }
+    /* logical AND: the original used bitwise '&' here */
+    if (iter==NITER-2 && power_change) {
+      RCCE_iset_power(BASE_F, request, &fdiv, &vlevel);
+    }
+    /* before each iteration we need to fill ghost points with neighbor data;
+       the two phases alternate sender and receiver by rank parity            */
+    for (phase=0; phase<2; phase++) {
+      if (right != -1 && (rank+phase+1)%2) {
+         RCCE_send((char *)(&FR(0,0,0,kend)),in*jn*NCOMP*sizeof(double), right);
+      }
+      if (left  != -1 && (rank+phase)%2) {
+         RCCE_recv((char *)(&FR(0,0,0,kstart-1)),in*jn*NCOMP*sizeof(double), left);
+      }
+    }
+    for (phase=0; phase<2; phase++) {
+      if (left != -1 && (rank+phase+1)%2)
+         RCCE_send((char *)(&FR(0,0,0,kstart)),in*jn*NCOMP*sizeof(double), left);
+      if (right  != -1 && (rank+phase)%2) 
+         RCCE_recv((char *)(&FR(0,0,0,kend+1)),in*jn*NCOMP*sizeof(double), right);
+    }
+    /* update interior points only; the outermost planes of the global domain
+       (k=0 and k=NZ-1) and the tile edges in i and j are left untouched      */
+    for (k=max(kstart,1); k<=min(NZ-2,kend); k++) for (j=1; j<jn-1; j++) 
+    for (i=1; i<in-1; i++) 
+    for (c=0; c<NCOMP; c++){
+      FLUX_X(c,i,j,k) = 
+        (3.0*FR(c,i+1,j+1,k  ) - 4.0*FR(c,i,j+1,k  ) + FR(c,i-1,j+1,k  ))/16.0 +
+        (3.0*FR(c,i+1,j  ,k+1) - 4.0*FR(c,i,j,  k+1) + FR(c,i-1,j,  k+1))/16.0 +
+        (3.0*FR(c,i+1,j+1,k+1) - 4.0*FR(c,i,j+1,k+1) + FR(c,i-1,j+1,k+1))/32.0 +
+        (3.0*FR(c,i+1,j-1,k  ) - 4.0*FR(c,i,j-1,k  ) + FR(c,i-1,j-1,k  ))/16.0 +
+        (3.0*FR(c,i+1,j  ,k-1) - 4.0*FR(c,i,j,  k-1) + FR(c,i-1,j,  k-1))/16.0 +
+        (3.0*FR(c,i+1,j-1,k-1) - 4.0*FR(c,i,j-1,k-1) + FR(c,i-1,j-1,k-1))/32.0 +
+        (3.0*FR(c,i+1,j-1,k+1) - 4.0*FR(c,i,j-1,k+1) + FR(c,i-1,j-1,k+1))/32.0 +
+        (3.0*FR(c,i+1,j+1,k-1) - 4.0*FR(c,i,j+1,k-1) + FR(c,i-1,j+1,k-1))/32.0 +
+        (3.0*FR(c,i+1,j  ,k  ) - 4.0*FR(c,i,j,  k  ) + FR(c,i-1,j,  k  ))/8.0;
+  
+      FLUX_Y(c,i,j,k) = 
+        (3.0*FR(c,i+1,j+1,k  ) - 4.0*FR(c,i+1,j,k  ) + FR(c,i+1,j-1,k  ))/16.0 +
+        (3.0*FR(c,i  ,j+1,k+1) - 4.0*FR(c,i  ,j,k+1) + FR(c,i  ,j-1,k+1))/16.0 +
+        (3.0*FR(c,i+1,j+1,k+1) - 4.0*FR(c,i+1,j,k+1) + FR(c,i+1,j-1,k+1))/32.0 +
+        (3.0*FR(c,i-1,j+1,k  ) - 4.0*FR(c,i-1,j,k  ) + FR(c,i-1,j-1,k  ))/16.0 +
+        (3.0*FR(c,i  ,j+1,k-1) - 4.0*FR(c,i  ,j,k-1) + FR(c,i  ,j-1,k-1))/16.0 +
+        (3.0*FR(c,i-1,j+1,k-1) - 4.0*FR(c,i-1,j,k-1) + FR(c,i-1,j-1,k-1))/32.0 +
+        (3.0*FR(c,i-1,j+1,k+1) - 4.0*FR(c,i-1,j,k+1) + FR(c,i-1,j-1,k+1))/32.0 +
+        (3.0*FR(c,i+1,j+1,k-1) - 4.0*FR(c,i+1,j,k-1) + FR(c,i+1,j-1,k-1))/32.0 +
+        (3.0*FR(c,i  ,j+1,k  ) - 4.0*FR(c,i  ,j,k  ) + FR(c,i  ,j-1,k  ))/8.0;
+  
+      /* differences along k: this is the z flux. The original stored it in
+         FLUX_Y again, leaving FLUX_Z uninitialized when it is read below     */
+      FLUX_Z(c,i,j,k) = 
+        (3.0*FR(c,i+1,j  ,k+1) - 4.0*FR(c,i+1,j  ,k) + FR(c,i+1,j  ,k-1))/16.0 +
+        (3.0*FR(c,i  ,j+1,k+1) - 4.0*FR(c,i  ,j+1,k) + FR(c,i  ,j+1,k-1))/16.0 +
+        (3.0*FR(c,i+1,j+1,k+1) - 4.0*FR(c,i+1,j+1,k) + FR(c,i+1,j+1,k-1))/32.0 +
+        (3.0*FR(c,i-1,j  ,k+1) - 4.0*FR(c,i-1,j  ,k) + FR(c,i-1,j  ,k-1))/16.0 +
+        (3.0*FR(c,i  ,j-1,k+1) - 4.0*FR(c,i  ,j-1,k) + FR(c,i  ,j-1,k-1))/16.0 +
+        (3.0*FR(c,i-1,j-1,k+1) - 4.0*FR(c,i-1,j-1,k) + FR(c,i-1,j-1,k-1))/32.0 +
+        (3.0*FR(c,i-1,j+1,k+1) - 4.0*FR(c,i-1,j+1,k) + FR(c,i-1,j+1,k-1))/32.0 +
+        (3.0*FR(c,i+1,j-1,k+1) - 4.0*FR(c,i+1,j-1,k) + FR(c,i+1,j-1,k-1))/32.0 +
+        (3.0*FR(c,i  ,j  ,k+1) - 4.0*FR(c,i  ,j  ,k) + FR(c,i  ,j  ,k-1))/8.0;
+  
+      FR(c,i,j,k) += dt*(
+         -1.0*(vx*FLUX_X(c,i,j,k) + vy*FLUX_Y(c,i,j,k) + vz*FLUX_Z(c,i,j,k)) + 
+              FR(c,i+1,j,k) -2.0*FR(c,i,j,k) + FR(c,i-1,j,k) +
+              FR(c,i,j+1,k) -2.0*FR(c,i,j,k) + FR(c,i,j-1,k) +
+              FR(c,i,j,k+1) -2.0*FR(c,i,j,k) + FR(c,i,j,k-1));
+    }
+  }
+  return;
+}
+ 
+/* Advance the work item to the next tile, wrapping around after the last
+   of the npx*npy tiles. Always returns RCCE_SUCCESS.                      */
+int RCCE_new_work_item(void *work_item, QUEUE_PARMS *wq_pars) {
+  WORK_ITEM *item = (WORK_ITEM *)work_item;
+  int ntiles = item->npx*item->npy;
+  int next   = item->dynamic_part.seq_number+1;
+ 
+  item->dynamic_part.seq_number = next%ntiles;
+  return(RCCE_SUCCESS);
+}

+ 23 - 0
RCCE_V2.0/apps/ECOQ/.svn/text-base/RCCE_pwr_wq.h.svn-base

@@ -0,0 +1,23 @@
+/* Per-UE description of the work-queue setup. Most fields are filled in by
+   RCCE_setup_work_queue_teams().                                          */
+typedef struct {
+  int NP;                       /* number of UEs (RCCE_num_ues())          */
+  int ID;                       /* this UE's ID (RCCE_ue())                */
+  int master;                   /* ID of the UE acting as queue master     */
+  int team_lead;                /* RCCE_power_domain_master() of this UE   */
+  int local_rank;               /* presumably this UE's index within
+                                   team_member -- confirm in the framework */
+  int team_size;                /* RCCE_power_domain_size() of this UE     */
+  int team_member[RCCE_MAXNP];  /* sorted UE IDs of the team               */
+  int master_list[RCCE_MAXNP];  /* NOTE(review): meaning not visible in
+                                   this header -- see the framework source */
+  int master_number;            /* NOTE(review): meaning not visible in
+                                   this header -- see the framework source */
+} QUEUE_PARMS;
+ 
+/* Work-queue interface. RCCE_execute_work_item, RCCE_new_work_item,
+   RCCE_WI_size and RCCE_WI_address are supplied by the application;
+   RCCE_setup_work_queue_teams is supplied by the work-queue framework.
+   NOTE(review): RCCE_WI_valid is declared in the framework source rather
+   than here -- consider declaring it alongside the other RCCE_WI_* hooks. */
+int RCCE_execute_work_item(void *, QUEUE_PARMS *);
+int RCCE_setup_work_queue_teams(QUEUE_PARMS *);
+int RCCE_queue_master_loop(void *, QUEUE_PARMS *);
+int RCCE_new_work_item(void *, QUEUE_PARMS *);
+int RCCE_queue_member_loop(void *, QUEUE_PARMS *);
+int RCCE_WI_size(void *);
+void *RCCE_WI_address(void *);
+ 
+/* NOTE(review): power_change is not declared in this header. An OpenMP
+   threadprivate directive normally has to follow a declaration of the
+   variable it names -- confirm that every file including this header
+   declares power_change first when OPENMP_ is defined.                    */
+#ifdef OPENMP_
+#pragma omp threadprivate(power_change)
+#endif

+ 156 - 0
RCCE_V2.0/apps/ECOQ/.svn/text-base/RCCE_pwr_wq_framework.c.svn-base

@@ -0,0 +1,156 @@
+#include "RCCE.h"
+#include "RCCE_pwr_wq.h"
+#include <stdio.h>
+ 
+int RCCE_WI_valid(void *);
+int  RCCE_qsort(char *, size_t, size_t, int (*)(const void*, const void*));
+/* comparison function used in routine to sort core IDs                  */
+int id_compare(const void *e1, const void *e2);
+ 
+/* Build the work queue topology for the calling UE and store it in *wq_pars.
+   Must be called by every UE: the members of a power domain exchange ranks
+   with their domain master (the team lead), and every UE except the overall
+   master tells the overall master whether it is a team lead. The UE with the
+   highest rank (NP-1) becomes the overall master and is taken out of its team.
+   Returns RCCE_SUCCESS.                                                        */
+int RCCE_setup_work_queue_teams(QUEUE_PARMS *wq_pars){
+ 
+  int NP, ID, ue, size, master, team_lead, team_size;
+  int test, isleader;
+  int *team_member, *master_list;
+ 
+  NP = wq_pars->NP = RCCE_num_ues();
+  ID = wq_pars->ID = RCCE_ue();
+  team_member = wq_pars->team_member;
+  master_list = wq_pars->master_list;
+  /* only the overall master fills the list of leads; give everybody else a
+     defined (empty) list instead of leaving the counter unset                  */
+  wq_pars->master_number = 0;
+ 
+/* determine the number of UEs in the local power domain and form teams         */
+  wq_pars->team_size = team_size = RCCE_power_domain_size();
+  wq_pars->team_lead = team_lead = RCCE_power_domain_master();
+  if (team_lead == ID) {
+    /* the team lead is the first team member                                   */
+    team_member[0] = team_lead;
+    size = 1;
+    /* the team lead collects IDs from its team members by polling all other
+       UEs; probing stops as soon as the list is complete, so that
+       team_member[size] is never addressed beyond the end of the team          */
+    while (size<team_size) {
+      for (ue=0; ue<NP && size<team_size; ue++) {
+        if (ue == team_lead) continue;
+        RCCE_recv_test((char *)(&(team_member[size])), sizeof(int), ue, &test);
+        if (test) team_member[size++] = ue;
+      }
+    }
+    /* ... and sends the list to all other team members, after sorting it       */
+    RCCE_qsort((char *)team_member, team_size, sizeof(int), id_compare);
+    for (ue=1; ue<team_size; ue++) 
+      RCCE_send((char *)team_member, team_size*sizeof(int), team_member[ue]);
+  }
+  else {
+    /* team members check in with the team lead ...                             */
+    RCCE_send((char *)(&ID), sizeof(int), team_lead);
+    /* ... and receive the complete list of team members                        */
+    RCCE_recv((char *)team_member, team_size*sizeof(int), team_lead);
+  }
+ 
+  /* we assign the UE with the highest rank the role of master. We know that
+     this UE is either in a power domain by itself, or there is another UE  
+     in the same power domain who is the power domain master, because the
+     power domain master is always the UE in that domain with the lowest rank   */
+  master = wq_pars->master = NP-1;
+ 
+/* the team containing the overall master must remove it from its member list   */
+  if (team_member[team_size-1] == master) wq_pars->team_size = --team_size;
+ 
+  /* the overall master is not in any team                                      */
+  if (ID==master) team_size = wq_pars->team_size = 0;
+ 
+/* each UE determines its rank within the team                                  */
+  wq_pars->local_rank = 0;
+  for (ue=0; ue<team_size; ue++) if (ID==team_member[ue]) 
+    wq_pars->local_rank = ue;
+ 
+/* this code determines number of power domain leads, plus list of UEs          */
+  if (ID == master) {
+    /* reuse the function-scope counter: a C99 loop-scope declaration would
+       shadow it and does not compile in C89 mode                               */
+    for (ue=0; ue<NP-1; ue++) {
+      /* ask each core whether it is a team lead or not                         */
+      RCCE_recv((char *)(&isleader), sizeof(int), ue);
+      if (isleader) {
+        master_list[wq_pars->master_number] = ue;
+        (wq_pars->master_number)++;
+      }
+    }
+  }
+  else {
+    /* all cores let the master know their team lead status                     */
+    isleader = (ID == team_lead);
+    RCCE_send((char *)(&isleader), sizeof(int), master);
+  }
+ 
+  return (RCCE_SUCCESS);
+}
+ 
+/* One pass of the overall master over all team leads.
+   For a valid work item, each lead whose request has already arrived is sent
+   the current item and a new item is generated; leads without a pending
+   request are skipped. For an invalid (stop) item, the master blocks on every
+   lead in turn and sends it the stop item.
+   Returns the number of valid work items handed out during this pass.          */
+int RCCE_queue_master_loop(void *work_item, QUEUE_PARMS *wq_pars){
+ 
+  int idx, lead, request, arrived;
+  int handed_out = 0;
+  int nbytes     = RCCE_WI_size(work_item);
+  char *payload  = (char *)RCCE_WI_address(work_item);
+ 
+  if (!RCCE_WI_valid(work_item)) {
+    /* this pass ends all teams, so insist that each team checks in             */
+    for (idx=0; idx<wq_pars->master_number; idx++) {
+      lead = wq_pars->master_list[idx];
+      RCCE_recv((char *)(&request), sizeof(int), lead);
+      RCCE_send(payload, nbytes, lead);
+    }
+    return(handed_out);
+  }
+ 
+  /* service pending work requests; the content of a request is ignored         */
+  for (idx=0; idx<wq_pars->master_number; idx++) {
+    lead = wq_pars->master_list[idx];
+    RCCE_recv_test((char *)(&request), sizeof(int), lead, &arrived);
+    if (!arrived) continue;
+    RCCE_send(payload, nbytes, lead);
+    handed_out++;
+    /* generate the next work item                                              */
+    RCCE_new_work_item(work_item, wq_pars);
+  }
+ 
+  return(handed_out);
+}
+ 
+/* One iteration of the worker side of the queue. A team lead requests a work
+   item from the overall master and forwards it to the other members of its
+   team; every other UE receives the item from its team lead. A valid item is
+   executed.
+   Returns RCCE_SUCCESS after executing an item, or 1 when the stop item was
+   received.                                                                    */
+int RCCE_queue_member_loop(void *work_item, QUEUE_PARMS *wq_pars) {
+ 
+  /* the master ignores the content of a request, but the buffer is still
+     transmitted, so it must hold a defined value                               */
+  int gimme_work = 1;
+  int mem;
+  int size = RCCE_WI_size(work_item);
+  void *address = RCCE_WI_address(work_item);
+ 
+  /* ask for work if I am a team lead                                          */
+  if (wq_pars->ID == wq_pars->team_lead) {
+    RCCE_send((char *)(&gimme_work), sizeof(int), wq_pars->master);
+    RCCE_recv((char *)address, size, wq_pars->master);
+    /* team leads parcel out the work to the workers; entry 0 of the member
+       list is the lead itself                                                  */
+    for (mem=1; mem<(wq_pars->team_size); mem++) {
+      printf("Team lead %d sends work to UE %d\n", RCCE_ue(), wq_pars->team_member[mem]);
+      fflush(0);
+      RCCE_send((char *)address, size, wq_pars->team_member[mem]);
+    }
+  }
+  else {
+    RCCE_recv((char *)address, size, wq_pars->team_lead);
+  }
+  if (RCCE_WI_valid(work_item)) {
+    /* NOTE(review): the status returned by RCCE_execute_work_item is dropped  */
+    RCCE_execute_work_item(work_item, wq_pars);
+  }
+  else {
+    /* stop item received: tell the caller to leave its loop                    */
+    return(1);
+  }
+  return(RCCE_SUCCESS);
+}

+ 2 - 0
RCCE_V2.0/apps/ECOQ/.svn/text-base/README.svn-base

@@ -0,0 +1,2 @@
+Please note that ECOQ is still under development and may be unstable.
+

+ 24 - 0
RCCE_V2.0/apps/ECOQ/Makefile

@@ -0,0 +1,24 @@
+SHELL=sh
+
+RCCEROOT=../..
+include $(RCCEROOT)/common/symbols
+
+ECOQOBJS=RCCE_eco_q.o RCCE_pwr_wq_framework.o $(ARCHIVE)
+
+default:
+	@echo Usage: make PWRMGMT=1 eco_q [clean]
+
+eco_q: $(ECOQOBJS)
+	$(CCOMPILE) -o eco_q $(ECOQOBJS) $(CFLAGS)
+
+RCCE_eco_q.o: RCCE_eco_q.c $(RCCEINCLUDE)/RCCE.h RCCE_pwr_wq.h 
+	$(CCOMPILE) -c $(CFLAGS) RCCE_eco_q.c  
+
+RCCE_pwr_wq_framework.o: RCCE_pwr_wq_framework.c $(RCCEINCLUDE)/RCCE.h RCCE_pwr_wq.h
+	$(CCOMPILE) -c $(CFLAGS) RCCE_pwr_wq_framework.c  
+
+clean:
+	@ rm -f	*.o wq eco_q FV_reset
+
+
+

+ 332 - 0
RCCE_V2.0/apps/ECOQ/RCCE_eco_q.c

@@ -0,0 +1,332 @@
+/* this synthetic application assumes a three-dimensional
+   domain of  nx*ny*nz points that is decomposed into chunks
+   of different size, and that require different amounts
+   of computational work.
+*/
+ 
+#include "RCCE.h"
+#include "RCCE_pwr_wq.h"
+#include <stdio.h>
+ 
+/* NOTE: both macros evaluate their arguments twice; do not pass expressions
+   with side effects                                                            */
+#define min(x,y) ( (x) < (y) ? (x) : (y) )
+#define max(x,y) ( (x) > (y) ? (x) : (y) )
+ 
+/* non-zero while power management calls are issued; cleared in RCCE_APP when
+   the requested number of rounds is not positive                               */
+int power_change = 1;
+int BASE_F = 5; /* baseline clock divider (320 MHz) */
+int HIGH_F = 3; /* high CPU clock divider (533 MHz) */
+void read_and_prep_data(int, int, int, int, double*);
+void do_work(int, int, int, int, int, int, int, 
+             double*, double*, double*, double*, RCCE_REQUEST*);
+ 
+/* extent of the global domain in x, y and z                                    */
+#define NX     200
+#define NY     200
+#define NZ     100
+/* number of solution components stored per grid point                          */
+#define NCOMP  5
+/* number of sweeps do_work performs on one work item                           */
+#define NITER  10
+/* number of passes used to perturb the tile sizes in x and y                   */
+#define XZONEJAGS 4
+#define YZONEJAGS 4
+/* NOTE(review): STEP is not referenced in the part of the file shown here     */
+#define STEP 3
+ 
+/* Description of one unit of work. Only dynamic_part travels between UEs
+   (see RCCE_WI_size and RCCE_WI_address); all other fields are set up locally
+   by each UE in RCCE_APP.                                                      */
+typedef struct {
+  struct {
+    int seq_number;      /* tile index in [0, npx*npy); negative = stop item    */
+  } dynamic_part;
+  int npx;               /* number of tiles in x                                */
+  int npy;               /* number of tiles in y                                */
+  int kstart;            /* first z-plane owned by this UE                      */
+  int kend;              /* last z-plane owned by this UE                       */
+  int kwidth;            /* owned z-planes plus two ghost planes                */
+  int left;              /* rank of the previous team member, or -1             */
+  int right;             /* rank of the next team member, or -1                 */
+  int *isize;            /* tile extents in x (npx entries)                     */
+  int *jsize;            /* tile extents in y (npy entries)                     */
+  int *ksize;            /* z-planes per team member (team_size entries)        */
+  RCCE_REQUEST *request; /* handle passed to the RCCE power calls               */
+} WORK_ITEM;
+ 
+/* Number of bytes of a work item that are exchanged between UEs, i.e. the
+   size of its dynamic part. The argument is only used for its type.            */
+int RCCE_WI_size(void *work_item) {
+  WORK_ITEM *item = (WORK_ITEM *)work_item;
+  return(sizeof(item->dynamic_part));
+}
+ 
+/* Returns 1 if the work item carries real work (sequence number >= 0) and 0
+   if it is the stop item.                                                      */
+int RCCE_WI_valid(void *work_item) {
+  WORK_ITEM *item = (WORK_ITEM *)work_item;
+  return(item->dynamic_part.seq_number>=0);
+}
+ 
+/* Address of the part of a work item that is exchanged between UEs.            */
+void *RCCE_WI_address(void *work_item) {
+  WORK_ITEM *item = (WORK_ITEM *)work_item;
+  return((void *)(&(item->dynamic_part)));
+}
+/* Entry point of the synthetic power-aware work queue application.
+   Command line: <x tiles> <y tiles> <rounds>. A non-positive <rounds> turns
+   power management off and runs -<rounds> rounds.
+   The UE with the highest rank hands out work items until at least <rounds>
+   items were sent, then stops all teams; all other UEs execute items until
+   they receive the stop item.
+   Returns 0 on success and 1 on a usage, tiling, size or allocation error.     */
+int RCCE_APP(int argc, char **argv){
+ 
+  /* start out as NULL: the master never allocates these, yet they are copied
+     into the work item and released at the end on every UE                     */
+  int       *isize = NULL, *jsize = NULL, *ksize = NULL;
+  int       ID, NP;
+  int       npx, npy, kstart, kend, nrounds;
+  int       i, j, k, iter, fdiv, vlevel;
+  int       *team_member, team_size, local_rank, master;
+  QUEUE_PARMS wq_pars;
+  WORK_ITEM work_item;
+  RCCE_REQUEST request;
+ 
+  RCCE_init(&argc, &argv);
+  NP = wq_pars.NP = RCCE_num_ues();
+  ID = wq_pars.ID = RCCE_ue();
+ 
+  if (argc < 4) {
+    if (ID==0) printf("Error: Need two parameters, x & y tiles, plus # rounds\n");
+    /* shut down the same way the other argument errors below do               */
+    RCCE_finalize();
+    return(1);
+  }
+ 
+/* read the number of subdomains (x & y-direction) from the command line        */
+  npx = work_item.npx = atoi(*++argv);
+  npy = work_item.npy =  atoi(*++argv);
+ 
+/* test validity of the requested tiling; each tile must be large enough to
+   divide the z-dimension among the members of the team                        */
+  if (npx <= 0 || npy <= 0 || npx > NX || npy > NY) {
+    if (ID==0) printf("Illegal tiling: %d, %d\n", npx, npy);
+    RCCE_finalize();
+    return(1);
+  }
+  nrounds = atoi(*++argv);
+  if (nrounds <= 0) {power_change=0; nrounds = -nrounds;}
+  
+  RCCE_debug_set(RCCE_DEBUG_ALL);
+  /* lower power req until we need it                                           */
+  if (power_change) RCCE_iset_power(BASE_F, &request, &fdiv, &vlevel);
+ 
+  /* form teams; copy results to local variables                                */
+  RCCE_setup_work_queue_teams(&wq_pars); 
+  master      = wq_pars.master;
+  local_rank  = wq_pars.local_rank;
+  team_size   = wq_pars.team_size;
+  team_member = wq_pars.team_member;
+ 
+  if (team_size > NZ) {
+    if (ID==0) printf("Error: NZ too small: %d\n", NZ);
+    RCCE_finalize();
+    return(1);
+  }
+ 
+  /* define left and right neighbors                                            */
+  if (local_rank>0)           work_item.left  = team_member[local_rank-1];
+  else                        work_item.left  = -1;
+  if (local_rank<team_size-1) work_item.right = team_member[local_rank+1];
+  else                        work_item.right = -1;
+ 
+  if (ID != master) {
+    /* allocate space for the sizes of the subdomains                           */
+    isize = (int *) malloc(sizeof(int)*npx);
+    jsize = (int *) malloc(sizeof(int)*npy);  
+    ksize = (int *) malloc(sizeof(int)*team_size);
+    if (!isize || !jsize || !ksize) {
+      printf("Could not allocate space for tile sizes\n");
+      /* release whatever was obtained; as before, this UE leaves without
+         RCCE_finalize because the other UEs do not take this path              */
+      free(isize);
+      free(jsize);
+      free(ksize);
+      return(1);
+    }
+ 
+    for (k=0; k<team_size; k++) {
+      ksize[k] = NZ/team_size;
+      /* adjust for any leftover points                                         */
+      if (k<(NZ%team_size)) ksize[k]++;
+    }
+    for (kstart=0, k=0; k<local_rank; k++) kstart += ksize[k];
+    kend = kstart + ksize[local_rank] -1;
+    /* two extra planes hold the ghost data of the neighbors                    */
+    work_item.kwidth = ksize[local_rank]+2;
+    work_item.kstart = kstart;
+    work_item.kend   = kend;
+ 
+    /* introduce load imbalance among subdomains by perturbing their sizes      */
+    for (i=0; i<npx-1; i++) isize[i] = NX/npx;
+    isize[npx-1] = NX-(NX/npx)*(npx-1);
+    for (iter=0; iter<XZONEJAGS; iter++) 
+    for (i=1; i<npx; i+=2) if (isize[i-1] > i) {
+      isize[i-1] -= i;
+      isize[i]   += i;
+    }
+    for (j=0; j<npy-1; j++) jsize[j] = NY/npy;
+    jsize[npy-1] = NY-(NY/npy)*(npy-1);
+    for (iter=0; iter<YZONEJAGS; iter++) 
+    for (j=1; j<npy; j+=2) if (jsize[j-1] > j) {
+      jsize[j-1] -= j;
+      jsize[j]   += j;
+    }
+  }
+ 
+  work_item.dynamic_part.seq_number = 0;
+  work_item.request = &request;
+  work_item.isize = isize;
+  work_item.jsize = jsize;
+  work_item.ksize = ksize;
+ 
+/* master goes into a loop, servicing work requests                             */
+  if (ID==master) {
+    int tasks_completed = 0;
+    while (tasks_completed<nrounds) {
+      tasks_completed += RCCE_queue_master_loop((void *)&work_item, &wq_pars);
+    }
+    /* master creates one more work loop to end all teams                       */
+    work_item.dynamic_part.seq_number = -1;
+    RCCE_queue_master_loop((void *)&work_item, &wq_pars);
+  }
+ 
+/* teams go into an endless loop, executing tasks and asking for new 
+   ones when they are done                                                      */
+ 
+  else {
+    int error = 0;
+    while (!error) {
+      error=RCCE_queue_member_loop((void *)(&work_item), &wq_pars);
+    }
+  }
+ 
+  /* the size tables are no longer needed; free(NULL) is a no-op on the master  */
+  free(isize);
+  free(jsize);
+  free(ksize);
+  
+  RCCE_finalize();
+  return (0);
+}
+ 
+/* Execute one work item on the calling UE: the tile selected by the sequence
+   number is allocated (solution plus three flux arrays in one buffer),
+   initialized and processed by do_work. With power management enabled, the
+   pending power request is completed and a switch to HIGH_F is requested
+   before the compute phase.
+   Returns 0 on success and 1 if the buffer could not be allocated.             */
+int RCCE_execute_work_item(void *work_item, QUEUE_PARMS *wq_pars) {
+ 
+  WORK_ITEM *item = (WORK_ITEM *)work_item;
+  int tile_x = (item->dynamic_part.seq_number)%(item->npx);
+  int tile_y = (item->dynamic_part.seq_number)/(item->npx);
+  int extent_x = item->isize[tile_x];
+  int extent_y = item->jsize[tile_y];
+  int words = extent_x*extent_y*(item->kwidth)*NCOMP;
+  int fdiv, vlevel;
+  double *frame;
+ 
+  /* one allocation, cut into four equally sized arrays                         */
+  frame = (double *) malloc(4*words*sizeof(double));
+  if (!frame) {
+    printf("Could not allocate %d words on UE %d\n", words, RCCE_ue());
+    return(1);
+  }
+ 
+  read_and_prep_data(extent_x, extent_y, item->kstart, item->kend, frame);
+ 
+  /* entering a high-cpu-intensity segment of the code  */
+  if (power_change) {
+    RCCE_wait_power(item->request);
+    RCCE_iset_power(HIGH_F, item->request, &fdiv, &vlevel);
+  }
+ 
+  do_work(extent_x, extent_y, item->kstart, item->kend, item->left, item->right,
+          wq_pars->local_rank, frame, frame + 1*words, frame + 2*words,
+          frame + 3*words, item->request);
+ 
+  free(frame);
+  return(0);
+}
+ 
+ 
+#define FR(c,i,j,k) data_frame[(c)+NCOMP*((i)+in*((j)+(k-kstart+1)*jn))]
+ 
+/* Fill the owned planes kstart..kend of an in x jn tile with synthetic data:
+   a smooth field first, then a perturbation of -1 at even i and +1 at odd i
+   for every component. The ghost planes are not written.                       */
+void read_and_prep_data(int in, int jn, int kstart, int kend, double *data_frame) {
+  int x, y, z, comp;
+  double bump;
+ 
+  /* initialize with smooth data */
+  for (z=kstart; z<=kend; z++) {
+    for (y=0; y<jn; y++) {
+      for (x=0; x<in; x++) {
+        FR(0,x,y,z) = 1.0;
+        FR(1,x,y,z) = (double)(z-y)+10.0;
+        FR(2,x,y,z) = (double)(x-z)+20.0;
+        FR(3,x,y,z) = (double)(y-x)+30.0;
+        FR(4,x,y,z) = 100.0;
+      }
+    }
+  }
+ 
+  /* add jaggedness; even and odd rows receive the same treatment, so a single
+     sweep over all rows is sufficient                                          */
+  for (z=kstart; z<=kend; z++) {
+    for (y=0; y<jn; y++) {
+      for (x=0; x<in; x++) {
+        bump = (x%2) ? 1.0 : -1.0;
+        for (comp=0; comp<NCOMP; comp++) FR(comp,x,y,z) += bump;
+      }
+    }
+  }
+  return;
+}
+ 
+#define FLUX_X(c,i,j,k) flux_x[(c)+NCOMP*((i)+in*((j)+(k-kstart+1)*jn))]
+#define FLUX_Y(c,i,j,k) flux_y[(c)+NCOMP*((i)+in*((j)+(k-kstart+1)*jn))]
+#define FLUX_Z(c,i,j,k) flux_z[(c)+NCOMP*((i)+in*((j)+(k-kstart+1)*jn))]
+ 
+/* Perform NITER sweeps over one tile of in x jn points and planes
+   kstart..kend. Before every sweep the ghost planes kstart-1 and kend+1 are
+   refreshed from the left and right team neighbors (-1 = no neighbor); rank
+   is the position of this UE in its team and decides who sends first. Each
+   sweep computes the three flux arrays and updates data_frame in place.
+   With power management enabled, the pending power request is completed at
+   sweep 2 and a return to BASE_F is requested at sweep NITER-2.                */
+void do_work(int in, int jn, int kstart, int kend, int left, int right, int rank,
+             double *data_frame, double *flux_x, double *flux_y, double *flux_z,
+             RCCE_REQUEST *request) {
+ 
+  int i, j, k, c, iter, phase, fdiv, vlevel;
+  double vx = 1.0, vy = 1.0, vz = 1.0;
+  double dt = 0.0001;
+ 
+  for (iter=0; iter<NITER; iter++) {
+ 
+    if (iter==2 && power_change) {
+      RCCE_wait_power(request);
+    }
+    /* logical, not bitwise, AND: both operands are conditions                  */
+    if (iter==NITER-2 && power_change) {
+      RCCE_iset_power(BASE_F, request, &fdiv, &vlevel);
+    }
+    /* before each iteration we need to fill ghost points with neighbor data.
+       Shift to the right first: in phase 0 the even ranks send and the odd
+       ranks receive, in phase 1 the roles are swapped, so a send always meets
+       a matching receive                                                       */
+    for (phase=0; phase<2; phase++) {
+      if (right != -1 && (rank+phase+1)%2) {
+         RCCE_send((char *)(&FR(0,0,0,kend)),in*jn*NCOMP*sizeof(double), right);
+      }
+      if (left  != -1 && (rank+phase)%2) {
+         RCCE_recv((char *)(&FR(0,0,0,kstart-1)),in*jn*NCOMP*sizeof(double), left);
+      }
+    }
+    /* then shift to the left, using the same two-phase ordering                */
+    for (phase=0; phase<2; phase++) {
+      if (left != -1 && (rank+phase+1)%2)
+         RCCE_send((char *)(&FR(0,0,0,kstart)),in*jn*NCOMP*sizeof(double), left);
+      if (right  != -1 && (rank+phase)%2) 
+         RCCE_recv((char *)(&FR(0,0,0,kend+1)),in*jn*NCOMP*sizeof(double), right);
+    }
+    /* only points with a full set of neighbors in all directions are updated   */
+    for (k=max(kstart,1); k<=min(NZ-2,kend); k++) for (j=1; j<jn-1; j++) 
+    for (i=1; i<in-1; i++) 
+    for (c=0; c<NCOMP; c++){
+      /* one-sided difference in x, averaged over the neighbors in y and z      */
+      FLUX_X(c,i,j,k) = 
+        (3.0*FR(c,i+1,j+1,k  ) - 4.0*FR(c,i,j+1,k  ) + FR(c,i-1,j+1,k  ))/16.0 +
+        (3.0*FR(c,i+1,j  ,k+1) - 4.0*FR(c,i,j,  k+1) + FR(c,i-1,j,  k+1))/16.0 +
+        (3.0*FR(c,i+1,j+1,k+1) - 4.0*FR(c,i,j+1,k+1) + FR(c,i-1,j+1,k+1))/32.0 +
+        (3.0*FR(c,i+1,j-1,k  ) - 4.0*FR(c,i,j-1,k  ) + FR(c,i-1,j-1,k  ))/16.0 +
+        (3.0*FR(c,i+1,j  ,k-1) - 4.0*FR(c,i,j,  k-1) + FR(c,i-1,j,  k-1))/16.0 +
+        (3.0*FR(c,i+1,j-1,k-1) - 4.0*FR(c,i,j-1,k-1) + FR(c,i-1,j-1,k-1))/32.0 +
+        (3.0*FR(c,i+1,j-1,k+1) - 4.0*FR(c,i,j-1,k+1) + FR(c,i-1,j-1,k+1))/32.0 +
+        (3.0*FR(c,i+1,j+1,k-1) - 4.0*FR(c,i,j+1,k-1) + FR(c,i-1,j+1,k-1))/32.0 +
+        (3.0*FR(c,i+1,j  ,k  ) - 4.0*FR(c,i,j,  k  ) + FR(c,i-1,j,  k  ))/8.0;
+  
+      /* one-sided difference in y, averaged over the neighbors in x and z      */
+      FLUX_Y(c,i,j,k) = 
+        (3.0*FR(c,i+1,j+1,k  ) - 4.0*FR(c,i+1,j,k  ) + FR(c,i+1,j-1,k  ))/16.0 +
+        (3.0*FR(c,i  ,j+1,k+1) - 4.0*FR(c,i  ,j,k+1) + FR(c,i  ,j-1,k+1))/16.0 +
+        (3.0*FR(c,i+1,j+1,k+1) - 4.0*FR(c,i+1,j,k+1) + FR(c,i+1,j-1,k+1))/32.0 +
+        (3.0*FR(c,i-1,j+1,k  ) - 4.0*FR(c,i-1,j,k  ) + FR(c,i-1,j-1,k  ))/16.0 +
+        (3.0*FR(c,i  ,j+1,k-1) - 4.0*FR(c,i  ,j,k-1) + FR(c,i  ,j-1,k-1))/16.0 +
+        (3.0*FR(c,i-1,j+1,k-1) - 4.0*FR(c,i-1,j,k-1) + FR(c,i-1,j-1,k-1))/32.0 +
+        (3.0*FR(c,i-1,j+1,k+1) - 4.0*FR(c,i-1,j,k+1) + FR(c,i-1,j-1,k+1))/32.0 +
+        (3.0*FR(c,i+1,j+1,k-1) - 4.0*FR(c,i+1,j,k-1) + FR(c,i+1,j-1,k-1))/32.0 +
+        (3.0*FR(c,i  ,j+1,k  ) - 4.0*FR(c,i  ,j,k  ) + FR(c,i  ,j-1,k  ))/8.0;
+  
+      /* one-sided difference in z, averaged over the neighbors in x and y.
+         This must go to FLUX_Z: it used to overwrite FLUX_Y, which left the
+         z flux read below uninitialized                                        */
+      FLUX_Z(c,i,j,k) = 
+        (3.0*FR(c,i+1,j  ,k+1) - 4.0*FR(c,i+1,j  ,k) + FR(c,i+1,j  ,k-1))/16.0 +
+        (3.0*FR(c,i  ,j+1,k+1) - 4.0*FR(c,i  ,j+1,k) + FR(c,i  ,j+1,k-1))/16.0 +
+        (3.0*FR(c,i+1,j+1,k+1) - 4.0*FR(c,i+1,j+1,k) + FR(c,i+1,j+1,k-1))/32.0 +
+        (3.0*FR(c,i-1,j  ,k+1) - 4.0*FR(c,i-1,j  ,k) + FR(c,i-1,j  ,k-1))/16.0 +
+        (3.0*FR(c,i  ,j-1,k+1) - 4.0*FR(c,i  ,j-1,k) + FR(c,i  ,j-1,k-1))/16.0 +
+        (3.0*FR(c,i-1,j-1,k+1) - 4.0*FR(c,i-1,j-1,k) + FR(c,i-1,j-1,k-1))/32.0 +
+        (3.0*FR(c,i-1,j+1,k+1) - 4.0*FR(c,i-1,j+1,k) + FR(c,i-1,j+1,k-1))/32.0 +
+        (3.0*FR(c,i+1,j-1,k+1) - 4.0*FR(c,i+1,j-1,k) + FR(c,i+1,j-1,k-1))/32.0 +
+        (3.0*FR(c,i  ,j  ,k+1) - 4.0*FR(c,i  ,j  ,k) + FR(c,i  ,j  ,k-1))/8.0;
+  
+      /* explicit update: weighted fluxes plus second differences in x, y, z   */
+      FR(c,i,j,k) += dt*(
+         -1.0*(vx*FLUX_X(c,i,j,k) + vy*FLUX_Y(c,i,j,k) + vz*FLUX_Z(c,i,j,k)) + 
+              FR(c,i+1,j,k) -2.0*FR(c,i,j,k) + FR(c,i-1,j,k) +
+              FR(c,i,j+1,k) -2.0*FR(c,i,j,k) + FR(c,i,j-1,k) +
+              FR(c,i,j,k+1) -2.0*FR(c,i,j,k) + FR(c,i,j,k-1));
+    }
+  }
+  return;
+}
+ 
+/* Turn the work item into the next one to be handed out: the sequence number
+   is advanced by one and wraps around after npx*npy items. wq_pars is not
+   consulted. Always returns RCCE_SUCCESS.                                      */
+int RCCE_new_work_item(void *work_item, QUEUE_PARMS *wq_pars) {
+  WORK_ITEM *item = (WORK_ITEM *)work_item;
+  int tile_count  = item->npx*item->npy;
+  int next_seq    = item->dynamic_part.seq_number+1;
+ 
+  item->dynamic_part.seq_number = next_seq%tile_count;
+  return(RCCE_SUCCESS);
+}

+ 23 - 0
RCCE_V2.0/apps/ECOQ/RCCE_pwr_wq.h

@@ -0,0 +1,23 @@
+/* Declarations shared by the power-aware work queue framework and the
+   application that plugs into it. RCCE_MAXNP is not defined here; both users
+   visible in this commit include RCCE.h first, which presumably provides it.  */
+#ifndef RCCE_PWR_WQ_H
+#define RCCE_PWR_WQ_H
+ 
+/* Per-UE description of the queue topology, filled in by
+   RCCE_setup_work_queue_teams().                                               */
+typedef struct {
+  int NP;                       /* number of UEs, from RCCE_num_ues()           */
+  int ID;                       /* rank of this UE, from RCCE_ue()              */
+  int master;                   /* rank of the overall queue master (NP-1)      */
+  int team_lead;                /* rank returned by RCCE_power_domain_master()  */
+  int local_rank;               /* index of this UE within team_member          */
+  int team_size;                /* UEs in the team, not counting the overall
+                                   master; 0 on the overall master itself       */
+  int team_member[RCCE_MAXNP];  /* ranks of the team, sorted with id_compare    */
+  int master_list[RCCE_MAXNP];  /* ranks of all team leads (filled on master)   */
+  int master_number;            /* number of valid entries in master_list       */
+} QUEUE_PARMS;
+ 
+/* implemented by the application                                               */
+int RCCE_execute_work_item(void *, QUEUE_PARMS *);
+int RCCE_new_work_item(void *, QUEUE_PARMS *);
+int RCCE_WI_size(void *);
+void *RCCE_WI_address(void *);
+ 
+/* implemented by the framework                                                 */
+int RCCE_setup_work_queue_teams(QUEUE_PARMS *);
+int RCCE_queue_master_loop(void *, QUEUE_PARMS *);
+int RCCE_queue_member_loop(void *, QUEUE_PARMS *);
+ 
+#ifdef OPENMP_
+/* a variable must be declared before it can be listed in a threadprivate
+   directive; the definition itself lives in the application source.
+   NOTE(review): the guard is spelled OPENMP_, not the compiler-provided
+   _OPENMP - confirm that the build defines it.                                 */
+extern int power_change;
+#pragma omp threadprivate(power_change)
+#endif
+ 
+#endif /* RCCE_PWR_WQ_H */

+ 156 - 0
RCCE_V2.0/apps/ECOQ/RCCE_pwr_wq_framework.c

@@ -0,0 +1,156 @@
+#include "RCCE.h"
+#include "RCCE_pwr_wq.h"
+#include <stdio.h>
+ 
+int RCCE_WI_valid(void *);
+int  RCCE_qsort(char *, size_t, size_t, int (*)(const void*, const void*));
+/* comparison function used in routine to sort core IDs                  */
+int id_compare(const void *e1, const void *e2);
+ 
+/* Build the work queue topology for the calling UE and store it in *wq_pars.
+   Must be called by every UE: the members of a power domain exchange ranks
+   with their domain master (the team lead), and every UE except the overall
+   master tells the overall master whether it is a team lead. The UE with the
+   highest rank (NP-1) becomes the overall master and is taken out of its team.
+   Returns RCCE_SUCCESS.                                                        */
+int RCCE_setup_work_queue_teams(QUEUE_PARMS *wq_pars){
+ 
+  int NP, ID, ue, size, master, team_lead, team_size;
+  int test, isleader;
+  int *team_member, *master_list;
+ 
+  NP = wq_pars->NP = RCCE_num_ues();
+  ID = wq_pars->ID = RCCE_ue();
+  team_member = wq_pars->team_member;
+  master_list = wq_pars->master_list;
+  /* only the overall master fills the list of leads; give everybody else a
+     defined (empty) list instead of leaving the counter unset                  */
+  wq_pars->master_number = 0;
+ 
+/* determine the number of UEs in the local power domain and form teams         */
+  wq_pars->team_size = team_size = RCCE_power_domain_size();
+  wq_pars->team_lead = team_lead = RCCE_power_domain_master();
+  if (team_lead == ID) {
+    /* the team lead is the first team member                                   */
+    team_member[0] = team_lead;
+    size = 1;
+    /* the team lead collects IDs from its team members by polling all other
+       UEs; probing stops as soon as the list is complete, so that
+       team_member[size] is never addressed beyond the end of the team          */
+    while (size<team_size) {
+      for (ue=0; ue<NP && size<team_size; ue++) {
+        if (ue == team_lead) continue;
+        RCCE_recv_test((char *)(&(team_member[size])), sizeof(int), ue, &test);
+        if (test) team_member[size++] = ue;
+      }
+    }
+    /* ... and sends the list to all other team members, after sorting it       */
+    RCCE_qsort((char *)team_member, team_size, sizeof(int), id_compare);
+    for (ue=1; ue<team_size; ue++) 
+      RCCE_send((char *)team_member, team_size*sizeof(int), team_member[ue]);
+  }
+  else {
+    /* team members check in with the team lead ...                             */
+    RCCE_send((char *)(&ID), sizeof(int), team_lead);
+    /* ... and receive the complete list of team members                        */
+    RCCE_recv((char *)team_member, team_size*sizeof(int), team_lead);
+  }
+ 
+  /* we assign the UE with the highest rank the role of master. We know that
+     this UE is either in a power domain by itself, or there is another UE  
+     in the same power domain who is the power domain master, because the
+     power domain master is always the UE in that domain with the lowest rank   */
+  master = wq_pars->master = NP-1;
+ 
+/* the team containing the overall master must remove it from its member list   */
+  if (team_member[team_size-1] == master) wq_pars->team_size = --team_size;
+ 
+  /* the overall master is not in any team                                      */
+  if (ID==master) team_size = wq_pars->team_size = 0;
+ 
+/* each UE determines its rank within the team                                  */
+  wq_pars->local_rank = 0;
+  for (ue=0; ue<team_size; ue++) if (ID==team_member[ue]) 
+    wq_pars->local_rank = ue;
+ 
+/* this code determines number of power domain leads, plus list of UEs          */
+  if (ID == master) {
+    /* reuse the function-scope counter: a C99 loop-scope declaration would
+       shadow it and does not compile in C89 mode                               */
+    for (ue=0; ue<NP-1; ue++) {
+      /* ask each core whether it is a team lead or not                         */
+      RCCE_recv((char *)(&isleader), sizeof(int), ue);
+      if (isleader) {
+        master_list[wq_pars->master_number] = ue;
+        (wq_pars->master_number)++;
+      }
+    }
+  }
+  else {
+    /* all cores let the master know their team lead status                     */
+    isleader = (ID == team_lead);
+    RCCE_send((char *)(&isleader), sizeof(int), master);
+  }
+ 
+  return (RCCE_SUCCESS);
+}
+ 
+/* One pass of the overall master over all team leads.
+   For a valid work item, each lead whose request has already arrived is sent
+   the current item and a new item is generated; leads without a pending
+   request are skipped. For an invalid (stop) item, the master blocks on every
+   lead in turn and sends it the stop item.
+   Returns the number of valid work items handed out during this pass.          */
+int RCCE_queue_master_loop(void *work_item, QUEUE_PARMS *wq_pars){
+ 
+  int idx, lead, request, arrived;
+  int handed_out = 0;
+  int nbytes     = RCCE_WI_size(work_item);
+  char *payload  = (char *)RCCE_WI_address(work_item);
+ 
+  if (!RCCE_WI_valid(work_item)) {
+    /* this pass ends all teams, so insist that each team checks in             */
+    for (idx=0; idx<wq_pars->master_number; idx++) {
+      lead = wq_pars->master_list[idx];
+      RCCE_recv((char *)(&request), sizeof(int), lead);
+      RCCE_send(payload, nbytes, lead);
+    }
+    return(handed_out);
+  }
+ 
+  /* service pending work requests; the content of a request is ignored         */
+  for (idx=0; idx<wq_pars->master_number; idx++) {
+    lead = wq_pars->master_list[idx];
+    RCCE_recv_test((char *)(&request), sizeof(int), lead, &arrived);
+    if (!arrived) continue;
+    RCCE_send(payload, nbytes, lead);
+    handed_out++;
+    /* generate the next work item                                              */
+    RCCE_new_work_item(work_item, wq_pars);
+  }
+ 
+  return(handed_out);
+}
+ 
+/* One iteration of the worker side of the queue. A team lead requests a work
+   item from the overall master and forwards it to the other members of its
+   team; every other UE receives the item from its team lead. A valid item is
+   executed.
+   Returns RCCE_SUCCESS after executing an item, or 1 when the stop item was
+   received.                                                                    */
+int RCCE_queue_member_loop(void *work_item, QUEUE_PARMS *wq_pars) {
+ 
+  /* the master ignores the content of a request, but the buffer is still
+     transmitted, so it must hold a defined value                               */
+  int gimme_work = 1;
+  int mem;
+  int size = RCCE_WI_size(work_item);
+  void *address = RCCE_WI_address(work_item);
+ 
+  /* ask for work if I am a team lead                                          */
+  if (wq_pars->ID == wq_pars->team_lead) {
+    RCCE_send((char *)(&gimme_work), sizeof(int), wq_pars->master);
+    RCCE_recv((char *)address, size, wq_pars->master);
+    /* team leads parcel out the work to the workers; entry 0 of the member
+       list is the lead itself                                                  */
+    for (mem=1; mem<(wq_pars->team_size); mem++) {
+      printf("Team lead %d sends work to UE %d\n", RCCE_ue(), wq_pars->team_member[mem]);
+      fflush(0);
+      RCCE_send((char *)address, size, wq_pars->team_member[mem]);
+    }
+  }
+  else {
+    RCCE_recv((char *)address, size, wq_pars->team_lead);
+  }
+  if (RCCE_WI_valid(work_item)) {
+    /* NOTE(review): the status returned by RCCE_execute_work_item is dropped  */
+    RCCE_execute_work_item(work_item, wq_pars);
+  }
+  else {
+    /* stop item received: tell the caller to leave its loop                    */
+    return(1);
+  }
+  return(RCCE_SUCCESS);
+}

+ 2 - 0
RCCE_V2.0/apps/ECOQ/README

@@ -0,0 +1,2 @@
+Please note that ECOQ is still under development and may be unstable.
+

+ 17 - 0
RCCE_V2.0/apps/FLUSH/.svn/all-wcprops

@@ -0,0 +1,17 @@
+K 25
+svn:wc:ra_dav:version-url
+V 54
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/FLUSH
+END
+Makefile
+K 25
+svn:wc:ra_dav:version-url
+V 63
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/FLUSH/Makefile
+END
+RCCE_test_cacheable.c
+K 25
+svn:wc:ra_dav:version-url
+V 76
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/FLUSH/RCCE_test_cacheable.c
+END

+ 96 - 0
RCCE_V2.0/apps/FLUSH/.svn/entries

@@ -0,0 +1,96 @@
+10
+
+dir
+313
+http://marcbug.scc-dc.com/svn/repository/tags/RCCE_V2.0/apps/FLUSH
+http://marcbug.scc-dc.com/svn/repository
+
+
+
+2011-04-11T21:00:28.037293Z
+188
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+c924d837-3317-4ba4-8fbd-5f2da8699d51
+
+RCCE_test_cacheable.c
+file
+
+
+
+
+2012-10-27T13:42:38.864598Z
+86ae06d5fc3ecbb2cd5cdcd377528404
+2011-02-23T19:51:16.745747Z
+161
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2723
+
+Makefile
+file
+
+
+
+
+2012-10-27T13:42:38.864598Z
+983fdf6020504a7d448bba54f62689e4
+2011-04-11T21:00:28.037293Z
+188
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+440
+

+ 19 - 0
RCCE_V2.0/apps/FLUSH/.svn/text-base/Makefile.svn-base

@@ -0,0 +1,19 @@
+SHELL=sh
+RCCEROOT=../..
+include $(RCCEROOT)/common/symbols
+
+CACHEABLEOBJS=RCCE_test_cacheable.o  $(ARCHIVE)
+
+default:
+	@echo "Usage: make test_cacheable "
+	@echo "       make clean"
+
+test_cacheable: $(CACHEABLEOBJS)
+	$(CCOMPILE) -o test_cacheable $(CACHEABLEOBJS) $(CFLAGS)
+
+
+RCCE_test_cacheable.o: RCCE_test_cacheable.c $(RCCEINCLUDE)/RCCE.h Makefile
+	$(CCOMPILE) -c $(CFLAGS) RCCE_test_cacheable.c  
+
+clean:
+	@ rm -f *.o  test_cacheable 

+ 90 - 0
RCCE_V2.0/apps/FLUSH/.svn/text-base/RCCE_test_cacheable.c.svn-base

@@ -0,0 +1,90 @@
+#include <string.h>
+#include <stdio.h>
+#include "RCCE.h"
+
+int RCCE_APP(int argc, char **argv){ /* cache flush test: UE 0 writes a shared buffer, UE 1 reads it back */
+  int iam, bufsize=1024*64, size, i, receiver, sender,
+    count_receiver, count_sender, count1, count2;
+  volatile int *buffer;
+
+  RCCE_init(&argc, &argv);
+
+  iam      = RCCE_ue();
+  receiver =1;
+  sender   =0;
+  size   = bufsize*sizeof(int); /* buffer size in bytes */
+  buffer = (int *) RCCE_shmalloc(size); /* NOTE(review): the result is not checked before use */
+  count_receiver = count_sender = 0;
+
+/**********************************************************
+The sender initializes its data.
+Now this is shared data so value is "seen" by both cores.
+The receiver flushes its cache.
+***********************************************************/
+     if(iam==sender) {
+        for(i=0;i<bufsize; i++) { buffer[i]=1; }
+     }
+
+     if(iam==receiver) {RCCE_DCMflush();}
+  RCCE_barrier(&RCCE_COMM_WORLD);
+
+/**********************************************************
+The sender reads its data.
+It reads by accumulating count_sender.
+count_sender (on the sender) is 64K = 65536.
+count_sender (on the receiver) is 0.
+
+The sender modifies its data.
+Now these data are in the sender's cache. So the data may not be seen by the receiver.
+It might be seen by the receiver. We have no control when data from the cache are evicted.
+
+The sender flushes its cache.
+This guarantees that the receiver sees the data from the sender.
+***********************************************************/
+     if(iam==sender) {
+        for(i=0;i<bufsize; i++) {
+           count_sender +=buffer[i];
+           buffer[i]++;
+        }
+        RCCE_DCMflush();
+     }
+  RCCE_barrier(&RCCE_COMM_WORLD);
+
+
+/**********************************************************
+The receiver reads the data.
+It should see the data from the sender.
+count_receiver (on the receiver) should be 128K = 131072.
+count_receiver (on the sender) is 0.
+***********************************************************/
+     if(iam==receiver) {
+        for(i=0;i<bufsize; i++) { 
+           count_receiver +=buffer[i]; 
+        }
+     }
+  RCCE_barrier(&RCCE_COMM_WORLD);
+
+/**********************************************************
+count1 and count2 are computed on every core.
+
+count2 contains the number of buffer entries that are 2 (which
+should be all of them). So count2 should be 64K.
+
+count1 counts the entries that are still 1 and should be 0.
+
+***********************************************************/
+
+     count1= count2= 0;
+     for(i=0;i<bufsize; i++) {
+        if(buffer[i]==2) count2++;
+        if(buffer[i]==1) count1++;
+     }
+
+     printf("LINE %d: Core %d: count_sender: %d  count_receiver: %d  count1: %d   count2: %d\n",
+       __LINE__,iam,count_sender, count_receiver,count1,count2);
+     /* the source line number is part of the output above: adding or removing
+        lines before the printf changes what is printed */
+
+  RCCE_barrier(&RCCE_COMM_WORLD);
+     RCCE_shfree((t_vcharp)buffer);
+     RCCE_finalize();
+     return(0);
+}

+ 19 - 0
RCCE_V2.0/apps/FLUSH/Makefile

@@ -0,0 +1,19 @@
+SHELL=sh
+RCCEROOT=../..
+include $(RCCEROOT)/common/symbols
+
+CACHEABLEOBJS=RCCE_test_cacheable.o  $(ARCHIVE)
+
+default:
+	@echo "Usage: make test_cacheable "
+	@echo "       make clean"
+
+test_cacheable: $(CACHEABLEOBJS)
+	$(CCOMPILE) -o test_cacheable $(CACHEABLEOBJS) $(CFLAGS)
+
+
+RCCE_test_cacheable.o: RCCE_test_cacheable.c $(RCCEINCLUDE)/RCCE.h Makefile
+	$(CCOMPILE) -c $(CFLAGS) RCCE_test_cacheable.c  
+
+clean:
+	@ rm -f *.o  test_cacheable 

+ 90 - 0
RCCE_V2.0/apps/FLUSH/RCCE_test_cacheable.c

@@ -0,0 +1,90 @@
+#include <string.h>
+#include <stdio.h>
+#include "RCCE.h"
+
+/*
+ * Cache-flush test between UE 0 (the sender) and UE 1 (the receiver) on a
+ * buffer obtained from RCCE_shmalloc(). The sender writes 1 into every entry,
+ * then sums and increments the entries and calls RCCE_DCMflush(); after a
+ * barrier the receiver sums the entries. Every core finally counts how many
+ * entries it sees as 1 and as 2 and prints all four counters.
+ *
+ * argc/argv are handed to RCCE_init(). Always returns 0.
+ *
+ * NOTE(review): the result of RCCE_shmalloc() is used without any check for
+ * allocation failure, and the UE numbers 0 and 1 are hard-coded, so with
+ * fewer than two UEs no core ever takes the receiver branch.
+ */
+int RCCE_APP(int argc, char **argv){
+  int iam, bufsize=1024*64, size, i, receiver, sender,
+    count_receiver, count_sender, count1, count2;
+  volatile int *buffer;
+
+  RCCE_init(&argc, &argv);
+
+  iam      = RCCE_ue();
+  receiver =1;
+  sender   =0;
+  size   = bufsize*sizeof(int);
+  buffer = (int *) RCCE_shmalloc(size);
+  count_receiver = count_sender = 0;
+
+/**********************************************************
+Step 1: the sender initializes every buffer entry to 1.
+The buffer is shared data, so the values are meant to be "seen"
+by both cores.
+The receiver flushes its cache.
+***********************************************************/
+     if(iam==sender) {
+        for(i=0;i<bufsize; i++) { buffer[i]=1; }
+     }
+
+     if(iam==receiver) {RCCE_DCMflush();}
+  RCCE_barrier(&RCCE_COMM_WORLD);
+
+/**********************************************************
+Step 2: the sender reads its data back, accumulating count_sender.
+count_sender (on the sender) is 64K = 65536 (bufsize entries of value 1).
+count_sender (on the receiver) is 0.
+
+The sender then increments every entry, so each becomes 2.
+These data are now in the sender's cache, so the receiver may or may
+not see them yet: we have no control over when cache lines are evicted.
+
+The sender flushes its cache.
+This guarantees that the receiver sees the data from the sender.
+***********************************************************/
+     if(iam==sender) {
+        for(i=0;i<bufsize; i++) {
+           count_sender +=buffer[i];
+           buffer[i]++;
+        }
+        RCCE_DCMflush();
+     }
+  RCCE_barrier(&RCCE_COMM_WORLD);
+
+
+/**********************************************************
+Step 3: the receiver reads the data.
+It should see the data from the sender.
+count_receiver (on the receiver) should be 128K = 131072
+(bufsize entries of value 2).
+count_receiver (on the sender) is 0.
+***********************************************************/
+     if(iam==receiver) {
+        for(i=0;i<bufsize; i++) { 
+           count_receiver +=buffer[i]; 
+        }
+     }
+  RCCE_barrier(&RCCE_COMM_WORLD);
+
+/**********************************************************
+Step 4: count1 and count2 are computed on every core.
+
+count2 contains the number of buffer entries that are 2 (which
+should be all of them). So count2 should be 64K.
+
+count1 contains the number of entries that are still 1 and
+should be 0.
+
+***********************************************************/
+
+     count1= count2= 0;
+     for(i=0;i<bufsize; i++) {
+        if(buffer[i]==2) count2++;
+        if(buffer[i]==1) count1++;
+     }
+
+     printf("LINE %d: Core %d: count_sender: %d  count_receiver: %d  count1: %d   count2: %d\n",
+       __LINE__,iam,count_sender, count_receiver,count1,count2);
+
+  /* Wait until every core has finished reading before the buffer is freed. */
+  RCCE_barrier(&RCCE_COMM_WORLD);
+     RCCE_shfree((t_vcharp)buffer);
+     RCCE_finalize();
+     return(0);
+}

+ 17 - 0
RCCE_V2.0/apps/HELLO/.svn/all-wcprops

@@ -0,0 +1,17 @@
+K 25
+svn:wc:ra_dav:version-url
+V 54
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/HELLO
+END
+Makefile
+K 25
+svn:wc:ra_dav:version-url
+V 63
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/HELLO/Makefile
+END
+RCCE_hello.c
+K 25
+svn:wc:ra_dav:version-url
+V 67
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/HELLO/RCCE_hello.c
+END

+ 96 - 0
RCCE_V2.0/apps/HELLO/.svn/entries

@@ -0,0 +1,96 @@
+10
+
+dir
+313
+http://marcbug.scc-dc.com/svn/repository/tags/RCCE_V2.0/apps/HELLO
+http://marcbug.scc-dc.com/svn/repository
+
+
+
+2011-03-24T16:10:11.693391Z
+176
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+c924d837-3317-4ba4-8fbd-5f2da8699d51
+
+RCCE_hello.c
+file
+
+
+
+
+2012-10-27T13:42:38.892598Z
+3925eebd9980ffb722501bd7395812b2
+2011-03-24T16:10:11.693391Z
+176
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+991
+
+Makefile
+file
+
+
+
+
+2012-10-27T13:42:38.892598Z
+419fe8236fac4b64be5bf20eed6d71aa
+2010-12-27T18:51:02.240775Z
+131
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+347
+

+ 19 - 0
RCCE_V2.0/apps/HELLO/.svn/text-base/Makefile.svn-base

@@ -0,0 +1,19 @@
+# Builds the RCCE "hello" example program.
+# CCOMPILE, CFLAGS, ARCHIVE and RCCEINCLUDE are not set in this file;
+# presumably they are provided by $(RCCEROOT)/common/symbols.
+SHELL=sh
+RCCEROOT=../..
+include $(RCCEROOT)/common/symbols
+
+HELLOOBJS=RCCE_hello.o  $(ARCHIVE)
+
+# Neither target produces a file of its own name; declaring them phony keeps
+# them working even if a file called "default" or "clean" exists here.
+.PHONY: default clean
+
+# Running plain "make" only prints the usage text.
+default:
+	@echo "Usage: make hello "
+	@echo "       make clean"
+
+hello: $(HELLOOBJS)
+	$(CCOMPILE) -o hello $(HELLOOBJS) $(CFLAGS)
+
+
+RCCE_hello.o: RCCE_hello.c $(RCCEINCLUDE)/RCCE.h
+	$(CCOMPILE) -c $(CFLAGS) RCCE_hello.c  
+
+clean:
+	@ rm -f *.o  hello 

+ 37 - 0
RCCE_V2.0/apps/HELLO/.svn/text-base/RCCE_hello.c.svn-base

@@ -0,0 +1,37 @@
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+#include <string.h>
+#include <stdio.h>
+#include "RCCE.h"
+
+
+// Minimal RCCE program: start the library, print a single greeting line
+// (naming RCCE_VERSION when that macro is defined) and shut down again.
+// argc/argv are handed to RCCE_init(). Always returns 0.
+int RCCE_APP(int argc, char **argv)
+{
+  RCCE_init(&argc, &argv);
+  //  RCCE_debug_set(RCCE_DEBUG_ALL);   // left disabled; debugging aid
+
+#if defined(RCCE_VERSION)
+  printf("Hello from RCCE ... I am %s\n", RCCE_VERSION);
+#else
+  fputs("Hello from RCCE \n", stdout);
+#endif
+
+  RCCE_finalize();
+  return 0;
+}
+

+ 19 - 0
RCCE_V2.0/apps/HELLO/Makefile

@@ -0,0 +1,19 @@
+# Builds the RCCE "hello" example program.
+# CCOMPILE, CFLAGS, ARCHIVE and RCCEINCLUDE are not set in this file;
+# presumably they are provided by $(RCCEROOT)/common/symbols.
+SHELL=sh
+RCCEROOT=../..
+include $(RCCEROOT)/common/symbols
+
+HELLOOBJS=RCCE_hello.o  $(ARCHIVE)
+
+# Neither target produces a file of its own name; declaring them phony keeps
+# them working even if a file called "default" or "clean" exists here.
+.PHONY: default clean
+
+# Running plain "make" only prints the usage text.
+default:
+	@echo "Usage: make hello "
+	@echo "       make clean"
+
+hello: $(HELLOOBJS)
+	$(CCOMPILE) -o hello $(HELLOOBJS) $(CFLAGS)
+
+
+RCCE_hello.o: RCCE_hello.c $(RCCEINCLUDE)/RCCE.h
+	$(CCOMPILE) -c $(CFLAGS) RCCE_hello.c  
+
+clean:
+	@ rm -f *.o  hello 

+ 37 - 0
RCCE_V2.0/apps/HELLO/RCCE_hello.c

@@ -0,0 +1,37 @@
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+#include <string.h>
+#include <stdio.h>
+#include "RCCE.h"
+
+
+// Minimal RCCE program: start the library, print a single greeting line
+// (naming RCCE_VERSION when that macro is defined) and shut down again.
+// argc/argv are handed to RCCE_init(). Always returns 0.
+int RCCE_APP(int argc, char **argv)
+{
+  RCCE_init(&argc, &argv);
+  //  RCCE_debug_set(RCCE_DEBUG_ALL);   // left disabled; debugging aid
+
+#if defined(RCCE_VERSION)
+  printf("Hello from RCCE ... I am %s\n", RCCE_VERSION);
+#else
+  fputs("Hello from RCCE \n", stdout);
+#endif
+
+  RCCE_finalize();
+  return 0;
+}
+

BIN
RCCE_V2.0/apps/HELLO/RCCE_hello.o


+ 11 - 0
RCCE_V2.0/apps/NPB/.svn/all-wcprops

@@ -0,0 +1,11 @@
+K 25
+svn:wc:ra_dav:version-url
+V 52
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB
+END
+Makefile
+K 25
+svn:wc:ra_dav:version-url
+V 61
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/Makefile
+END

+ 77 - 0
RCCE_V2.0/apps/NPB/.svn/entries

@@ -0,0 +1,77 @@
+10
+
+dir
+313
+http://marcbug.scc-dc.com/svn/repository/tags/RCCE_V2.0/apps/NPB
+http://marcbug.scc-dc.com/svn/repository
+
+
+
+2010-12-27T18:19:08.586526Z
+126
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+c924d837-3317-4ba4-8fbd-5f2da8699d51
+
+LU
+dir
+
+BT
+dir
+
+config
+dir
+
+common
+dir
+
+Makefile
+file
+
+
+
+
+2012-10-27T13:42:38.860598Z
+8b1616489d56e77d35e80c21073c8ac7
+2010-06-25T23:28:47.346002Z
+7
+tekubasx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1430
+
+sys
+dir
+

+ 50 - 0
RCCE_V2.0/apps/NPB/.svn/text-base/Makefile.svn-base

@@ -0,0 +1,50 @@
+# Top-level driver for the RCCE ports of the NPB BT and LU benchmarks.
+# CLASS and NPROCS select the build and can be overridden on the command
+# line; they are forwarded to the per-benchmark makefile.
+SHELL=sh
+CLASS=U
+NPROCS=1
+
+RCCEROOT=../..
+include $(RCCEROOT)/common/symbols
+
+# None of these targets creates a file of its own name.
+.PHONY: default bt lu clean header
+
+default: header
+	@ sys/print_instructions
+
+# "&&" stops the recipe if the directory change fails, instead of running
+# make in the wrong directory; $(MAKE) re-invokes the same make program
+# that is processing this file.
+bt: clean header
+	cd sys && $(MAKE)
+	cd BT && $(MAKE) \
+               CFLAGS="$(CFLAGS)"           \
+               NPROCS=$(NPROCS)             \
+               CLASS=$(CLASS)               \
+               CCOMPILE=$(CCOMPILE)         \
+               RCCEROOT=$(RCCEROOT)         \
+               RCCEINCLUDE=$(RCCEINCLUDE)   \
+               RCK_LIB_SRC=$(RCK_LIB_SRC)   \
+               RCCE_LIB_SRC=$(RCCE_LIB_SRC) \
+               ARCHIVE=$(ARCHIVE)
+
+lu: clean header
+	cd sys && $(MAKE)
+	cd LU && $(MAKE) \
+               CFLAGS="$(CFLAGS)"           \
+               NPROCS=$(NPROCS)             \
+               CLASS=$(CLASS)               \
+               CCOMPILE=$(CCOMPILE)         \
+               RCCEROOT=$(RCCEROOT)         \
+               RCCEINCLUDE=$(RCCEINCLUDE)   \
+               RCK_LIB_SRC=$(RCK_LIB_SRC)   \
+               RCCE_LIB_SRC=$(RCCE_LIB_SRC) \
+               ARCHIVE=$(ARCHIVE)
+
+
+
+# It would be nice to run "make clean" in each subdirectory (the targets
+# are defined there), but on a really clean system this won't work
+# because those makefiles need config/make.def
+clean:
+	- rm -f core 
+	- rm -f *~ */core */*~ */*.o */npbparams.h */*.obj */*.exe
+	- rm -f sys/setparams sys/makesuite sys/setparams.h
+	- rm -f BT/bt.*.* LU/lu.*.*
+
+header:
+	@ sys/print_header
+
+

+ 179 - 0
RCCE_V2.0/apps/NPB/BT/.svn/all-wcprops

@@ -0,0 +1,179 @@
+K 25
+svn:wc:ra_dav:version-url
+V 55
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT
+END
+header.h
+K 25
+svn:wc:ra_dav:version-url
+V 64
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/header.h
+END
+adi.c
+K 25
+svn:wc:ra_dav:version-url
+V 61
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/adi.c
+END
+work_lhs.h
+K 25
+svn:wc:ra_dav:version-url
+V 66
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/work_lhs.h
+END
+exact_solution.c
+K 25
+svn:wc:ra_dav:version-url
+V 72
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/exact_solution.c
+END
+initialize.c
+K 25
+svn:wc:ra_dav:version-url
+V 68
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/initialize.c
+END
+timers.c
+K 25
+svn:wc:ra_dav:version-url
+V 64
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/timers.c
+END
+verify.c
+K 25
+svn:wc:ra_dav:version-url
+V 64
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/verify.c
+END
+bt.c
+K 25
+svn:wc:ra_dav:version-url
+V 60
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/bt.c
+END
+setup_mpi.c
+K 25
+svn:wc:ra_dav:version-url
+V 67
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/setup_mpi.c
+END
+applu_share.h
+K 25
+svn:wc:ra_dav:version-url
+V 69
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/applu_share.h
+END
+timers.h
+K 25
+svn:wc:ra_dav:version-url
+V 64
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/timers.h
+END
+copy_faces.c
+K 25
+svn:wc:ra_dav:version-url
+V 68
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/copy_faces.c
+END
+print_results.c
+K 25
+svn:wc:ra_dav:version-url
+V 71
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/print_results.c
+END
+x_solve.c
+K 25
+svn:wc:ra_dav:version-url
+V 65
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/x_solve.c
+END
+exact_rhs.c
+K 25
+svn:wc:ra_dav:version-url
+V 67
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/exact_rhs.c
+END
+y_solve.c
+K 25
+svn:wc:ra_dav:version-url
+V 65
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/y_solve.c
+END
+z_solve.c
+K 25
+svn:wc:ra_dav:version-url
+V 65
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/z_solve.c
+END
+solve_subs.c
+K 25
+svn:wc:ra_dav:version-url
+V 68
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/solve_subs.c
+END
+set_constants.c
+K 25
+svn:wc:ra_dav:version-url
+V 71
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/set_constants.c
+END
+make_set.c
+K 25
+svn:wc:ra_dav:version-url
+V 66
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/make_set.c
+END
+add.c
+K 25
+svn:wc:ra_dav:version-url
+V 61
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/add.c
+END
+error.c
+K 25
+svn:wc:ra_dav:version-url
+V 63
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/error.c
+END
+define.c
+K 25
+svn:wc:ra_dav:version-url
+V 64
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/define.c
+END
+applu_macros.h
+K 25
+svn:wc:ra_dav:version-url
+V 70
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/applu_macros.h
+END
+mpinpb.h
+K 25
+svn:wc:ra_dav:version-url
+V 64
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/mpinpb.h
+END
+inputbt.data.sample
+K 25
+svn:wc:ra_dav:version-url
+V 75
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/inputbt.data.sample
+END
+applu_protos.h
+K 25
+svn:wc:ra_dav:version-url
+V 70
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/applu_protos.h
+END
+Makefile
+K 25
+svn:wc:ra_dav:version-url
+V 64
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/Makefile
+END
+rhs.c
+K 25
+svn:wc:ra_dav:version-url
+V 61
+/svn/repository/!svn/ver/297/tags/RCCE_V2.0/apps/NPB/BT/rhs.c
+END

Fichier diff supprimé car celui-ci est trop grand
+ 1014 - 0
RCCE_V2.0/apps/NPB/BT/.svn/entries


+ 65 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/Makefile.svn-base

@@ -0,0 +1,65 @@
+# Builds the BT benchmark executable bt.$(CLASS).$(NPROCS).
+# CLASS, NPROCS, CCOMPILE, CFLAGS and ARCHIVE are not set in this file;
+# they are expected from the invoking make.
+SHELL=/bin/sh
+BENCHMARK=bt
+BENCHMARKU=BT
+
+PROGRAM  = $(BENCHMARK).$(CLASS).$(NPROCS)
+
+# "config" and "clean" never create a file of their own name and must run
+# whenever they are requested, even if such a file happens to exist.
+.PHONY: config clean
+
+default:: ${PROGRAM}
+
+# This makes sure the configuration utility setparams
+# is up to date.
+# Note that this must be run every time, which is why the
+# target is declared phony above.
+config:
+	cd ../sys; ${MAKE} all
+	../sys/setparams ${BENCHMARK} ${NPROCS} ${CLASS}
+
+# Normally setparams updates npbparams.h only if the settings (CLASS/NPROCS)
+# have changed. However, we also want to update if the compile options
+# may have changed (set in ../config/make.def). 
+npbparams.h: ../config/make.def
+	@ echo make.def modified. Rebuilding npbparams.h just in case
+	rm -f npbparams.h
+	../sys/setparams ${BENCHMARK} ${NPROCS} ${CLASS}
+
+# So that "make benchmark-name" works
+${BENCHMARK}:  default
+${BENCHMARKU}: default
+
+# Header dependencies of the individual objects; the compile command itself
+# comes from the .c.o suffix rule further down.
+bt.o:             bt.c  header.h npbparams.h  mpinpb.h
+make_set.o:       make_set.c  header.h npbparams.h  mpinpb.h
+initialize.o:     initialize.c  header.h npbparams.h
+exact_solution.o: exact_solution.c  header.h npbparams.h
+exact_rhs.o:      exact_rhs.c  header.h npbparams.h
+set_constants.o:  set_constants.c  header.h npbparams.h
+adi.o:            adi.c  header.h npbparams.h
+define.o:         define.c  header.h npbparams.h
+copy_faces.o:     copy_faces.c  header.h npbparams.h  mpinpb.h
+rhs.o:            rhs.c  header.h npbparams.h
+x_solve.o:        x_solve.c  header.h work_lhs.h npbparams.h  mpinpb.h
+y_solve.o:        y_solve.c  header.h work_lhs.h npbparams.h  mpinpb.h
+z_solve.o:        z_solve.c  header.h work_lhs.h npbparams.h  mpinpb.h
+solve_subs.o:     solve_subs.c  npbparams.h
+add.o:            add.c  header.h npbparams.h
+error.o:          error.c  header.h npbparams.h  mpinpb.h
+verify.o:         verify.c  header.h npbparams.h  mpinpb.h
+setup_mpi.o:      setup_mpi.c mpinpb.h npbparams.h 
+
+
+OBJS = bt.o make_set.o initialize.o exact_solution.o \
+       exact_rhs.o set_constants.o adi.o define.o copy_faces.o  \
+       rhs.o x_solve.o y_solve.o z_solve.o add.o solve_subs.o   \
+       error.o verify.o setup_mpi.o print_results.o timers.o $(ARCHIVE) 
+
+$(PROGRAM): ${OBJS} 
+	${CCOMPILE} ${CFLAGS} -o ${PROGRAM} ${OBJS} 
+# use line below for gcc, which does not link libm by default
+#	${CCOMPILE} ${CFLAGS} -o ${PROGRAM} ${OBJS} -lm
+
+.c.o:
+	${CCOMPILE} -c $(CFLAGS)  $<
+
+clean:
+	- rm -f *.o *~ mputil*
+	- rm -f  npbparams.h core

+ 44 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/add.c.svn-base

@@ -0,0 +1,44 @@
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+#include "header.h"
+
+void  add() {
+
+//---------------------------------------------------------------------
+//     Adds the update held in rhs to the solution vector u.
+//     For every cell c = 1..ncells the i/j/k ranges run from start(d,c)
+//     up to, but not including, cell_size(d,c)-end(d,c); all five
+//     components m = 1..5 of each point are updated.
+//---------------------------------------------------------------------
+
+      int  c, i, j, k, m;
+
+      for (c = 1; c <= ncells; c++) {
+         for (k = start(3,c); k < cell_size(3,c)-end(3,c); k++) {
+            for (j = start(2,c); j < cell_size(2,c)-end(2,c); j++) {
+               for (i = start(1,c); i < cell_size(1,c)-end(1,c); i++) {
+                  for (m = 1; m <= 5; m++) {
+                     u(m,i,j,k,c) += rhs(m,i,j,k,c);
+                  }
+               }
+            }
+         }
+      }
+}

+ 34 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/adi.c.svn-base

@@ -0,0 +1,34 @@
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+#include "header.h"
+#include "RCCE.h"
+
+void  adi() {
+
+//---------------------------------------------------------------------
+//     Performs one pass of the solver by calling, in this fixed order:
+//     copy_faces(), x_solve(), y_solve(), z_solve() and add().
+//     Takes no arguments and returns nothing; the routines it calls are
+//     defined elsewhere.
+//---------------------------------------------------------------------
+
+      copy_faces();
+      x_solve();
+      y_solve();
+      z_solve();
+      add();
+
+      return;
+}
+

+ 8 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/applu_macros.h.svn-base

@@ -0,0 +1,8 @@
+/* PAD32byte(n) rounds a byte count n up to the next multiple of 32 (the
+   cacheline padded length); a value that is already a multiple of 32 is
+   returned unchanged. n is evaluated more than once, so do not pass an
+   expression with side effects. */
+#define  PAD32byte(n) ((n)%32==0 ? (n) : (n) + 32 - (n)%32)
+/* PAD32dbl(n) does the same for a count of doubles: it rounds n up to the
+   next multiple of 32/sizeof(double) elements. Because sizeof is involved
+   the arithmetic is carried out with size_t operands. n is evaluated more
+   than once. */
+#define  PAD32dbl(n)  ((n)%(32/sizeof(double))==0 ? (n) : (n) + (32/sizeof(double)) \
+                      - (n)%(32/sizeof(double)))
+
+/* Larger / smaller of two values. Each argument may be evaluated twice, so
+   avoid arguments with side effects such as i++. */
+#define max(x,y)      ((x)>(y)? (x) : (y))
+#define min(x,y)      ((x)<(y)? (x) : (y))

+ 38 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/applu_protos.h.svn-base

@@ -0,0 +1,38 @@
+/*
+ * Function prototypes shared between translation units; the definitions are
+ * not in this header.
+ *
+ * NOTE(review): several entries use an empty parameter list "()", which in
+ * C before C23 declares a function with unspecified arguments, so calls to
+ * them are not type-checked. Consider "(void)" once the definitions have
+ * been confirmed to take no arguments.
+ */
+void blts(int);
+void buts(int, double *);
+void erhs();
+void error();
+void exact(int, int, int, double *);
+void exchange_1(double *, int, int);
+void exchange_3(double *, int);
+void exchange_4(double *, double *, int, int, int, int);
+void exchange_5(double *, int, int);
+void exchange_6(double *, int, int);
+void RCCE_allreduce_d(double *, double *, int, int);
+void init_comm(int *, char ***);
+void jacld(int);
+void jacu(int);
+void l2norm(int, int, int, double *, double *);
+void neighbors();
+void pintgr();
+/* Every argument of this print_results() declaration is a pointer. */
+void print_results(char *, char *, int *,  int *, int *, int *,
+                    int *, int *, double *, double *, char *,
+                    int *, char *, char *, char *, char *, char *,
+                    char *, char *, char *, char *);
+void proc_grid();
+void bcast_inputs();
+void read_input();
+void rhs();
+void setbv();
+void setcoeff();
+void setiv();
+void ssor(int);
+void subdomain();
+/* The timer routines are declared here with an int * argument. */
+void timer_clear(int *);
+void timer_start(int *);
+void timer_stop(int *);
+void verify(double *, double *, double *, char *);
+int  nodedim();
+double timer_read(int *);
+double test_rsd();
+double test_rsd();
+

+ 60 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/applu_share.h.svn-base

@@ -0,0 +1,60 @@
+/*
+ * extern declarations of the globals shared between translation units; the
+ * definitions are not in this header. The array extents use isiz1, isiz2
+ * and isiz3, which this header does not define (presumably they come from
+ * npbparams.h -- TODO confirm).
+ */
+#include "npbparams.h"
+#include "applu_protos.h"
+#include "RCCE.h"
+
+/* One-dimensional arrays; u, rsd and frct hold 5*(isiz1+4)*(isiz2+4)*isiz3
+   doubles each, flux holds 5*(isiz1+2)*(isiz2+2)*isiz3. */
+extern double u[5*(isiz1+4)*(isiz2+4)*isiz3],
+              rsd[5*(isiz1+4)*(isiz2+4)*isiz3],
+              frct[5*(isiz1+4)*(isiz2+4)*isiz3],
+              flux[5*(isiz1+2)*(isiz2+2)*isiz3];
+/* Four arrays of 5*5*isiz1*isiz2 doubles each. */
+extern double a[5*5*isiz1*isiz2],
+              b[5*5*isiz1*isiz2],
+              c[5*5*isiz1*isiz2],
+              d[5*5*isiz1*isiz2];
+
+extern double dt, omega, tolrsd[5], rsdnm[5], errnm[5], frc, ttotal;
+extern double tolrsd1_def, tolrsd2_def, tolrsd3_def, tolrsd4_def, tolrsd5_def,
+              omega_default;
+extern double ce[5*13];
+
+extern int ndim, id, num, xdim, ydim, row, col;
+extern int ii1, ii2, ji1, ji2, ki1, ki2;
+extern int itmax, invert; 
+extern int ipr, ipr_default, inorm;
+extern int north,south,east,west;
+extern int nx0, ny0, nz0;
+extern int nx, ny, nz;
+extern int ist, iend, jst, jend, ipt, jpt;
+extern int dp_type;
+extern double tx1, ty1, tz1, 
+              dx1, dy1, dz1, 
+              tx2, ty2, tz2, 
+              dx2, dy2, dz2, 
+              tx3, ty3, tz3, 
+              dx3, dy3, dz3, 
+              dx4, dy4, dz4, 
+              dx5, dy5, dz5, 
+              dssp, c1,  c2,  
+              c3,  c4,  c5;
+extern double dxi, deta, dzeta;
+extern double npmax, maxtime;
+extern double *buf1_exch_1;
+
+/* When compiled with OpenMP, the globals listed in the pragmas below are
+   made private to each thread. */
+#ifdef _OPENMP
+#pragma omp threadprivate (nx, ny, nz, nx0, ny0, nz0, \
+                     ipt, ist, iend, jpt, jst, jend, \
+                     ii1, ii2, ji1, ji2, ki1, ki2, \
+                     dxi, deta, dzeta, \
+                     tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3)
+#pragma omp threadprivate (dx1, dx2, dx3, dx4, dx5, \
+                     dy1, dy2, dy3, dy4, dy5, \
+                     dz1, dz2, dz3, dz4, dz5, \
+                     dssp)
+#pragma omp threadprivate(u, rsd, frct, flux)
+#pragma omp threadprivate(ipr, inorm)
+#pragma omp threadprivate(itmax, invert, \
+                    dt, omega, tolrsd, rsdnm, errnm, frc, ttotal, \
+                    a, b, c, d)
+#pragma omp threadprivate(ce)
+#pragma omp threadprivate (id, ndim, num, xdim, ydim, row, col, \
+                     north,south,east,west, buf1_exch_1, npmax, maxtime)
+#endif

+ 216 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/bt.c.svn-base

@@ -0,0 +1,216 @@
+//-------------------------------------------------------------------------!
+//                                                                         !
+//        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+//                                                                         !
+//                                   B T                                   !
+//                                                                         !
+//-------------------------------------------------------------------------!
+//                                                                         !
+//    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
+//    It is described in NAS Technical Reports 95-020 and 02-007.          !
+//                                                                         !
+//    Permission to use, copy, distribute and modify this software         !
+//    for any purpose with or without fee is hereby granted.  We           !
+//    request, however, that all derived work reference the NAS            !
+//    Parallel Benchmarks 3.3. This software is provided "as is"           !
+//    without express or implied warranty.                                 !
+//                                                                         !
+//    Information on NPB 3.3, including the technical report, the          !
+//    original specifications, source code, results and information        !
+//    on how to submit new results, is available at:                       !
+//                                                                         !
+//           http://www.nas.nasa.gov/Software/NPB/                         !
+//                                                                         !
+//    Send comments or suggestions to  npb@nas.nasa.gov                    !
+//                                                                         !
+//          NAS Parallel Benchmarks Group                                  !
+//          NASA Ames Research Center                                      !
+//          Mail Stop: T27A-1                                              !
+//          Moffett Field, CA   94035-1000                                 !
+//                                                                         !
+//          E-mail:  npb@nas.nasa.gov                                      !
+//          Fax:     (650) 604-3957                                        !
+//                                                                         !
+//-------------------------------------------------------------------------!
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+//
+// Authors: R. F. Van der Wijngaart
+//          T. Harris
+//          M. Yarrow
+//
+//---------------------------------------------------------------------
+#include <stdio.h>
+#include <string.h>
+#include "RCCE.h"
+#include "applu_macros.h"
+#define G_MAIN
+#include "header.h"
+#include "mpinpb.h"
+
+#define BSIZE 132
+void make_color(void);
+void print_results(char*, char, int, int, int, int, int, int, double,
+                   double, char*, int, char*, char*, char*, char*, 
+                   char*, char*, char*, char*);
+
+//---------------------------------------------------------------------
+//      program MPBT;
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+//      Entry point of the RCCE port of the NPB BT benchmark.
+//
+//      Sets up the run with the compiled-in defaults, performs one
+//      untimed warm-up step, times niter calls of adi(), verifies the
+//      result and lets the root node print the performance summary.
+//
+//      argc, argv: forwarded to setup_mpi().
+//      Returns 0 in all cases, including the early exits taken when
+//      setup_mpi() reports a non-zero status or when a cell does not
+//      fit into the compiled array sizes.
+//---------------------------------------------------------------------
+int RCCE_APP(int argc, char **argv) {
+
+       int niter, step, c;
+       int size_ok = 1;
+       double navg, mflops, n3;
+       double t, tmax;
+       int verified;
+       char class;
+
+       if (setup_mpi(&argc, &argv)) {
+          RCCE_finalize();
+          return 0;
+       }
+
+//       RCCE_debug_set(RCCE_DEBUG_ALL);
+
+//---------------------------------------------------------------------
+//      No input file is read: every node takes the compiled-in
+//      defaults; only the root node prints the banner and settings
+//---------------------------------------------------------------------
+       if (node == root) {
+          printf("\n\n NAS Parallel Benchmarks 3.3 -- BT Benchmark\n");
+       }
+
+       niter = NITER_DEFAULT;
+       dt    = dt_default;
+       grid_points(1) = PROBLEM_SIZE;
+       grid_points(2) = PROBLEM_SIZE;
+       grid_points(3) = PROBLEM_SIZE;
+
+       if (node == root) {
+          printf(" Size: %4dx%4dx%4d\n", 
+                 grid_points(1), grid_points(2), grid_points(3));
+          printf(" Iterations: %4d    dt: %11.7f\n", niter, dt);
+          if (no_nodes != total_nodes)
+              printf(" Total number of processes: %5d\n", total_nodes);
+          if (no_nodes != MAXCELLS*MAXCELLS) 
+              printf(" WARNING: compiled for %5d processes\n",
+                     MAXCELLS*MAXCELLS);
+          printf(" Number of active processes: %5d\n\n", no_nodes);
+       }
+
+       make_set();
+       make_color();
+
+//---------------------------------------------------------------------
+//      Every cell must fit into the compiled array sizes. All
+//      offending cells are reported; the run is then abandoned rather
+//      than continuing with arrays that are too small.
+//---------------------------------------------------------------------
+       for (c = 1; c <= MAXCELLS; c++) {
+          if ( (cell_size(1,c) > IMAX) ||
+               (cell_size(2,c) > JMAX) ||
+               (cell_size(3,c) > KMAX) ) {
+             printf(" %d %d %d %d %d\n", node, c, cell_size(1,c),
+                     cell_size(2,c), cell_size(3,c));
+             printf(" Problem size too big for compiled array sizes\n");
+             size_ok = 0;
+          }
+       }
+       if (!size_ok) {
+          RCCE_finalize();
+          return 0;
+       }
+
+       set_constants();
+
+       initialize();
+
+       lhsinit();
+
+       exact_rhs();
+
+       compute_buffer_size(5);
+
+//---------------------------------------------------------------------
+//      do one time step to touch all code, and reinitialize
+//---------------------------------------------------------------------
+       adi();
+
+       initialize();
+
+       timer_clear(2);
+
+//---------------------------------------------------------------------
+//      Synchronize before placing time stamp
+//---------------------------------------------------------------------
+       RCCE_barrier(&RCCE_COMM_WORLD);
+
+       timer_clear(1);
+       timer_start(1);
+
+       for (step = 1; step <= niter; step++) {
+
+          // progress line on the first, every 20th and the last step
+          if (node == root) {
+             if ((step%20) == 0 || step == niter ||
+                 step == 1) {
+                printf(" Time step %4d\n", step);
+                fflush(stdout);
+             }
+          }
+          adi();
+       }
+
+       timer_stop(1);
+       t = timer_read(1);
+       
+       verify(niter, &class, &verified);
+
+       // the slowest node determines the reported time
+       RCCE_reduce((char*)(&t), (char*)(&tmax), 1, RCCE_DOUBLE, RCCE_MAX, root, RCCE_COMM_WORLD);
+
+       if( node == root ) {
+          n3 = 1.0e0*grid_points(1)*grid_points(2)*grid_points(3);
+          navg = (grid_points(1)+grid_points(2)+grid_points(3))/3.0;
+          // guard against a division by zero when no time was measured
+          if( tmax != 0. ) {
+             mflops = 1.0e-6*(double)(niter)*
+               (3478.8*(double)n3-17655.7*navg*navg+28023.7*navg)
+               / tmax;
+          } else {
+             mflops = 0.0;
+          }
+
+         print_results("BT", class, grid_points[0], 
+           grid_points[1], grid_points[2], niter, MAXCELLS*MAXCELLS, 
+           total_nodes, tmax, mflops, "          floating point", 
+           verified, NPBVERSION,COMPILETIME, CS1, CS2, CS3, CS4, CS5, 
+           CS6);
+
+
+//         FILE *perf_file;
+//         char name[50] = "/shared/DEMOS/RCCE/NPB_BT/perf."; 
+//         char postfix[50]; 
+//         sprintf(postfix, "%d", total_nodes); 
+//         strcat(name, postfix); 
+//         perf_file = fopen(name,"w"); 
+//         fprintf(perf_file, "%d", (int)mflops); 
+//         fclose(perf_file); 
+       }
+
+
+       RCCE_finalize();
+
+       return 0;
+
+}
+

+ 338 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/copy_faces.c.svn-base

@@ -0,0 +1,338 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include "header.h"
+#include "mpinpb.h"
+
+void copy_faces() {
+
+//---------------------------------------------------------------------
+// Exchanges cell-face values of u with neighbors, then calls compute_rhs().
+
+//---------------------------------------------------------------------
+//     
+// This function copies the face values of a variable defined on a set 
+// of cells to the overlap locations of the adjacent sets of cells. 
+// Because a set of cells interfaces in each direction with exactly one 
+// other set, we only need to fill six different buffers. We could try to
+// overlap communication with computation, by computing
+// some internal values while communicating boundary values, but this
+// adds so much overhead that it's not clearly useful. 
+//---------------------------------------------------------------------
+
+      int i, j, k, c, m, p0, p1, phase,   // p0..p5: running offsets into the six face buffers
+           p2, p3, p4, p5, b_size[6], ss[6], 
+           sr[6], error;   // NOTE(review): error is never used in this function
+
+#define b_size(m) b_size[m]   // buffer length per direction, multiplied by sizeof(double) in the transfers below
+#define ss(m) ss[m]   // start of the send segment per direction: 0=east 1=west 2=north 3=south 4=top 5=bottom
+#define sr(m) sr[m]   // start of the receive segment per direction, same numbering as ss
+
+//---------------------------------------------------------------------
+//     no faces to copy when no_nodes == 1: just compute the rhs and return
+//---------------------------------------------------------------------
+      if (no_nodes == 1) {
+         compute_rhs();
+         return;
+      }
+
+      ss(0) = start_send_east;
+      ss(1) = start_send_west;
+      ss(2) = start_send_north;
+      ss(3) = start_send_south;
+      ss(4) = start_send_top;
+      ss(5) = start_send_bottom;
+
+      sr(0) = start_recv_east;
+      sr(1) = start_recv_west;
+      sr(2) = start_recv_north;
+      sr(3) = start_recv_south;
+      sr(4) = start_recv_top;
+      sr(5) = start_recv_bottom;
+
+      b_size(0) = east_size   ;
+      b_size(1) = west_size   ;
+      b_size(2) = north_size  ;
+      b_size(3) = south_size  ;
+      b_size(4) = top_size    ;
+      b_size(5) = bottom_size ;
+
+//---------------------------------------------------------------------
+//     because the difference stencil for the diagonalized scheme is 
+//     orthogonal, we do not have to perform the staged copying of faces:
+//     all six face buffers are packed here, before any communication.
+//     p0..p5 run across all cells, so each buffer holds its faces back to back
+//---------------------------------------------------------------------
+      p0 = 0;
+      p1 = 0;
+      p2 = 0;
+      p3 = 0;
+      p4 = 0;
+      p5 = 0;
+
+      for (c = 1; c <= ncells; c++) {
+
+//---------------------------------------------------------------------
+//     fill the buffer to be sent to eastern neighbors (i-dir): last two i-layers
+//---------------------------------------------------------------------
+         if (cell_coord(1,c) != ncells) {
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = cell_size(1,c)-2; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        out_buffer(ss(0)+p0) = u(m,i,j,k,c);
+                        p0 = p0 + 1;
+                     }
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     fill the buffer to be sent to western neighbors: first two i-layers
+//---------------------------------------------------------------------
+         if (cell_coord(1,c) != 1) {
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = 0; i <= 1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        out_buffer(ss(1)+p1) = u(m,i,j,k,c);
+                        p1 = p1 + 1;
+                     }
+                  }
+               }
+            }
+
+         }
+
+//---------------------------------------------------------------------
+//     fill the buffer to be sent to northern neighbors (j_dir): last two j-layers
+//---------------------------------------------------------------------
+         if (cell_coord(2,c) != ncells) {
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = cell_size(2,c)-2; j <= cell_size(2,c)-1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        out_buffer(ss(2)+p2) = u(m,i,j,k,c);
+                        p2 = p2 + 1;
+                     }
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     fill the buffer to be sent to southern neighbors: first two j-layers
+//---------------------------------------------------------------------
+         if (cell_coord(2,c)!= 1) {
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = 0; j <= 1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        out_buffer(ss(3)+p3) = u(m,i,j,k,c);
+                        p3 = p3 + 1;
+                     }
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     fill the buffer to be sent to top neighbors (k-dir): last two k-layers
+//---------------------------------------------------------------------
+         if (cell_coord(3,c) != ncells) {
+            for (k = cell_size(3,c)-2; k <= cell_size(3,c)-1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        out_buffer(ss(4)+p4) = u(m,i,j,k,c);
+                        p4 = p4 + 1;
+                     }
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     fill the buffer to be sent to bottom neighbors: first two k-layers
+//---------------------------------------------------------------------
+         if (cell_coord(3,c)!= 1) {
+            for (k = 0; k <= 1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        out_buffer(ss(5)+p5) = u(m,i,j,k,c);
+                        p5 = p5 + 1;
+                     }
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     end of cell loop (packing)
+//---------------------------------------------------------------------
+      }
+
+      for (phase = 0; phase < 3; phase++) {   // each send/recv below runs only in the phase equal to its color
+
+      if (send_color[WESTDIR]==phase)  {   // west segment goes to predecessor(1)
+        RCCE_send((char*)(&out_buffer(ss(1))), b_size(1)*sizeof(double), predecessor(1));
+      }
+      if (recv_color[WESTDIR]==phase)  {   // matching receive fills the east segment from successor(1)
+        RCCE_recv((char*)(&in_buffer(sr(0))),  b_size(0)*sizeof(double), successor(1));
+      }
+
+      if (send_color[EASTDIR]==phase)  {   // east segment goes to successor(1)
+        RCCE_send((char*)(&out_buffer(ss(0))), b_size(0)*sizeof(double), successor(1));
+      }
+      if (recv_color[EASTDIR]==phase)  {   // matching receive fills the west segment from predecessor(1)
+        RCCE_recv((char*)(&in_buffer(sr(1))),  b_size(1)*sizeof(double), predecessor(1));
+      }
+
+      if (send_color[SOUTHDIR]==phase)  {   // south segment goes to predecessor(2)
+        RCCE_send((char*)(&out_buffer(ss(3))), b_size(3)*sizeof(double), predecessor(2));
+      }
+      if (recv_color[SOUTHDIR]==phase)  {   // matching receive fills the north segment from successor(2)
+        RCCE_recv((char*)(&in_buffer(sr(2))),  b_size(2)*sizeof(double), successor(2));
+      }
+
+      if (send_color[NORTHDIR]==phase)  {   // north segment goes to successor(2)
+        RCCE_send((char*)(&out_buffer(ss(2))), b_size(2)*sizeof(double),successor(2));
+      }
+      if (recv_color[NORTHDIR]==phase)  {   // matching receive fills the south segment from predecessor(2)
+        RCCE_recv((char*)(&in_buffer(sr(3))),  b_size(3)*sizeof(double), predecessor(2));
+      }
+
+      if (send_color[BOTTOMDIR]==phase)  {   // bottom segment goes to predecessor(3)
+        RCCE_send((char*)(&out_buffer(ss(5))), b_size(5)*sizeof(double),predecessor(3));
+      }
+      if (recv_color[BOTTOMDIR]==phase)  {   // matching receive fills the top segment from successor(3)
+        RCCE_recv((char*)(&in_buffer(sr(4))),  b_size(4)*sizeof(double), successor(3));
+      }
+
+      if (send_color[TOPDIR]==phase)  {   // top segment goes to successor(3)
+        RCCE_send((char*)(&out_buffer(ss(4))), b_size(4)*sizeof(double),successor(3));
+      }
+      if (recv_color[TOPDIR]==phase)  {   // matching receive fills the bottom segment from predecessor(3)
+        RCCE_recv((char*)(&in_buffer(sr(5))),  b_size(5)*sizeof(double), predecessor(3));
+      }
+   }      // end of phase loop
+
+//---------------------------------------------------------------------
+//     unpack the data that has just been received into the overlap layers of u
+//---------------------------------------------------------------------
+      p0 = 0;
+      p1 = 0;
+      p2 = 0;
+      p3 = 0;
+      p4 = 0;
+      p5 = 0;
+
+      for (c = 1; c <= ncells; c++) {
+
+         if (cell_coord(1,c) != 1) {   // west overlap: i = -2..-1, read from the sr(1) segment (offset p0)
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = -2; i <= -1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        u(m,i,j,k,c) = in_buffer(sr(1)+p0);
+                        p0 = p0 + 1;
+                     }
+                  }
+               }
+            }
+         }
+
+         if (cell_coord(1,c) != ncells) {   // east overlap: i = cell_size(1,c)..+1, read from the sr(0) segment (offset p1)
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = cell_size(1,c); i <= cell_size(1,c)+1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        u(m,i,j,k,c) = in_buffer(sr(0)+p1);
+                        p1 = p1 + 1;
+                     }
+                  }
+               }
+            }
+         }
+            
+         if (cell_coord(2,c) != 1) {   // south overlap: j = -2..-1, read from the sr(3) segment (offset p2)
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = -2; j <= -1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        u(m,i,j,k,c) = in_buffer(sr(3)+p2);
+                        p2 = p2 + 1;
+                     }
+                  }
+               }
+            }
+
+         }
+            
+         if (cell_coord(2,c) != ncells) {   // north overlap: j = cell_size(2,c)..+1, read from the sr(2) segment (offset p3)
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = cell_size(2,c); j <= cell_size(2,c)+1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        u(m,i,j,k,c) = in_buffer(sr(2)+p3);
+                        p3 = p3 + 1;
+                     }
+                  }
+               }
+            }
+         }
+
+         if (cell_coord(3,c) != 1) {   // bottom overlap: k = -2..-1, read from the sr(5) segment (offset p4)
+            for (k = -2; k <= -1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        u(m,i,j,k,c) = in_buffer(sr(5)+p4);
+                        p4 = p4 + 1;
+                     }
+                  }
+               }
+            }
+         }
+
+         if (cell_coord(3,c) != ncells) {   // top overlap: k = cell_size(3,c)..+1, read from the sr(4) segment (offset p5)
+            for (k = cell_size(3,c); k <= cell_size(3,c)+1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        u(m,i,j,k,c) = in_buffer(sr(4)+p5);
+                        p5 = p5 + 1;
+                     }
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     end of cell loop (unpacking)
+//---------------------------------------------------------------------
+      }
+
+//---------------------------------------------------------------------
+//     do the rest of the rhs that uses the copied face values          
+//---------------------------------------------------------------------
+      compute_rhs();
+
+      return;
+}

+ 78 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/define.c.svn-base

@@ -0,0 +1,78 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include "header.h"
+
+void compute_buffer_size(int dim) {
+
+//---------------------------------------------------------------------
+//     Sets the six face-buffer sizes (west/east, south/north,
+//     bottom/top) and the start offsets of the send and receive
+//     segments laid out in that order, the first one starting at 1.
+//
+//     Each cell contributes (product of its two in-face extents)*dim*2
+//     to a direction, except on the side where cell_coord is 1 (low
+//     side) or ncells (high side): that face is at the boundary of the
+//     grid and needs no buffer space.
+//
+//     dim: multiplier applied to every face size (presumably the number
+//          of values exchanged per grid point -- confirm against callers)
+//
+//     Nothing is modified when ncells == 1.
+//---------------------------------------------------------------------
+
+      int cell, offset;
+      int i_face, j_face, k_face;
+
+      if (ncells == 1) {
+         return;
+      }
+
+      west_size   = 0;
+      east_size   = 0;
+      north_size  = 0;
+      south_size  = 0;
+      top_size    = 0;
+      bottom_size = 0;
+
+//---------------------------------------------------------------------
+//     one pass over the cells accumulates all six sizes; the three
+//     directions are independent of each other
+//---------------------------------------------------------------------
+      for (cell = 1; cell <= ncells; cell++) {
+
+         i_face = cell_size(2,cell) * cell_size(3,cell) * dim * 2;
+         if (cell_coord(1,cell) != 1) {
+            west_size += i_face;
+         }
+         if (cell_coord(1,cell) != ncells) {
+            east_size += i_face;
+         }
+
+         j_face = cell_size(1,cell)*cell_size(3,cell) * dim * 2;
+         if (cell_coord(2,cell) != 1) {
+            south_size += j_face;
+         }
+         if (cell_coord(2,cell) != ncells) {
+            north_size += j_face;
+         }
+
+         k_face = cell_size(1,cell) * cell_size(2,cell) * dim * 2;
+         if (cell_coord(3,cell) != 1) {
+            bottom_size += k_face;
+         }
+         if (cell_coord(3,cell) != ncells) {
+            top_size += k_face;
+         }
+      }
+
+//---------------------------------------------------------------------
+//     send and receive segments share the same layout:
+//     west, east, south, north, bottom, top
+//---------------------------------------------------------------------
+      offset = 1;
+
+      start_send_west   = offset;
+      start_recv_west   = offset;
+      offset += west_size;
+
+      start_send_east   = offset;
+      start_recv_east   = offset;
+      offset += east_size;
+
+      start_send_south  = offset;
+      start_recv_south  = offset;
+      offset += south_size;
+
+      start_send_north  = offset;
+      start_recv_north  = offset;
+      offset += north_size;
+
+      start_send_bottom = offset;
+      start_recv_bottom = offset;
+      offset += bottom_size;
+
+      start_send_top    = offset;
+      start_recv_top    = offset;
+
+      return;
+}
+

+ 121 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/error.c.svn-base

@@ -0,0 +1,121 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include <math.h>
+#include "header.h"
+#include "mpinpb.h"
+#include "applu_macros.h"
+
+#define u_exact(m) u_exact[m-1]
+#define rms(m) rms[m-1]
+#define rms_work(m) rms_work[m-1]
+
+void error_norm(double rms[]) {
+
+//---------------------------------------------------------------------
+//     this function computes the norm of the difference between the
+//     computed solution and the exact solution
+//
+//     rms: output, 5 doubles (one per solution component). On return
+//          rms(m) is the square root of the summed squared differences
+//          after dividing by (grid_points(d)-2) for d = 1..3.
+//
+//     Each caller accumulates squared differences over its own cells
+//     into rms_work; RCCE_allreduce with RCCE_SUM over RCCE_COMM_WORLD
+//     then combines the partial sums into rms.
+//     NOTE(review): the return value of RCCE_allreduce is not checked.
+//---------------------------------------------------------------------
+
+      int c, i, j, k, m, ii, jj, kk, d;
+      double xi, eta, zeta, u_exact[5], rms_work[5],
+           add;
+
+      for (m = 1; m <= 5; m++) {
+         rms_work(m) = 0.0e0;
+      }
+
+//---------------------------------------------------------------------
+//     i,j,k run over the cell_low..cell_high range and give the
+//     coordinates passed to exact_solution; ii,jj,kk are the matching
+//     0-based indices into u for this cell
+//---------------------------------------------------------------------
+      for (c = 1; c <= ncells; c++) {
+         kk = 0;
+         for (k = cell_low(3,c); k <= cell_high(3,c); k++) {
+            zeta = (double)(k) * dnzm1;
+            jj = 0;
+            for (j = cell_low(2,c); j <= cell_high(2,c); j++) {
+               eta = (double)(j) * dnym1;
+               ii = 0;
+               for (i = cell_low(1,c); i <= cell_high(1,c); i++) {
+                  xi = (double)(i) * dnxm1;
+                  exact_solution(xi, eta, zeta, u_exact);
+
+                  for (m = 1; m <= 5; m++) {
+                     add = u(m,ii,jj,kk,c)-u_exact(m);
+                     rms_work(m) = rms_work(m) + add*add;
+                  }
+                  ii = ii + 1;
+               }
+               jj = jj + 1;
+            }
+            kk = kk + 1;
+         }
+      }
+
+      RCCE_allreduce((char*)rms_work, (char*)rms, 5, RCCE_DOUBLE, RCCE_SUM, RCCE_COMM_WORLD);
+
+//---------------------------------------------------------------------
+//     normalize by the (grid_points-2) extent in each direction, one
+//     division at a time, then take the square root
+//---------------------------------------------------------------------
+      for (m = 1; m <= 5; m++) {
+         for (d = 1; d <= 3; d++) {
+            rms(m) = rms(m) / (double)(grid_points(d)-2);
+         }
+         rms(m) = sqrt(rms(m));
+      }
+
+      return;
+}
+
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+void rhs_norm(double rms[]) {
+
+//---------------------------------------------------------------------
+//     this function computes a norm of rhs, one value per component
+//
+//     rms: output, 5 doubles. On return rms(m) is the square root of
+//          the summed squares of rhs(m,...) after dividing by
+//          (grid_points(d)-2) for d = 1..3.
+//
+//     Each caller sums rhs*rhs over its own cells, using the index
+//     range start(d,c) .. cell_size(d,c)-end(d,c)-1 in every direction;
+//     RCCE_allreduce with RCCE_SUM over RCCE_COMM_WORLD then combines
+//     the partial sums into rms.
+//     NOTE(review): the return value of RCCE_allreduce is not checked.
+//---------------------------------------------------------------------
+
+      int c, i, j, k, d, m;
+      double rms_work[5], add;
+
+      for (m = 1; m <= 5; m++) {
+         rms_work(m) = 0.0e0;
+      }
+
+      for (c = 1; c <= ncells; c++) {
+         for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     add = rhs(m,i,j,k,c);
+                     rms_work(m) = rms_work(m) + add*add;
+                  }
+               }
+            }
+         }
+      }
+
+      RCCE_allreduce((char*)rms_work, (char*)rms, 5, RCCE_DOUBLE, RCCE_SUM, RCCE_COMM_WORLD);
+
+//---------------------------------------------------------------------
+//     normalize by the (grid_points-2) extent in each direction, one
+//     division at a time, then take the square root
+//---------------------------------------------------------------------
+      for (m = 1; m <= 5; m++) {
+         for (d = 1; d <= 3; d++) {
+            rms(m) = rms(m) / (double)(grid_points(d)-2);
+         }
+         rms(m) = sqrt(rms(m));
+      }
+
+      return;
+}
+

+ 375 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/exact_rhs.c.svn-base

@@ -0,0 +1,375 @@
+
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include "header.h"
+
+void exact_rhs() {
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+//---------------------------------------------------------------------
+//     compute the right hand side based on exact solution
+//---------------------------------------------------------------------
+
+      double dtemp[5], xi, eta, zeta, dtpp;
+      int          c, m, i, j, k, ip1, im1, jp1, 
+           jm1, km1, kp1;
+#define dtemp(m) dtemp[m-1]
+
+
+//---------------------------------------------------------------------
+//     loop over all cells owned by this node                   
+//---------------------------------------------------------------------
+      for (c = 1; c <= ncells; c++) {
+
+//---------------------------------------------------------------------
+//     initialize                                  
+//---------------------------------------------------------------------
+         for (k = 0; k <= cell_size(3,c)-1; k++) {
+            for (j = 0; j <= cell_size(2,c)-1; j++) {
+               for (i = 0; i <= cell_size(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     forcing(m,i,j,k,c) = 0.0e0;
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     xi-direction flux differences                      
+//---------------------------------------------------------------------
+         for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+            zeta = (double)(k+cell_low(3,c)) * dnzm1;
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               eta = (double)(j+cell_low(2,c)) * dnym1;
+
+               for (i = -2*(1-start(1,c)); i <= cell_size(1,c)+1-2*end(1,c); i++) {
+                  xi = (double)(i+cell_low(1,c)) * dnxm1;
+
+                  exact_solution(xi, eta, zeta, dtemp);
+                  for (m = 1; m <= 5; m++) {
+                     ue(i,m) = dtemp(m);
+                  }
+
+                  dtpp = 1.0e0 / dtemp(1);
+
+                  for (m = 2; m <= 5; m++) {
+                     buf(i,m) = dtpp * dtemp(m);
+                  }
+
+                  cuf(i)   = buf(i,2) * buf(i,2);
+                  buf(i,1) = cuf(i) + buf(i,3) * buf(i,3) + 
+                       buf(i,4) * buf(i,4) ;
+                  q(i) = 0.5e0*(buf(i,2)*ue(i,2) + buf(i,3)*ue(i,3) +
+                       buf(i,4)*ue(i,4));
+
+               }
+               
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  im1 = i-1;
+                  ip1 = i+1;
+
+                  forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
+                       tx2*( ue(ip1,2)-ue(im1,2) )+
+                       dx1tx1*(ue(ip1,1)-2.0e0*ue(i,1)+ue(im1,1));
+
+                  forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tx2 * (
+                       (ue(ip1,2)*buf(ip1,2)+c2*(ue(ip1,5)-q(ip1)))-
+                       (ue(im1,2)*buf(im1,2)+c2*(ue(im1,5)-q(im1))))+
+                       xxcon1*(buf(ip1,2)-2.0e0*buf(i,2)+buf(im1,2))+
+                       dx2tx1*( ue(ip1,2)-2.0e0* ue(i,2)+ue(im1,2));
+
+                  forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tx2 * (
+                       ue(ip1,3)*buf(ip1,2)-ue(im1,3)*buf(im1,2))+
+                       xxcon2*(buf(ip1,3)-2.0e0*buf(i,3)+buf(im1,3))+
+                       dx3tx1*( ue(ip1,3)-2.0e0*ue(i,3) +ue(im1,3));
+                  
+                  forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tx2*(
+                       ue(ip1,4)*buf(ip1,2)-ue(im1,4)*buf(im1,2))+
+                       xxcon2*(buf(ip1,4)-2.0e0*buf(i,4)+buf(im1,4))+
+                       dx4tx1*( ue(ip1,4)-2.0e0* ue(i,4)+ ue(im1,4));
+
+                  forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tx2*(
+                       buf(ip1,2)*(c1*ue(ip1,5)-c2*q(ip1))-
+                       buf(im1,2)*(c1*ue(im1,5)-c2*q(im1)))+
+                       0.5e0*xxcon3*(buf(ip1,1)-2.0e0*buf(i,1)+
+                       buf(im1,1))+
+                       xxcon4*(cuf(ip1)-2.0e0*cuf(i)+cuf(im1))+
+                       xxcon5*(buf(ip1,5)-2.0e0*buf(i,5)+buf(im1,5))+
+                       dx5tx1*( ue(ip1,5)-2.0e0* ue(i,5)+ ue(im1,5));
+               }
+
+//---------------------------------------------------------------------
+//     Fourth-order dissipation                         
+//---------------------------------------------------------------------
+               if (start(1,c) > 0) {
+                  for (m = 1; m <= 5; m++) {
+                     i = 1;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (5.0e0*ue(i,m) - 4.0e0*ue(i+1,m) +ue(i+2,m));
+                     i = 2;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (-4.0e0*ue(i-1,m) + 6.0e0*ue(i,m) -
+                          4.0e0*ue(i+1,m) +       ue(i+2,m));
+                  }
+               }
+
+               for (i = start(1,c)*3; i <= cell_size(1,c)-3*end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
+                          (ue(i-2,m) - 4.0e0*ue(i-1,m) +
+                          6.0e0*ue(i,m) - 4.0e0*ue(i+1,m) + ue(i+2,m));
+                  }
+               }
+
+               if (end(1,c) > 0) {
+                  for (m = 1; m <= 5; m++) {
+                     i = cell_size(1,c)-3;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (ue(i-2,m) - 4.0e0*ue(i-1,m) +
+                          6.0e0*ue(i,m) - 4.0e0*ue(i+1,m));
+                     i = cell_size(1,c)-2;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (ue(i-2,m) - 4.0e0*ue(i-1,m) + 5.0e0*ue(i,m));
+                  }
+               }
+
+            }
+         }
+
+//---------------------------------------------------------------------
+//     eta-direction flux differences             
+//---------------------------------------------------------------------
+         for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+            zeta = (double)(k+cell_low(3,c)) * dnzm1;
+            for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+               xi = (double)(i+cell_low(1,c)) * dnxm1;
+
+               for (j = -2*(1-start(2,c)); j <= cell_size(2,c)+1-2*end(2,c); j++) {
+                  eta = (double)(j+cell_low(2,c)) * dnym1;
+
+                  exact_solution(xi, eta, zeta, dtemp);
+                  for (m = 1; m <= 5; m++) {
+                     ue(j,m) = dtemp(m);
+                  }
+                  
+                  dtpp = 1.0e0/dtemp(1);
+
+                  for (m = 2; m <= 5; m++) {
+                     buf(j,m) = dtpp * dtemp(m);
+                  }
+
+                  cuf(j)   = buf(j,3) * buf(j,3);
+                  buf(j,1) = cuf(j) + buf(j,2) * buf(j,2) + 
+                       buf(j,4) * buf(j,4);
+                  q(j) = 0.5e0*(buf(j,2)*ue(j,2) + buf(j,3)*ue(j,3) +
+                       buf(j,4)*ue(j,4));
+               }
+
+               for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+                  jm1 = j-1;
+                  jp1 = j+1;
+                  
+                  forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
+                       ty2*( ue(jp1,3)-ue(jm1,3) )+
+                       dy1ty1*(ue(jp1,1)-2.0e0*ue(j,1)+ue(jm1,1));
+
+                  forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - ty2*(
+                       ue(jp1,2)*buf(jp1,3)-ue(jm1,2)*buf(jm1,3))+
+                       yycon2*(buf(jp1,2)-2.0e0*buf(j,2)+buf(jm1,2))+
+                       dy2ty1*( ue(jp1,2)-2.0* ue(j,2)+ ue(jm1,2));
+
+                  forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - ty2*(
+                       (ue(jp1,3)*buf(jp1,3)+c2*(ue(jp1,5)-q(jp1)))-
+                       (ue(jm1,3)*buf(jm1,3)+c2*(ue(jm1,5)-q(jm1))))+
+                       yycon1*(buf(jp1,3)-2.0e0*buf(j,3)+buf(jm1,3))+
+                       dy3ty1*( ue(jp1,3)-2.0e0*ue(j,3) +ue(jm1,3));
+
+                  forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - ty2*(
+                       ue(jp1,4)*buf(jp1,3)-ue(jm1,4)*buf(jm1,3))+
+                       yycon2*(buf(jp1,4)-2.0e0*buf(j,4)+buf(jm1,4))+
+                       dy4ty1*( ue(jp1,4)-2.0e0*ue(j,4)+ ue(jm1,4));
+
+                  forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - ty2*(
+                       buf(jp1,3)*(c1*ue(jp1,5)-c2*q(jp1))-
+                       buf(jm1,3)*(c1*ue(jm1,5)-c2*q(jm1)))+
+                       0.5e0*yycon3*(buf(jp1,1)-2.0e0*buf(j,1)+
+                       buf(jm1,1))+
+                       yycon4*(cuf(jp1)-2.0e0*cuf(j)+cuf(jm1))+
+                       yycon5*(buf(jp1,5)-2.0e0*buf(j,5)+buf(jm1,5))+
+                       dy5ty1*(ue(jp1,5)-2.0e0*ue(j,5)+ue(jm1,5));
+               }
+
+//---------------------------------------------------------------------
+//     Fourth-order dissipation                      
+//---------------------------------------------------------------------
+               if (start(2,c) > 0) {
+                  for (m = 1; m <= 5; m++) {
+                     j = 1;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (5.0e0*ue(j,m) - 4.0e0*ue(j+1,m) +ue(j+2,m));
+                     j = 2;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (-4.0e0*ue(j-1,m) + 6.0e0*ue(j,m) -
+                          4.0e0*ue(j+1,m) +       ue(j+2,m));
+                  }
+               }
+
+               for (j = start(2,c)*3; j <= cell_size(2,c)-3*end(2,c)-1; j++) {
+                  for (m = 1; m <= 5; m++) {
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
+                          (ue(j-2,m) - 4.0e0*ue(j-1,m) +
+                          6.0e0*ue(j,m) - 4.0e0*ue(j+1,m) + ue(j+2,m));
+                  }
+               }
+
+               if (end(2,c) > 0) {
+                  for (m = 1; m <= 5; m++) {
+                     j = cell_size(2,c)-3;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (ue(j-2,m) - 4.0e0*ue(j-1,m) +
+                          6.0e0*ue(j,m) - 4.0e0*ue(j+1,m));
+                     j = cell_size(2,c)-2;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (ue(j-2,m) - 4.0e0*ue(j-1,m) + 5.0e0*ue(j,m));
+
+                  }
+               }
+
+            }
+         }
+
+//---------------------------------------------------------------------
+//     zeta-direction flux differences                      
+//---------------------------------------------------------------------
+         for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+            eta = (double)(j+cell_low(2,c)) * dnym1;
+            for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+               xi = (double)(i+cell_low(1,c)) * dnxm1;
+
+               for (k = -2*(1-start(3,c)); k <= cell_size(3,c)+1-2*end(3,c); k++) {
+                  zeta = (double)(k+cell_low(3,c)) * dnzm1;
+
+                  exact_solution(xi, eta, zeta, dtemp);
+                  for (m = 1; m <= 5; m++) {
+                     ue(k,m) = dtemp(m);
+                  }
+
+                  dtpp = 1.0e0/dtemp(1);
+
+                  for (m = 2; m <= 5; m++) {
+                     buf(k,m) = dtpp * dtemp(m);
+                  }
+
+                  cuf(k)   = buf(k,4) * buf(k,4);
+                  buf(k,1) = cuf(k) + buf(k,2) * buf(k,2) + 
+                       buf(k,3) * buf(k,3);
+                  q(k) = 0.5e0*(buf(k,2)*ue(k,2) + buf(k,3)*ue(k,3) +
+                       buf(k,4)*ue(k,4));
+               }
+
+               for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+                  km1 = k-1;
+                  kp1 = k+1;
+                  
+                  forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
+                       tz2*( ue(kp1,4)-ue(km1,4) )+
+                       dz1tz1*(ue(kp1,1)-2.0e0*ue(k,1)+ue(km1,1));
+
+                  forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tz2 * (
+                       ue(kp1,2)*buf(kp1,4)-ue(km1,2)*buf(km1,4))+
+                       zzcon2*(buf(kp1,2)-2.0e0*buf(k,2)+buf(km1,2))+
+                       dz2tz1*( ue(kp1,2)-2.0e0* ue(k,2)+ ue(km1,2));
+
+                  forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tz2 * (
+                       ue(kp1,3)*buf(kp1,4)-ue(km1,3)*buf(km1,4))+
+                       zzcon2*(buf(kp1,3)-2.0e0*buf(k,3)+buf(km1,3))+
+                       dz3tz1*(ue(kp1,3)-2.0e0*ue(k,3)+ue(km1,3));
+
+                  forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tz2 * (
+                       (ue(kp1,4)*buf(kp1,4)+c2*(ue(kp1,5)-q(kp1)))-
+                       (ue(km1,4)*buf(km1,4)+c2*(ue(km1,5)-q(km1))))+
+                       zzcon1*(buf(kp1,4)-2.0e0*buf(k,4)+buf(km1,4))+
+                       dz4tz1*( ue(kp1,4)-2.0e0*ue(k,4) +ue(km1,4));
+
+                  forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tz2 * (
+                       buf(kp1,4)*(c1*ue(kp1,5)-c2*q(kp1))-
+                       buf(km1,4)*(c1*ue(km1,5)-c2*q(km1)))+
+                       0.5e0*zzcon3*(buf(kp1,1)-2.0e0*buf(k,1)
+                       +buf(km1,1))+
+                       zzcon4*(cuf(kp1)-2.0e0*cuf(k)+cuf(km1))+
+                       zzcon5*(buf(kp1,5)-2.0e0*buf(k,5)+buf(km1,5))+
+                       dz5tz1*( ue(kp1,5)-2.0e0*ue(k,5)+ ue(km1,5));
+               }
+
+//---------------------------------------------------------------------
+//     Fourth-order dissipation                        
+//---------------------------------------------------------------------
+               if (start(3,c) > 0) {
+                  for (m = 1; m <= 5; m++) {
+                     k = 1;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (5.0e0*ue(k,m) - 4.0e0*ue(k+1,m) +ue(k+2,m));
+                     k = 2;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (-4.0e0*ue(k-1,m) + 6.0e0*ue(k,m) -
+                          4.0e0*ue(k+1,m) +       ue(k+2,m));
+                  }
+               }
+
+               for (k = start(3,c)*3; k <= cell_size(3,c)-3*end(3,c)-1; k++) {
+                  for (m = 1; m <= 5; m++) {
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
+                          (ue(k-2,m) - 4.0e0*ue(k-1,m) +
+                          6.0e0*ue(k,m) - 4.0e0*ue(k+1,m) + ue(k+2,m));
+                  }
+               }
+
+               if (end(3,c) > 0) {
+                  for (m = 1; m <= 5; m++) {
+                     k = cell_size(3,c)-3;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (ue(k-2,m) - 4.0e0*ue(k-1,m) +
+                          6.0e0*ue(k,m) - 4.0e0*ue(k+1,m));
+                     k = cell_size(3,c)-2;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (ue(k-2,m) - 4.0e0*ue(k-1,m) + 5.0e0*ue(k,m));
+                  }
+               }
+
+            }
+         }
+
+//---------------------------------------------------------------------
+//     now change the sign of the forcing function, 
+//---------------------------------------------------------------------
+         for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     forcing(m,i,j,k,c) = -1.e0 * forcing(m,i,j,k,c);
+                  }
+               }
+            }
+         }
+
+      }
+
+      return;
+}

+ 43 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/exact_solution.c.svn-base

@@ -0,0 +1,43 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include "header.h"
+
+void exact_solution(double xi,double eta,double zeta,double dtemp[]) {
+
+//---------------------------------------------------------------------
+//     Evaluate the exact solution at the point (xi, eta, zeta).
+//
+//     dtemp receives five values, dtemp[0] .. dtemp[4]. Component m is
+//     ce(m,1) plus one quartic (without constant term) in each of xi,
+//     eta and zeta; the coefficients are read from the global table ce.
+//---------------------------------------------------------------------
+
+#define dtemp(m) dtemp[m-1]
+
+      int comp = 1;
+
+      while (comp <= 5) {
+         // The terms are summed left to right in the order
+         // constant, xi, eta, zeta.
+         dtemp(comp) = ce(comp,1)
+           + xi  *(ce(comp,2) + xi  *(ce(comp,5) + xi  *(ce(comp,8)  + xi  *ce(comp,11))))
+           + eta *(ce(comp,3) + eta *(ce(comp,6) + eta *(ce(comp,9)  + eta *ce(comp,12))))
+           + zeta*(ce(comp,4) + zeta*(ce(comp,7) + zeta*(ce(comp,10) + zeta*ce(comp,13))));
+         comp++;
+      }
+}
+
+

+ 287 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/header.h.svn-base

@@ -0,0 +1,287 @@
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+//
+//  header.h
+//
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+#ifndef __HEADER_H
+#define __HEADER_H
+// Shared header for the BT sources: size constants, index macros, globals, prototypes.
+//---------------------------------------------------------------------
+// The following include file is generated automatically by the
+// "setparams" utility. It defines (used below in upper case, e.g.
+//      maxcells:      the square root of the maximum number of processors
+//      problem_size:  12, 64, 102, 162 (for class T, A, B, C)
+//      dt_default:    default time step for this problem size if no config file
+//      niter_default: default number of iterations for this problem size
+// MAXCELLS, PROBLEM_SIZE and NITER_DEFAULT).
+//---------------------------------------------------------------------
+
+#include "npbparams.h"
+#include "RCCE.h"
+// Rename the identifier `class` to `_class_` in everything that follows (for
+// example the `class` parameter of verify()), for compilers that reserve the word.
+#define class _class_
+#include "../common/common.h"
+// NOTE(review): AA/BB/CC look like selectors for three coefficient blocks, BLOCK_SIZE for the 5 components - confirm
+#define AA 0
+#define BB 1
+#define CC 2
+#define BLOCK_SIZE 5
+// NOTE(review): EAST..TOP look like per-direction message tags - confirm against the communication code
+#define EAST   2000
+#define WEST   3000
+#define NORTH  4000
+#define SOUTH  5000
+#define BOTTOM 6000
+#define TOP    7000
+// Direction numbers 0..5; NOTE(review): presumably indices into send_color/recv_color (6 entries each) - confirm
+#define WESTDIR   0
+#define EASTDIR   1
+#define SOUTHDIR  2
+#define NORTHDIR  3
+#define BOTTOMDIR 4
+#define TOPDIR    5
+// Per-direction bound used to size the cell arrays; the same bound serves all three directions.
+#define MAX_CELL_DIM ((PROBLEM_SIZE/MAXCELLS)+1)
+#define IMAX MAX_CELL_DIM
+#define JMAX MAX_CELL_DIM
+#define KMAX MAX_CELL_DIM
+// Element count of in_buffer and out_buffer.
+#define BUF_SIZE (MAX_CELL_DIM*MAX_CELL_DIM*(MAXCELLS-1)*60+1)
+
+// Square of x. The whole expansion is parenthesized so the macro acts as a
+// single operand (a/SQR(b) divides by b*b). x is evaluated twice, so do not
+// pass an expression with side effects.
+#define SQR(x) ((x)*(x))
+
+#define grid_points(m) grid_points[m-1]  // accessor macros: name(args) indexes the flat global array of the same name; 1-based args are shifted by -1
+#define ce(m,n) ce[(m-1)+5*(n-1)]  // 5 x 13 table, first index varies fastest
+#define cell_coord(m,n) cell_coord[(m-1)+3*(n-1)]  // (direction 1..3, cell 1..MAXCELLS); same layout for the next entries of this shape
+#define cell_low(m,n) cell_low[(m-1)+3*(n-1)]
+#define cell_high(m,n) cell_high[(m-1)+3*(n-1)]
+#define cell_size(m,n) cell_size[(m-1)+3*(n-1)]
+#define predecessor(m) predecessor[m-1]
+#define slice(m,n) slice[(m-1)+3*(n-1)]
+#define grid_size(m) grid_size[m-1]
+#define successor(m) successor[m-1]
+#define start(m,n) start[(m-1)+3*(n-1)]
+#define end(m,n) end[(m-1)+3*(n-1)]
+#define us(i,j,k,c) us[(i+1)+(IMAX+2)*((j+1)+(JMAX+2)*((k+1)+(KMAX+2)*(c-1)))]  // i,j,k shifted by +1, so -1 is the lowest valid index; c is 1-based
+#define vs(i,j,k,c) vs[(i+1)+(IMAX+2)*((j+1)+(JMAX+2)*((k+1)+(KMAX+2)*(c-1)))]
+#define ws(i,j,k,c) ws[(i+1)+(IMAX+2)*((j+1)+(JMAX+2)*((k+1)+(KMAX+2)*(c-1)))]
+#define qs(i,j,k,c) qs[(i+1)+(IMAX+2)*((j+1)+(JMAX+2)*((k+1)+(KMAX+2)*(c-1)))]
+#define rho_i(i,j,k,c) rho_i[(i+1)+(IMAX+2)*((j+1)+(JMAX+2)*((k+1)+(KMAX+2)*(c-1)))]
+#define square(i,j,k,c) square[(i+1)+(IMAX+2)*((j+1)+(JMAX+2)*((k+1)+(KMAX+2)*(c-1)))]
+#define forcing(m,i,j,k,c) forcing[(m-1)+5*(i+IMAX*(j+JMAX*(k+KMAX*(c-1))))]  // i,j,k used without shift: lowest valid index is 0
+#define u(m,i,j,k,c) u[(m-1)+5*((i+2)+(IMAX+4)*((j+2)+(JMAX+4)*((k+2)+(KMAX+4)*(c-1))))]  // i,j,k shifted by +2, so -2 is the lowest valid index
+#define rhs(m,i,j,k,c) rhs[(m-1)+5*((i+1)+(IMAX+1)*((j+1)+(JMAX+1)*((k+1)+(KMAX+1)*(c-1))))]  // i,j,k shifted by +1
+#define lhsc(m,n,i,j,k,c) lhsc[(m-1)+5*((n-1)+5*((i+1)+(IMAX+1)*((j+1)+(JMAX+1)*((k+1)+(KMAX+1)*(c-1)))))]  // 5x5 block (m,n) per point; i,j,k shifted by +1
+#define backsub_info(m,i,j,c) backsub_info[(m-1)+5*((i)+(IMAX+1)*((j)+(JMAX+1)*(c-1)))]  // i,j used without shift
+#define in_buffer(i) in_buffer[i-1]
+#define out_buffer(i) out_buffer[i-1]
+#define cv(m) cv[m+2]  // 1-D work arrays: m shifted by +2, so -2 is the lowest valid index
+#define rhon(m) rhon[m+2]
+#define rhos(m) rhos[m+2]
+#define rhoq(m) rhoq[m+2]
+#define cuf(m) cuf[m+2]
+#define q(m) q[m+2]
+#define ue(m,n) ue[(m+2)+(MAX_CELL_DIM+4)*(n-1)]  // m shifted by +2, n is 1-based; row length MAX_CELL_DIM+4
+#define buf(m,n) buf[(m+2)+(MAX_CELL_DIM+4)*(n-1)]
+#define sum(m) sum[m-1]
+#define xce_sub(m) xce_sub[m-1]  // arguments of all these macros are substituted unparenthesized: pass simple index expressions only
+
+
+#ifdef G_MAIN  // a translation unit that defines G_MAIN before including this header gets the definitions below
+      int     ncells, grid_points[3];
+      double  elapsed_time;
+
+      double  tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, 
+                        dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, 
+                        dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, 
+                        ce[5*13], dxmax, dymax, dzmax, xxcon1, xxcon2, 
+                        xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
+                        dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
+                        yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
+                        zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, 
+                        dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, 
+                        dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, 
+                        c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
+                        dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, 
+                        c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, 
+                        c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16;
+
+      int     cell_coord[MAXCELLS*3], cell_low[MAXCELLS*3], 
+              cell_high[MAXCELLS*3],  cell_size[MAXCELLS*3],
+              predecessor[3],         slice[MAXCELLS*3],
+              grid_size[3],           successor[3],
+              start[MAXCELLS*3],      end[MAXCELLS*3];
+
+      double 
+         us      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         vs      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         ws      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         qs      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         rho_i   [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         square  [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         forcing [5*IMAX*JMAX*KMAX*MAXCELLS],
+         u       [5*(IMAX+4)*(JMAX+4)*(KMAX+4)*MAXCELLS],
+         rhs     [5*(IMAX+1)*(JMAX+1)*(KMAX+1)*MAXCELLS],
+         lhsc    [5*5*(IMAX+1)*(JMAX+1)*(KMAX+1)*MAXCELLS],
+         backsub_info [5*(MAX_CELL_DIM+1)*(MAX_CELL_DIM+1)*MAXCELLS],
+         in_buffer[BUF_SIZE], out_buffer[BUF_SIZE];
+
+      double cv[MAX_CELL_DIM+4],   rhon[MAX_CELL_DIM+4],  // 1-D work arrays; the extra 4 entries pair with the +2 shift of the cv()/rhon()/... macros
+             rhos[MAX_CELL_DIM+4], rhoq[MAX_CELL_DIM+4],
+             cuf[MAX_CELL_DIM+4],  q[MAX_CELL_DIM+4],
+             ue[(MAX_CELL_DIM+4)*5], buf[(MAX_CELL_DIM+4)*5];
+
+      int  west_size, east_size, bottom_size, top_size,
+               north_size, south_size, start_send_west, 
+               start_send_east, start_send_south, start_send_north,
+               start_send_bottom, start_send_top, start_recv_west,
+               start_recv_east, start_recv_south, start_recv_north,
+               start_recv_bottom, start_recv_top;
+//
+//     These are used by btio
+//
+      int collbuf_nodes, collbuf_size, iosize,
+              idump, record_length,
+              idump_sub, rd_interval;
+      double sum[NITER_DEFAULT], xce_sub[5];
+      long int iseek;
+      int    send_color[6], recv_color[6];
+#else  // without G_MAIN: extern declarations that mirror the definitions above one for one
+extern int     ncells, grid_points[3];
+extern double  elapsed_time;
+
+extern double  tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, 
+                        dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, 
+                        dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, 
+                        ce[5*13], dxmax, dymax, dzmax, xxcon1, xxcon2, 
+                        xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
+                        dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
+                        yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
+                        zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, 
+                        dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, 
+                        dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, 
+                        c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
+                        dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, 
+                        c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, 
+                        c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16;
+
+extern int    cell_coord[MAXCELLS*3], cell_low[MAXCELLS*3], 
+              cell_high[MAXCELLS*3],  cell_size[MAXCELLS*3],
+              predecessor[3],         slice[MAXCELLS*3],
+              grid_size[3],           successor[3],
+              start[MAXCELLS*3],      end[MAXCELLS*3];
+
+extern double 
+         us      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         vs      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         ws      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         qs      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         rho_i   [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         square  [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         forcing [5*IMAX*JMAX*KMAX*MAXCELLS],
+         u       [5*(IMAX+4)*(JMAX+4)*(KMAX+4)*MAXCELLS],
+         rhs     [5*(IMAX+1)*(JMAX+1)*(KMAX+1)*MAXCELLS],
+         lhsc    [5*5*(IMAX+1)*(JMAX+1)*(KMAX+1)*MAXCELLS],
+         backsub_info [5*(MAX_CELL_DIM+1)*(MAX_CELL_DIM+1)*MAXCELLS],
+         in_buffer[BUF_SIZE], out_buffer[BUF_SIZE];
+
+extern double cv[MAX_CELL_DIM+4],   rhon[MAX_CELL_DIM+4],
+             rhos[MAX_CELL_DIM+4], rhoq[MAX_CELL_DIM+4],
+             cuf[MAX_CELL_DIM+4],  q[MAX_CELL_DIM+4],
+             ue[(MAX_CELL_DIM+4)*5], buf[(MAX_CELL_DIM+4)*5];
+
+extern int  west_size, east_size, bottom_size, top_size,
+               north_size, south_size, start_send_west, 
+               start_send_east, start_send_south, start_send_north,
+               start_send_bottom, start_send_top, start_recv_west,
+               start_recv_east, start_recv_south, start_recv_north,
+               start_recv_bottom, start_recv_top;
+
+//
+//     These are used by btio
+//
+extern int collbuf_nodes, collbuf_size, iosize,
+              idump, record_length,
+              idump_sub, rd_interval;
+extern double sum[NITER_DEFAULT], xce_sub[5];
+extern long int iseek;
+extern int    send_color[6], recv_color[6];
+
+#endif /*G_MAIN*/
+
+extern void matvec_sub(double ablock[], double avec[], double bvec[]);  // prototypes shared by the BT sources; all arrays are passed as flat double[]
+extern void matmul_sub(double ablock[], double bblock[], double cblock[]);
+extern void binvcrhs( double lhs[], double c[], double r[] );
+extern void binvrhs( double lhs[], double r[] );
+extern void exact_solution(double xi,double eta,double zeta,double dtemp[]);  // writes five values into dtemp
+
+extern int setup_mpi(int *argc, char ***argv);
+extern void make_set(void);
+extern void set_constants(void);
+extern void lhsinit(void);  // sets start/end for every cell and zeroes lhsc
+extern void lhsabinit(double lhsa[], double lhsb[], int size);  // zeroes lhsa and lhsb, then sets lhsb's diagonal to 1
+extern void initialize(void);  // fills the field variable u
+extern void exact_rhs(void);
+extern void compute_buffer_size(int c);
+extern void adi(void);
+extern void compute_rhs(void);
+extern void copy_faces(void);
+extern void x_solve(void);
+extern void y_solve(void);
+extern void z_solve(void);
+extern void add(void);
+extern void verify(int niter, char *class, int *verified);  // `class` is rewritten to `_class_` by the #define near the top of this header
+extern void error_norm(double rms[]);
+extern void rhs_norm(double rms[]);
+
+extern void setup_btio(void);
+extern void output_timestep(void);
+extern void btio_cleanup(void);
+extern void btio_verify(int *verified);
+extern void accumulate_norms(double xce[]);
+extern void clear_timestep(void);
+
+#endif  // __HEADER_H
+
+#ifdef _OPENMP  // NOTE: this section comes after the include guard's #endif, so the guard does not cover it
+#pragma omp threadprivate (cell_coord, cell_low, cell_high,  cell_size)
+#pragma omp threadprivate (predecessor, slice, grid_size, successor)
+#pragma omp threadprivate (start, end)
+// threadprivate: each OpenMP thread works on its own copy of the listed globals
+#pragma omp threadprivate (ncells, grid_points, elapsed_time)
+#pragma omp threadprivate (tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, \
+                           dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, \
+                           dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, \
+                           ce, dxmax, dymax, dzmax, xxcon1, xxcon2, \
+                           xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1, \
+                           dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4, \
+                           yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1, \
+                           zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, \
+                           dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, \
+                           dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, \
+                           c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt, \
+                           dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, \
+                           c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, \
+                           c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16)
+
+#pragma omp threadprivate (us, vs, ws, qs, rho_i, square, forcing, \
+                           u, rhs, lhsc, backsub_info, in_buffer, out_buffer)
+
+#pragma omp threadprivate (cv, rhon, rhos, rhoq, cuf, q, ue, buf)
+
+#pragma omp threadprivate (west_size, east_size, bottom_size, top_size, \
+                           north_size, south_size, start_send_west, \
+                           start_send_east, start_send_south, start_send_north, \
+                           start_send_bottom, start_send_top, start_recv_west, \
+                           start_recv_east, start_recv_south, start_recv_north, \
+                           start_recv_bottom, start_recv_top, send_color, recv_color)
+//
+//     These are used by btio
+//
+#pragma omp threadprivate (collbuf_nodes, collbuf_size, iosize, idump,\
+                           record_length, idump_sub, rd_interval, \
+                           sum, xce_sub, iseek)
+#endif

+ 321 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/initialize.c.svn-base

@@ -0,0 +1,321 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include "header.h"
+
+void  initialize() {
+
+//---------------------------------------------------------------------
+//     Fill the field variable u for every cell 1..ncells in three passes:
+//     (1) set all components to 1.0 over local indices -1..IMAX/JMAX/KMAX,
+//     (2) store values obtained by tri-linear transfinite interpolation
+//         of the exact solution on the faces of the unit cube,
+//     (3) overwrite the six boundary faces with the exact solution.
+//---------------------------------------------------------------------
+      // i,j,k run over cell_low..cell_high; ii,jj,kk are the matching cell-local indices counted from 0
+      int c, i, j, k, m, ii, jj, kk, ix, iy, iz;
+      double xi, eta, zeta, Pface[5*3*2], Pxi, Peta, 
+           Pzeta, temp[5];
+#define Pface(m,n,i) Pface[(m-1)+5*((n-1)+3*(i-1))]  // component m, direction n (1=xi, 2=eta, 3=zeta), side i (1: coordinate 0, 2: coordinate 1)
+#define temp(m) temp[m-1]
+
+//---------------------------------------------------------------------
+//  Later (in compute_rhs) we compute 1/u for every element. A few of 
+//  the corner elements are not used, but it is convenient (and faster) 
+//  to compute the whole thing with a simple loop. Make sure those 
+//  values are nonzero by initializing the whole thing here. 
+//---------------------------------------------------------------------
+      for (c = 1; c <= ncells; c++) {
+         for (kk = -1; kk <= KMAX; kk++) {
+            for (jj = -1; jj <= JMAX; jj++) {
+               for (ii = -1; ii <= IMAX; ii++) {
+                  for (m = 1; m <= 5; m++) {
+                     u(m, ii, jj, kk, c) = 1.0;
+                  }
+               }
+            }
+         }
+      }
+//---------------------------------------------------------------------
+
+
+
+//---------------------------------------------------------------------
+//     first store the "interpolated" values everywhere on the grid    
+//---------------------------------------------------------------------
+      for (c = 1; c <= ncells; c++) {
+         kk = 0;
+         for (k = cell_low(3,c); k <= cell_high(3,c); k++) {
+            zeta = (double)(k) * dnzm1;
+            jj = 0;
+            for (j = cell_low(2,c); j <= cell_high(2,c); j++) {
+               eta = (double)(j) * dnym1;
+               ii = 0;
+               for (i = cell_low(1,c); i <= cell_high(1,c); i++) {
+                  xi = (double)(i) * dnxm1;
+                  // exact solution on the two opposite faces (coordinate 0 and 1) of each direction
+                  for (ix = 1; ix <= 2; ix++) {
+                     exact_solution((double)(ix-1), eta, zeta, 
+                          &Pface(1,1,ix));
+                  }
+
+                  for (iy = 1; iy <= 2; iy++) {
+                     exact_solution(xi, (double)(iy-1) , zeta, 
+                          &Pface(1,2,iy));
+                  }
+
+                  for (iz = 1; iz <= 2; iz++) {
+                     exact_solution(xi, eta, (double)(iz-1),   
+                          &Pface(1,3,iz));
+                  }
+                  // per component: linear interpolation between the two faces of each direction
+                  for (m = 1; m <= 5; m++) {
+                     Pxi   = xi   * Pface(m,1,2) + 
+                          (1.0e0-xi)   * Pface(m,1,1);
+                     Peta  = eta  * Pface(m,2,2) + 
+                          (1.0e0-eta)  * Pface(m,2,1);
+                     Pzeta = zeta * Pface(m,3,2) + 
+                          (1.0e0-zeta) * Pface(m,3,1);
+                     // combine: sum of the three, minus pairwise products, plus the triple product
+                     u(m,ii,jj,kk,c) = Pxi + Peta + Pzeta - 
+                          Pxi*Peta - Pxi*Pzeta - Peta*Pzeta + 
+                          Pxi*Peta*Pzeta;
+
+                  }
+                  ii = ii + 1;
+               }
+               jj = jj + 1;
+            }
+            kk = kk+1;
+         }
+      }
+
+//---------------------------------------------------------------------
+//     now store the exact values on the boundaries        
+//---------------------------------------------------------------------
+// Faces are written in the order west, east, south, north, bottom, top; a later write replaces an earlier one wherever both touch the same entry of u.
+//---------------------------------------------------------------------
+//     west face: xi = 0, local i index 0, in cell slice(1,1)
+//---------------------------------------------------------------------
+      c = slice(1,1);
+      ii = 0;
+      xi = 0.0e0;
+      kk = 0;
+      for (k = cell_low(3,c); k <= cell_high(3,c); k++) {
+         zeta = (double)(k) * dnzm1;
+         jj = 0;
+         for (j = cell_low(2,c); j <= cell_high(2,c); j++) {
+            eta = (double)(j) * dnym1;
+            exact_solution(xi, eta, zeta, temp);
+            for (m = 1; m <= 5; m++) {
+               u(m,ii,jj,kk,c) = temp(m);
+            }
+            jj = jj + 1;
+         }
+         kk = kk + 1;
+      }
+
+//---------------------------------------------------------------------
+//     east face: xi = 1, local i index cell_size(1,c)-1, in cell slice(1,ncells)
+//---------------------------------------------------------------------
+      c  = slice(1,ncells);
+      ii = cell_size(1,c)-1;
+      xi = 1.0e0;
+      kk = 0;
+      for (k = cell_low(3,c); k <= cell_high(3,c); k++) {
+         zeta = (double)(k) * dnzm1;
+         jj = 0;
+         for (j = cell_low(2,c); j <= cell_high(2,c); j++) {
+            eta = (double)(j) * dnym1;
+            exact_solution(xi, eta, zeta, temp);
+            for (m = 1; m <= 5; m++) {
+               u(m,ii,jj,kk,c) = temp(m);
+            }
+            jj = jj + 1;
+         }
+         kk = kk + 1;
+      }
+
+//---------------------------------------------------------------------
+//     south face: eta = 0, local j index 0, in cell slice(2,1)
+//---------------------------------------------------------------------
+      c = slice(2,1);
+      jj = 0;
+      eta = 0.0e0;
+      kk = 0;
+      for (k = cell_low(3,c); k <= cell_high(3,c); k++) {
+         zeta = (double)(k) * dnzm1;
+         ii = 0;
+         for (i = cell_low(1,c); i <= cell_high(1,c); i++) {
+            xi = (double)(i) * dnxm1;
+            exact_solution(xi, eta, zeta, temp);
+            for (m = 1; m <= 5; m++) {
+               u(m,ii,jj,kk,c) = temp(m);
+            }
+            ii = ii + 1;
+         }
+         kk = kk + 1;
+      }
+
+
+//---------------------------------------------------------------------
+//     north face: eta = 1, local j index cell_size(2,c)-1, in cell slice(2,ncells)
+//---------------------------------------------------------------------
+      c = slice(2,ncells);
+      jj = cell_size(2,c)-1;
+      eta = 1.0e0;
+      kk = 0;
+      for (k = cell_low(3,c); k <= cell_high(3,c); k++) {
+         zeta = (double)(k) * dnzm1;
+         ii = 0;
+         for (i = cell_low(1,c); i <= cell_high(1,c); i++) {
+            xi = (double)(i) * dnxm1;
+            exact_solution(xi, eta, zeta, temp);
+            for (m = 1; m <= 5; m++) {
+               u(m,ii,jj,kk,c) = temp(m);
+            }
+            ii = ii + 1;
+         }
+         kk = kk + 1;
+      }
+
+//---------------------------------------------------------------------
+//     bottom face: zeta = 0, local k index 0, in cell slice(3,1)
+//---------------------------------------------------------------------
+      c = slice(3,1);
+      kk = 0;
+      zeta = 0.0e0;
+      jj = 0;
+      for (j = cell_low(2,c); j <= cell_high(2,c); j++) {
+         eta = (double)(j) * dnym1;
+         ii = 0;
+         for (i = cell_low(1,c); i <= cell_high(1,c); i++) {
+            xi = (double)(i) *dnxm1;
+            exact_solution(xi, eta, zeta, temp);
+            for (m = 1; m <= 5; m++) {
+               u(m,ii,jj,kk,c) = temp(m);
+            }
+            ii = ii + 1;
+         }
+         jj = jj + 1;
+      }
+
+//---------------------------------------------------------------------
+//     top face: zeta = 1, local k index cell_size(3,c)-1, in cell slice(3,ncells)
+//---------------------------------------------------------------------
+      c = slice(3,ncells);
+      kk = cell_size(3,c)-1;
+      zeta = 1.0e0;
+      jj = 0;
+      for (j = cell_low(2,c); j <= cell_high(2,c); j++) {
+         eta = (double)(j) * dnym1;
+         ii = 0;
+         for (i = cell_low(1,c); i <= cell_high(1,c); i++) {
+            xi = (double)(i) * dnxm1;
+            exact_solution(xi, eta, zeta, temp);
+            for (m = 1; m <= 5; m++) {
+               u(m,ii,jj,kk,c) = temp(m);
+            }
+            ii = ii + 1;
+         }
+         jj = jj + 1;
+      }
+
+      return;
+}
+
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+void lhsinit() {
+
+//---------------------------------------------------------------------
+//     For each cell 1..ncells: set the start/end flags of the three
+//     directions from cell_coord, then zero the cell's lhsc blocks.
+//---------------------------------------------------------------------
+
+      int cell, dir, row, col, ix, jx, kx;
+
+      for (cell = 1; cell <= ncells; cell++) {
+
+         // start(dir,cell) is 1 when cell_coord(dir,cell) == 1 and
+         // end(dir,cell) is 1 when cell_coord(dir,cell) == ncells;
+         // otherwise the flag is 0.
+         for (dir = 1; dir <= 3; dir++) {
+            start(dir,cell) = (cell_coord(dir,cell) == 1)      ? 1 : 0;
+            end(dir,cell)   = (cell_coord(dir,cell) == ncells) ? 1 : 0;
+         }
+
+         // Zero all 5x5 entries of lhsc at every point with indices
+         // 0 .. cell_size-1 in each direction.
+         for (kx = 0; kx < cell_size(3,cell); kx++) {
+            for (jx = 0; jx < cell_size(2,cell); jx++) {
+               for (ix = 0; ix < cell_size(1,cell); ix++) {
+                  for (row = 1; row <= 5; row++) {
+                     for (col = 1; col <= 5; col++) {
+                        lhsc(row,col,ix,jx,kx,cell) = 0.0e0;
+                     }
+                  }
+               }
+            }
+         }
+      }
+}
+
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+void lhsabinit(double lhsa[], double lhsb[], int size) {
+
+#define lhsa(m,n,i) lhsa[(m-1)+5*((n-1)+5*(i+1))]
+#define lhsb(m,n,i) lhsb[(m-1)+5*((n-1)+5*(i+1))]
+
+//---------------------------------------------------------------------
+//     For i = 0 .. size, set the 5x5 block lhsa(.,.,i) to zero and the
+//     5x5 block lhsb(.,.,i) to the identity. Block i starts at element
+//     25*(i+1), so each array must hold at least 25*(size+2) doubles;
+//     the leading block (i = -1) is not written.
+//---------------------------------------------------------------------
+      int pt, row, col;
+
+      for (pt = 0; pt <= size; pt++) {
+         for (row = 1; row <= 5; row++) {
+            for (col = 1; col <= 5; col++) {
+               lhsa(row,col,pt) = 0.0e0;
+               lhsb(row,col,pt) = (row == col) ? 1.0e0 : 0.0e0;
+            }
+         }
+      }
+}
+
+
+

+ 5 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/inputbt.data.sample.svn-base

@@ -0,0 +1,5 @@
+200       number of time steps
+0.0008d0  dt for class A = 0.0008d0. class B = 0.0003d0  class C = 0.0001d0
+64 64 64
+5 0        write interval (optional read interval) for BTIO
+0 1000000  number of nodes in collective buffering and buffer size for BTIO

+ 222 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/make_set.c.svn-base

@@ -0,0 +1,222 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "header.h"
+#include "mpinpb.h"
+
+#define mod(p,q) ((p)%(q))                       // C remainder; callers below add p to arguments that could go negative
+#define max(x,y)      ((x)>(y)? (x) : (y))       // larger of x and y; the selected argument is evaluated twice
+#define min(x,y)      ((x)<(y)? (x) : (y))       // smaller of x and y; the selected argument is evaluated twice
+
+void make_set() {
+
+//---------------------------------------------------------------------
+//     Fills in the description of the set of cells handled by this
+//     node, arranged such that communication between cells on different
+//     nodes is only nearest neighbor.
+//
+//     Takes no arguments and returns nothing. Reads no_nodes, node and
+//     grid_points; writes ncells, cell_coord, slice, predecessor,
+//     successor, cell_size, cell_low and cell_high. Prints an error and
+//     calls exit(1) if any cell has fewer than 3 points in a direction.
+//---------------------------------------------------------------------
+
+      int p, i, j, c, dir, size, excess;
+
+//---------------------------------------------------------------------
+//     compute square root; add small number to allow for roundoff
+//     (make_color in this file repeats the same computation)
+//---------------------------------------------------------------------
+      ncells = (int)(sqrt((double)(no_nodes) + 0.00001e0));
+
+//---------------------------------------------------------------------
+//     this makes coding easier
+//---------------------------------------------------------------------
+      p = ncells;
+
+//---------------------------------------------------------------------
+//     determine the location of the cell at the bottom of the 3D
+//     array of cells (coordinates are still 0-based at this point)
+//---------------------------------------------------------------------
+      cell_coord(1,1) = mod(node,p) ;
+      cell_coord(2,1) = node/p ;
+      cell_coord(3,1) = 0;
+
+//---------------------------------------------------------------------
+//     set the cell_coords for cells in the rest of the z-layers;
+//     this comes down to a simple linear numbering in the z-direction,
+//     and to the doubly-cyclic numbering in the other directions
+//     (p is added before taking the remainder so it stays non-negative)
+//---------------------------------------------------------------------
+      for (c = 2; c <= p; c++) {
+         cell_coord(1,c) = mod(cell_coord(1,c-1)+1,p) ;
+         cell_coord(2,c) = mod(cell_coord(2,c-1)-1+p,p) ;
+         cell_coord(3,c) = c-1;
+      }
+
+//---------------------------------------------------------------------
+//     offset all the coordinates by 1, making them 1-based
+//     (Fortran-style array numbering)
+//---------------------------------------------------------------------
+      for (dir = 1; dir <= 3; dir++) {
+         for (c = 1; c <= p; c++) {
+            cell_coord(dir,c) = cell_coord(dir,c) + 1;
+         }
+      }
+
+//---------------------------------------------------------------------
+//     slice(dir,n) contains the sequence number of the cell that is in
+//     coordinate plane n in the dir direction
+//---------------------------------------------------------------------
+      for (dir = 1; dir <= 3; dir++) {
+         for (c = 1; c <= p; c++) {
+            slice(dir,cell_coord(dir,c)) = c;
+         }
+      }
+
+
+//---------------------------------------------------------------------
+//     fill the predecessor and successor entries, using the indices
+//     of the bottom cells (they are the same at each level of k
+//     anyway) acting as if full periodicity pertains; note that p is
+//     added to those arguments to the mod functions that might
+//     otherwise return wrong values when using the modulo function
+//---------------------------------------------------------------------
+      i = cell_coord(1,1)-1;
+      j = cell_coord(2,1)-1;
+
+      predecessor(1) = mod(i-1+p,p) + p*j;
+      predecessor(2) = i + p*mod(j-1+p,p);
+      predecessor(3) = mod(i+1,p) + p*mod(j-1+p,p);
+      successor(1)   = mod(i+1,p) + p*j;
+      successor(2)   = i + p*mod(j+1,p);
+      successor(3)   = mod(i-1+p,p) + p*mod(j+1,p);
+
+//---------------------------------------------------------------------
+//     now compute the sizes of the cells
+//---------------------------------------------------------------------
+      for (dir = 1; dir <= 3; dir++) {
+//---------------------------------------------------------------------
+//     set cell_coord range for each direction: the grid points are
+//     divided evenly over p cells, and the first `excess` coordinate
+//     positions each take one extra point
+//---------------------------------------------------------------------
+         size   = grid_points(dir)/p;
+         excess = mod(grid_points(dir),p);
+         for (c = 1; c <= ncells; c++) {
+            if (cell_coord(dir,c) <= excess) {
+               cell_size(dir,c) = size+1;
+               cell_low(dir,c) = (cell_coord(dir,c)-1)*(size+1);
+               cell_high(dir,c) = cell_low(dir,c)+size;
+            } else {
+               cell_size(dir,c) = size;
+               cell_low(dir,c)  = excess*(size+1)+
+                    (cell_coord(dir,c)-excess-1)*size;
+               cell_high(dir,c) = cell_low(dir,c)+size-1;
+            }
+            // a cell with fewer than 3 points in any direction is fatal
+            if (cell_size(dir, c) <= 2) {
+               printf(" Error: Cell size too small. Min size is 3\n");
+               exit(1);
+            }
+         }
+      }
+
+      return;
+}
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+
+//---------------------------------------------------------------------
+//     Returns the rank reached from rank `from` by one hop in direction
+//     `dir`, with ranks laid out periodically as i = from mod p and
+//     j = from / p. A direction that matches none of the six cases
+//     leaves the rank unchanged.
+//---------------------------------------------------------------------
+static int cycle_neighbor(int from, int dir, int p) {
+
+      int i = mod(from,p);
+      int j = from/p;
+
+      switch (dir) {
+        case (WESTDIR):   return mod(i-1+p,p) + p*j;
+        case (EASTDIR):   return mod(i+1,p) + p*j;
+        case (SOUTHDIR):  return i + p*mod(j-1+p,p);
+        case (NORTHDIR):  return i + p*mod(j+1,p);
+        case (BOTTOMDIR): return mod(i+1,p) + p*mod(j-1+p,p);
+        case (TOPDIR):    return mod(i-1+p,p) + p*mod(j+1,p);
+        default:          return from;
+      }
+}
+
+void make_color() {
+
+//---------------------------------------------------------------------
+//     This function determines cycles in the communication graphs in
+//     the six coordinate directions, and colors the ranks so they know
+//     how to construct deadlock-free blocking communication schedules.
+//
+//     Takes no arguments and returns nothing. Reads no_nodes and node;
+//     writes ncells and, for each direction dir in 0..5, send_color[dir]
+//     and recv_color[dir] (values 0, 1, or 2 for the odd-cycle case).
+//---------------------------------------------------------------------
+
+      int p, dir, node_loc, node_min, length, start_found;
+
+      // The walk below resets comm_color to 0 when it reaches the lowest
+      // rank of the cycle, which always happens no later than the final
+      // hop back to this node. Starting from a defined value avoids
+      // toggling an indeterminate variable on the hops before that.
+      int comm_color = 0;
+
+//---------------------------------------------------------------------
+//     compute square root; add small number to allow for roundoff
+//     (make_set in this file repeats the same computation)
+//---------------------------------------------------------------------
+      ncells = (int)(sqrt((double)(no_nodes) + 0.00001e0));
+
+//---------------------------------------------------------------------
+//     this makes coding easier
+//---------------------------------------------------------------------
+      p = ncells;
+
+      for (dir = 0; dir<6; dir++) {
+
+        // walk the cycle through this node in direction dir until we
+        // are back at the start; length ends up as the cycle length
+        node_loc = node_min = node; length = 1; start_found = 0;
+        while (!start_found) {
+          node_loc = cycle_neighbor(node_loc, dir, p);
+
+          // the next block ensures that the node with the lowest rank
+          // in this cycle is colored WHITE (=0), and that nodes an even
+          // number of jumps removed from that lowest-ranked member
+          // are also white. The others are RED (1).
+          if (node_loc <= node_min) {
+            node_min = node_loc;
+            comm_color = 0;
+          } else comm_color = !comm_color;
+          if (node_loc == node) start_found = 1;
+          else length++;
+        }
+        send_color[dir] = comm_color;
+        recv_color[dir] = !send_color[dir];
+        // if the number of nodes in this cycle is odd, we need to treat the
+        // last node before the "start" of the cycle differently
+        if (length%2) {
+          if (node == node_min) recv_color[dir] = 2;
+          // this node is the one whose neighbor in direction dir is the
+          // lowest-ranked member of the cycle
+          if (cycle_neighbor(node, dir, p) == node_min) send_color[dir] = 2;
+        }
+      }
+     return;
+}
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+

+ 34 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/mpinpb.h.svn-base

@@ -0,0 +1,34 @@
+
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#ifndef __MPINPB_H   // include guard; NOTE(review): names containing "__" are reserved for the implementation
+#define __MPINPB_H
+// A file that defines G_MAIN before including this header gets the definitions; every other file gets extern declarations.
+#ifdef G_MAIN
+       int           node, no_nodes, total_nodes, root;   // presumably this rank's id and the rank counts -- confirm against the setup code
+       int           active;
+#else
+extern int           node, no_nodes, total_nodes, root;
+extern int           active;
+// end of the extern declarations
+#endif
+#ifdef _OPENMP   // under OpenMP each thread gets its own copy of these variables
+#pragma omp threadprivate (node, no_nodes, total_nodes, root, active)
+#endif
+#endif
+

+ 104 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/print_results.c.svn-base

@@ -0,0 +1,104 @@
+/*****************************************************************/
+/******     C  _  P  R  I  N  T  _  R  E  S  U  L  T  S     ******/
+/*****************************************************************/
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+#include <stdlib.h>
+#include <stdio.h>
+// `class` is used as a parameter name below; presumably it is renamed
+// here so the file also builds with a C++ compiler -- confirm.
+#define class _class_
+
+//---------------------------------------------------------------------
+//     Prints the end-of-run benchmark report to standard output.
+//
+//     name                 benchmark name shown in the banner
+//     class                problem class, printed as a single character
+//     n1, n2, n3           problem size, printed as "n1x n2x n3"
+//     niter                iteration count
+//     nprocs_compiled      printed only when non-zero
+//     nprocs_total         process count; also divides mops for the
+//                          per-process rate
+//     t                    time in seconds
+//     mops                 total Mop/s
+//     optype               operation type label
+//     passed_verification  non-zero prints SUCCESSFUL, zero UNSUCCESSFUL
+//     npbversion, compiletime, mpicc, clink, cmpi_lib, cmpi_inc,
+//     cflags, clinkflags   build information strings, printed verbatim
+//
+//     All string arguments are passed to "%s" and must not be NULL.
+//---------------------------------------------------------------------
+void print_results( char   *name,
+                      char   class,
+                      int    n1,
+                      int    n2,
+                      int    n3,
+                      int    niter,
+                      int    nprocs_compiled,
+                      int    nprocs_total,
+                      double t,
+                      double mops,
+                      char   *optype,
+                      int    passed_verification,
+                      char   *npbversion,
+                      char   *compiletime,
+                      char   *mpicc,
+                      char   *clink,
+                      char   *cmpi_lib,
+                      char   *cmpi_inc,
+                      char   *cflags,
+                      char   *clinkflags )
+{
+    printf( "\n\n %s Benchmark Completed\n", name );
+
+    printf( " Class           =                        %c\n", class );
+
+    printf( " Size            =            %3dx %3dx %3d\n", n1,n2,n3 );
+
+    printf( " Iterations      =             %12d\n", niter );
+
+    printf( " Time in seconds =             %12.2f\n", t );
+
+    printf( " Total processes =             %12d\n", nprocs_total );
+
+    if ( nprocs_compiled != 0 )
+        printf( " Compiled procs  =             %12d\n", nprocs_compiled );
+
+    printf( " Mop/s total     =             %12.2f\n", mops );
+
+    printf( " Mop/s/process   =             %12.2f\n", mops/((float) nprocs_total) );
+
+    printf( " Operation type  = %24s\n", optype);
+
+    if( passed_verification )
+        printf( " Verification    =               SUCCESSFUL\n" );
+    else
+        printf( " Verification    =             UNSUCCESSFUL\n" );
+
+    printf( " Version         =             %12s\n", npbversion );
+
+    printf( " Compile date    =             %12s\n", compiletime );
+
+    printf( "\n Compile options:\n" );
+
+    printf( "    MPICC        = %s\n", mpicc );
+
+    printf( "    CLINK        = %s\n", clink );
+
+    printf( "    CMPI_LIB     = %s\n", cmpi_lib );
+
+    printf( "    CMPI_INC     = %s\n", cmpi_inc );
+
+    printf( "    CFLAGS       = %s\n", cflags );
+
+    printf( "    CLINKFLAGS   = %s\n", clinkflags );
+#ifdef SMP
+    {
+        // getenv() returns NULL when the variable is not set, and NULL
+        // must not be handed to "%s"; print a placeholder instead.
+        const char *evalue = getenv("MP_SET_NUMTHREADS");
+        printf( "   MULTICPUS = %s\n", evalue ? evalue : "(unset)" );
+    }
+#endif
+
+    printf( "\n\n" );
+    printf( " Please send the results of this run to:\n\n" );
+    printf( " NPB Development Team\n" );
+    printf( " Internet: npb@nas.nasa.gov\n \n" );
+    printf( " If email is not available, send this to:\n\n" );
+    printf( " MS T27A-1\n" );
+    printf( " NASA Ames Research Center\n" );
+    printf( " Moffett Field, CA  94035-1000\n\n" );
+    printf( " Fax: 650-604-3957\n\n" );
+}
+ 

+ 439 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/rhs.c.svn-base

@@ -0,0 +1,439 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include "header.h"
+
+void compute_rhs() {
+//     Builds the right hand side rhs for every cell owned by this node:
+//     derived fields first, then the forcing term, then the flux and
+//     fourth order dissipation contributions in the xi, eta and zeta
+//     directions, and finally a scaling of the result by dt.
+      int c, i, j, k, m;
+      double rho_inv, uijk, up1, um1, vijk, vp1, vm1,
+           wijk, wp1, wm1;
+
+
+//---------------------------------------------------------------------
+//     loop over all cells owned by this node                           
+//---------------------------------------------------------------------
+      for (c = 1; c <= ncells; c++) {
+
+//---------------------------------------------------------------------
+//     compute the reciprocal of density (rho_i), the velocities us, vs,
+//     ws, and the kinetic energy terms square and qs, for -1..cell_size
+//---------------------------------------------------------------------
+         for (k = -1; k <= cell_size(3,c); k++) {
+            for (j = -1; j <= cell_size(2,c); j++) {
+               for (i = -1; i <= cell_size(1,c); i++) {
+                  rho_inv = 1.0e0/u(1,i,j,k,c);
+                  rho_i(i,j,k,c) = rho_inv;
+                  us(i,j,k,c) = u(2,i,j,k,c) * rho_inv;
+                  vs(i,j,k,c) = u(3,i,j,k,c) * rho_inv;
+                  ws(i,j,k,c) = u(4,i,j,k,c) * rho_inv;
+                  square(i,j,k,c)     = 0.5e0* (
+                       u(2,i,j,k,c)*u(2,i,j,k,c) + 
+                       u(3,i,j,k,c)*u(3,i,j,k,c) +
+                       u(4,i,j,k,c)*u(4,i,j,k,c) ) * rho_inv;
+                  qs(i,j,k,c) = square(i,j,k,c) * rho_inv;
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+// copy the exact forcing term to the right hand side;  because 
+// this forcing term is known, we can store it on the whole of every 
+// cell,  including the boundary                   
+//---------------------------------------------------------------------
+
+         for (k = 0; k <= cell_size(3,c)-1; k++) {
+            for (j = 0; j <= cell_size(2,c)-1; j++) {
+               for (i = 0; i <= cell_size(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = forcing(m,i,j,k,c);
+                  }
+               }
+            }
+         }
+
+
+//---------------------------------------------------------------------
+//     compute xi-direction fluxes 
+//---------------------------------------------------------------------
+         for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  uijk = us(i,j,k,c);
+                  up1  = us(i+1,j,k,c);
+                  um1  = us(i-1,j,k,c);
+
+                  rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dx1tx1 * 
+                       (u(1,i+1,j,k,c) - 2.0e0*u(1,i,j,k,c) + 
+                       u(1,i-1,j,k,c)) -
+                       tx2 * (u(2,i+1,j,k,c) - u(2,i-1,j,k,c));
+
+                  rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dx2tx1 * 
+                       (u(2,i+1,j,k,c) - 2.0e0*u(2,i,j,k,c) + 
+                       u(2,i-1,j,k,c)) +
+                       xxcon2*con43 * (up1 - 2.0e0*uijk + um1) -
+                       tx2 * (u(2,i+1,j,k,c)*up1 - 
+                       u(2,i-1,j,k,c)*um1 +
+                       (u(5,i+1,j,k,c)- square(i+1,j,k,c)-
+                       u(5,i-1,j,k,c)+ square(i-1,j,k,c))*
+                       c2);
+
+                  rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dx3tx1 * 
+                       (u(3,i+1,j,k,c) - 2.0e0*u(3,i,j,k,c) +
+                       u(3,i-1,j,k,c)) +
+                       xxcon2 * (vs(i+1,j,k,c) - 2.0e0*vs(i,j,k,c) +
+                       vs(i-1,j,k,c)) -
+                       tx2 * (u(3,i+1,j,k,c)*up1 - 
+                       u(3,i-1,j,k,c)*um1);
+
+                  rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dx4tx1 * 
+                       (u(4,i+1,j,k,c) - 2.0e0*u(4,i,j,k,c) +
+                       u(4,i-1,j,k,c)) +
+                       xxcon2 * (ws(i+1,j,k,c) - 2.0e0*ws(i,j,k,c) +
+                       ws(i-1,j,k,c)) -
+                       tx2 * (u(4,i+1,j,k,c)*up1 - 
+                       u(4,i-1,j,k,c)*um1);
+
+                  rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dx5tx1 * 
+                       (u(5,i+1,j,k,c) - 2.0e0*u(5,i,j,k,c) +
+                       u(5,i-1,j,k,c)) +
+                       xxcon3 * (qs(i+1,j,k,c) - 2.0e0*qs(i,j,k,c) +
+                       qs(i-1,j,k,c)) +
+                       xxcon4 * (up1*up1 -       2.0e0*uijk*uijk + 
+                       um1*um1) +
+                       xxcon5 * (u(5,i+1,j,k,c)*rho_i(i+1,j,k,c) - 
+                       2.0e0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
+                       u(5,i-1,j,k,c)*rho_i(i-1,j,k,c)) -
+                       tx2 * ( (c1*u(5,i+1,j,k,c) - 
+                       c2*square(i+1,j,k,c))*up1 -
+                       (c1*u(5,i-1,j,k,c) - 
+                       c2*square(i-1,j,k,c))*um1 );
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     add fourth order xi-direction dissipation               
+//---------------------------------------------------------------------
+         if (start(1,c) > 0) {   // shortened stencils at i = 1 and i = 2
+            for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+               for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+                  i = 1;
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * 
+                          ( 5.0e0*u(m,i,j,k,c) - 4.0e0*u(m,i+1,j,k,c) +
+                          u(m,i+2,j,k,c));
+                  }
+
+                  i = 2;
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+                          (-4.0e0*u(m,i-1,j,k,c) + 6.0e0*u(m,i,j,k,c) -
+                          4.0e0*u(m,i+1,j,k,c) + u(m,i+2,j,k,c));
+                  }
+               }
+            }
+         }
+//     full five-point stencil (i-2 .. i+2) on the remaining i range
+         for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               for (i = 3*start(1,c); i <= cell_size(1,c)-3*end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+                          (  u(m,i-2,j,k,c) - 4.0e0*u(m,i-1,j,k,c) + 
+                          6.0*u(m,i,j,k,c) - 4.0e0*u(m,i+1,j,k,c) + 
+                          u(m,i+2,j,k,c) );
+                  }
+               }
+            }
+         }
+         
+
+         if (end(1,c) > 0) {   // shortened stencils at i = cell_size-3 and cell_size-2
+            for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+               for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+                  i = cell_size(1,c)-3;
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+                          ( u(m,i-2,j,k,c) - 4.0e0*u(m,i-1,j,k,c) + 
+                          6.0e0*u(m,i,j,k,c) - 4.0e0*u(m,i+1,j,k,c) );
+                  }
+
+                  i = cell_size(1,c)-2;
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+                          ( u(m,i-2,j,k,c) - 4.e0*u(m,i-1,j,k,c) +
+                          5.e0*u(m,i,j,k,c) );
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     compute eta-direction fluxes 
+//---------------------------------------------------------------------
+         for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  vijk = vs(i,j,k,c);
+                  vp1  = vs(i,j+1,k,c);
+                  vm1  = vs(i,j-1,k,c);
+                  rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dy1ty1 * 
+                       (u(1,i,j+1,k,c) - 2.0e0*u(1,i,j,k,c) + 
+                       u(1,i,j-1,k,c)) -
+                       ty2 * (u(3,i,j+1,k,c) - u(3,i,j-1,k,c));
+                  rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dy2ty1 * 
+                       (u(2,i,j+1,k,c) - 2.0e0*u(2,i,j,k,c) + 
+                       u(2,i,j-1,k,c)) +
+                       yycon2 * (us(i,j+1,k,c) - 2.0e0*us(i,j,k,c) + 
+                       us(i,j-1,k,c)) -
+                       ty2 * (u(2,i,j+1,k,c)*vp1 - 
+                       u(2,i,j-1,k,c)*vm1);
+                  rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dy3ty1 * 
+                       (u(3,i,j+1,k,c) - 2.0e0*u(3,i,j,k,c) + 
+                       u(3,i,j-1,k,c)) +
+                       yycon2*con43 * (vp1 - 2.0e0*vijk + vm1) -
+                       ty2 * (u(3,i,j+1,k,c)*vp1 - 
+                       u(3,i,j-1,k,c)*vm1 +
+                       (u(5,i,j+1,k,c) - square(i,j+1,k,c) - 
+                       u(5,i,j-1,k,c) + square(i,j-1,k,c))
+                       *c2);
+                  rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dy4ty1 * 
+                       (u(4,i,j+1,k,c) - 2.0e0*u(4,i,j,k,c) + 
+                       u(4,i,j-1,k,c)) +
+                       yycon2 * (ws(i,j+1,k,c) - 2.0e0*ws(i,j,k,c) + 
+                       ws(i,j-1,k,c)) -
+                       ty2 * (u(4,i,j+1,k,c)*vp1 - 
+                       u(4,i,j-1,k,c)*vm1);
+                  rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dy5ty1 * 
+                       (u(5,i,j+1,k,c) - 2.0e0*u(5,i,j,k,c) + 
+                       u(5,i,j-1,k,c)) +
+                       yycon3 * (qs(i,j+1,k,c) - 2.0e0*qs(i,j,k,c) + 
+                       qs(i,j-1,k,c)) +
+                       yycon4 * (vp1*vp1       - 2.0e0*vijk*vijk + 
+                       vm1*vm1) +
+                       yycon5 * (u(5,i,j+1,k,c)*rho_i(i,j+1,k,c) - 
+                       2.0e0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
+                       u(5,i,j-1,k,c)*rho_i(i,j-1,k,c)) -
+                       ty2 * ((c1*u(5,i,j+1,k,c) - 
+                       c2*square(i,j+1,k,c)) * vp1 -
+                       (c1*u(5,i,j-1,k,c) - 
+                       c2*square(i,j-1,k,c)) * vm1);
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     add fourth order eta-direction dissipation         
+//---------------------------------------------------------------------
+         if (start(2,c) > 0) {   // shortened stencils at j = 1 and j = 2
+            for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+               j = 1;
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * 
+                          ( 5.0e0*u(m,i,j,k,c) - 4.0e0*u(m,i,j+1,k,c) +
+                          u(m,i,j+2,k,c));
+                  }
+               }
+
+               j = 2;
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+                          (-4.0e0*u(m,i,j-1,k,c) + 6.0e0*u(m,i,j,k,c) -
+                          4.0e0*u(m,i,j+1,k,c) + u(m,i,j+2,k,c));
+                  }
+               }
+            }
+         }
+//     full five-point stencil (j-2 .. j+2) on the remaining j range
+         for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+            for (j = 3*start(2,c); j <= cell_size(2,c)-3*end(2,c)-1; j++) {
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+                          (  u(m,i,j-2,k,c) - 4.0e0*u(m,i,j-1,k,c) + 
+                          6.0*u(m,i,j,k,c) - 4.0e0*u(m,i,j+1,k,c) + 
+                          u(m,i,j+2,k,c) );
+                  }
+               }
+            }
+         }
+         
+         if (end(2,c) > 0) {   // shortened stencils at j = cell_size-3 and cell_size-2
+            for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+               j = cell_size(2,c)-3;
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+                          ( u(m,i,j-2,k,c) - 4.0e0*u(m,i,j-1,k,c) + 
+                          6.0e0*u(m,i,j,k,c) - 4.0e0*u(m,i,j+1,k,c) );
+                  }
+               }
+
+               j = cell_size(2,c)-2;
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+                          ( u(m,i,j-2,k,c) - 4.e0*u(m,i,j-1,k,c) +
+                          5.e0*u(m,i,j,k,c) );
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     compute zeta-direction fluxes 
+//---------------------------------------------------------------------
+         for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  wijk = ws(i,j,k,c);
+                  wp1  = ws(i,j,k+1,c);
+                  wm1  = ws(i,j,k-1,c);
+
+                  rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dz1tz1 * 
+                       (u(1,i,j,k+1,c) - 2.0e0*u(1,i,j,k,c) + 
+                       u(1,i,j,k-1,c)) -
+                       tz2 * (u(4,i,j,k+1,c) - u(4,i,j,k-1,c));
+                  rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dz2tz1 * 
+                       (u(2,i,j,k+1,c) - 2.0e0*u(2,i,j,k,c) + 
+                       u(2,i,j,k-1,c)) +
+                       zzcon2 * (us(i,j,k+1,c) - 2.0e0*us(i,j,k,c) + 
+                       us(i,j,k-1,c)) -
+                       tz2 * (u(2,i,j,k+1,c)*wp1 - 
+                       u(2,i,j,k-1,c)*wm1);
+                  rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dz3tz1 * 
+                       (u(3,i,j,k+1,c) - 2.0e0*u(3,i,j,k,c) + 
+                       u(3,i,j,k-1,c)) +
+                       zzcon2 * (vs(i,j,k+1,c) - 2.0e0*vs(i,j,k,c) + 
+                       vs(i,j,k-1,c)) -
+                       tz2 * (u(3,i,j,k+1,c)*wp1 - 
+                       u(3,i,j,k-1,c)*wm1);
+                  rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dz4tz1 * 
+                       (u(4,i,j,k+1,c) - 2.0e0*u(4,i,j,k,c) + 
+                       u(4,i,j,k-1,c)) +
+                       zzcon2*con43 * (wp1 - 2.0e0*wijk + wm1) -
+                       tz2 * (u(4,i,j,k+1,c)*wp1 - 
+                       u(4,i,j,k-1,c)*wm1 +
+                       (u(5,i,j,k+1,c) - square(i,j,k+1,c) - 
+                       u(5,i,j,k-1,c) + square(i,j,k-1,c))
+                       *c2);
+                  rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dz5tz1 * 
+                       (u(5,i,j,k+1,c) - 2.0e0*u(5,i,j,k,c) + 
+                       u(5,i,j,k-1,c)) +
+                       zzcon3 * (qs(i,j,k+1,c) - 2.0e0*qs(i,j,k,c) + 
+                       qs(i,j,k-1,c)) +
+                       zzcon4 * (wp1*wp1 - 2.0e0*wijk*wijk + 
+                       wm1*wm1) +
+                       zzcon5 * (u(5,i,j,k+1,c)*rho_i(i,j,k+1,c) - 
+                       2.0e0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
+                       u(5,i,j,k-1,c)*rho_i(i,j,k-1,c)) -
+                       tz2 * ( (c1*u(5,i,j,k+1,c) - 
+                       c2*square(i,j,k+1,c))*wp1 -
+                       (c1*u(5,i,j,k-1,c) - 
+                       c2*square(i,j,k-1,c))*wm1);
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     add fourth order zeta-direction dissipation                
+//---------------------------------------------------------------------
+         if (start(3,c) > 0) {   // shortened stencils at k = 1 and k = 2
+            k = 1;
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * 
+                          ( 5.0e0*u(m,i,j,k,c) - 4.0e0*u(m,i,j,k+1,c) +
+                          u(m,i,j,k+2,c));
+                  }
+               }
+            }
+
+            k = 2;
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+                          (-4.0e0*u(m,i,j,k-1,c) + 6.0e0*u(m,i,j,k,c) -
+                          4.0e0*u(m,i,j,k+1,c) + u(m,i,j,k+2,c));
+                  }
+               }
+            }
+         }
+//     full five-point stencil (k-2 .. k+2) on the remaining k range
+         for (k = 3*start(3,c); k <= cell_size(3,c)-3*end(3,c)-1; k++) {
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+                          (  u(m,i,j,k-2,c) - 4.0e0*u(m,i,j,k-1,c) + 
+                          6.0*u(m,i,j,k,c) - 4.0e0*u(m,i,j,k+1,c) + 
+                          u(m,i,j,k+2,c) );
+                  }
+               }
+            }
+         }
+         
+         if (end(3,c) > 0) {   // shortened stencils at k = cell_size-3 and cell_size-2
+            k = cell_size(3,c)-3;
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+                          ( u(m,i,j,k-2,c) - 4.0e0*u(m,i,j,k-1,c) + 
+                          6.0e0*u(m,i,j,k,c) - 4.0e0*u(m,i,j,k+1,c) );
+                  }
+               }
+            }
+
+            k = cell_size(3,c)-2;
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+                          ( u(m,i,j,k-2,c) - 4.e0*u(m,i,j,k-1,c) +
+                          5.e0*u(m,i,j,k,c) );
+                  }
+               }
+            }
+         }
+//     finally, multiply every updated right hand side entry by dt
+         for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) * dt;
+                  }
+               }
+            }
+         }
+
+      }
+      
+      return;
+}
+
+
+
+

+ 220 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/set_constants.c.svn-base

@@ -0,0 +1,220 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include <math.h>
+#include "header.h"
+
+#define dmax1(x,y) ((x)>(y)? (x):(y))
+
+void  set_constants() {
+
+//---------------------------------------------------------------------
+//     Assigns the ce coefficient table, the fixed scalar constants,
+//     and every value derived from those constants, from
+//     grid_points(1..3) and from dt.  Takes no arguments and returns
+//     nothing; all results are written to names declared outside this
+//     function.
+//---------------------------------------------------------------------
+
+//---------------------------------------------------------------------
+//     ce_table[m-1][n-1] is the value stored into ce(m,n)
+//---------------------------------------------------------------------
+      static const double ce_table[5][13] = {
+         { 2.0, 0.0, 0.0, 4.0, 5.0, 3.0, 0.5,
+           0.02, 0.01, 0.03, 0.5, 0.4, 0.3 },
+         { 1.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0,
+           0.01, 0.03, 0.02, 0.4, 0.3, 0.5 },
+         { 2.0, 2.0, 0.0, 0.0, 0.0, 2.0, 3.0,
+           0.04, 0.03, 0.05, 0.3, 0.5, 0.4 },
+         { 2.0, 2.0, 0.0, 0.0, 0.0, 2.0, 3.0,
+           0.03, 0.05, 0.04, 0.2, 0.1, 0.3 },
+         { 5.0, 4.0, 3.0, 2.0, 0.1, 0.4, 0.3,
+           0.05, 0.04, 0.03, 0.1, 0.3, 0.2 }
+      };
+      int ce_row, ce_col;
+
+      for (ce_row = 1; ce_row <= 5; ce_row++) {
+         for (ce_col = 1; ce_col <= 13; ce_col++) {
+            ce(ce_row,ce_col) = ce_table[ce_row-1][ce_col-1];
+         }
+      }
+
+//---------------------------------------------------------------------
+//     fixed scalar constants
+//---------------------------------------------------------------------
+      c1 = 1.4;
+      c2 = 0.4;
+      c3 = 0.1;
+      c4 = 1.0;
+      c5 = 1.4;
+
+      bt = sqrt(0.5);
+
+//---------------------------------------------------------------------
+//     reciprocal of (number of grid points - 1) in each direction
+//---------------------------------------------------------------------
+      dnxm1 = 1.0 / (double)(grid_points(1)-1);
+      dnym1 = 1.0 / (double)(grid_points(2)-1);
+      dnzm1 = 1.0 / (double)(grid_points(3)-1);
+
+//---------------------------------------------------------------------
+//     products of the c constants
+//---------------------------------------------------------------------
+      c1c2  = c1 * c2;
+      c1c5  = c1 * c5;
+      c3c4  = c3 * c4;
+      c1345 = c1c5 * c3c4;
+
+      conz1 = 1.0 - c1c5;
+
+//---------------------------------------------------------------------
+//     t?1 = 1/d^2, t?2 = 1/(2d), t?3 = 1/d with d = dn?m1
+//---------------------------------------------------------------------
+      tx1 = 1.0 / (dnxm1 * dnxm1);
+      tx2 = 1.0 / (2.0 * dnxm1);
+      tx3 = 1.0 / dnxm1;
+
+      ty1 = 1.0 / (dnym1 * dnym1);
+      ty2 = 1.0 / (2.0 * dnym1);
+      ty3 = 1.0 / dnym1;
+
+      tz1 = 1.0 / (dnzm1 * dnzm1);
+      tz2 = 1.0 / (2.0 * dnzm1);
+      tz3 = 1.0 / dnzm1;
+
+//---------------------------------------------------------------------
+//     d?1..d?5 coefficients and their maxima
+//---------------------------------------------------------------------
+      dx1 = 0.75;
+      dx2 = 0.75;
+      dx3 = 0.75;
+      dx4 = 0.75;
+      dx5 = 0.75;
+
+      dy1 = 0.75;
+      dy2 = 0.75;
+      dy3 = 0.75;
+      dy4 = 0.75;
+      dy5 = 0.75;
+
+      dz1 = 1.0;
+      dz2 = 1.0;
+      dz3 = 1.0;
+      dz4 = 1.0;
+      dz5 = 1.0;
+
+      dxmax = dmax1(dx3, dx4);
+      dymax = dmax1(dy2, dy4);
+      dzmax = dmax1(dz2, dz3);
+
+      dssp = 0.25 * dmax1(dx1, dmax1(dy1, dz1) );
+
+      c4dssp = 4.0 * dssp;
+      c5dssp = 5.0 * dssp;
+
+//---------------------------------------------------------------------
+//     quantities scaled by dt
+//---------------------------------------------------------------------
+      dttx1 = dt*tx1;
+      dttx2 = dt*tx2;
+      dtty1 = dt*ty1;
+      dtty2 = dt*ty2;
+      dttz1 = dt*tz1;
+      dttz2 = dt*tz2;
+
+      c2dttx1 = 2.0*dttx1;
+      c2dtty1 = 2.0*dtty1;
+      c2dttz1 = 2.0*dttz1;
+
+      dtdssp = dt*dssp;
+
+      comz1 = dtdssp;
+      comz4 = 4.0*dtdssp;
+      comz5 = 5.0*dtdssp;
+      comz6 = 6.0*dtdssp;
+
+//---------------------------------------------------------------------
+//     products with t?3 and t?1
+//---------------------------------------------------------------------
+      c3c4tx3 = c3c4*tx3;
+      c3c4ty3 = c3c4*ty3;
+      c3c4tz3 = c3c4*tz3;
+
+      dx1tx1 = dx1*tx1;
+      dx2tx1 = dx2*tx1;
+      dx3tx1 = dx3*tx1;
+      dx4tx1 = dx4*tx1;
+      dx5tx1 = dx5*tx1;
+
+      dy1ty1 = dy1*ty1;
+      dy2ty1 = dy2*ty1;
+      dy3ty1 = dy3*ty1;
+      dy4ty1 = dy4*ty1;
+      dy5ty1 = dy5*ty1;
+
+      dz1tz1 = dz1*tz1;
+      dz2tz1 = dz2*tz1;
+      dz3tz1 = dz3*tz1;
+      dz4tz1 = dz4*tz1;
+      dz5tz1 = dz5*tz1;
+
+      c2iv  = 2.5;
+      con43 = 4.0/3.0;
+      con16 = 1.0/6.0;
+
+//---------------------------------------------------------------------
+//     ??con1..5: c3c4t?3 times { con43, 1, conz1, con16, c1c5 } times
+//     t?3, one set per direction
+//---------------------------------------------------------------------
+      xxcon1 = c3c4tx3*con43*tx3;
+      xxcon2 = c3c4tx3*tx3;
+      xxcon3 = c3c4tx3*conz1*tx3;
+      xxcon4 = c3c4tx3*con16*tx3;
+      xxcon5 = c3c4tx3*c1c5*tx3;
+
+      yycon1 = c3c4ty3*con43*ty3;
+      yycon2 = c3c4ty3*ty3;
+      yycon3 = c3c4ty3*conz1*ty3;
+      yycon4 = c3c4ty3*con16*ty3;
+      yycon5 = c3c4ty3*c1c5*ty3;
+
+      zzcon1 = c3c4tz3*con43*tz3;
+      zzcon2 = c3c4tz3*tz3;
+      zzcon3 = c3c4tz3*conz1*tz3;
+      zzcon4 = c3c4tz3*con16*tz3;
+      zzcon5 = c3c4tz3*c1c5*tz3;
+
+      return;
+}

+ 60 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/setup_mpi.c.svn-base

@@ -0,0 +1,60 @@
+
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include <math.h>
+#include "mpinpb.h"
+#include "npbparams.h"
+#include "RCCE.h"
+
+int setup_mpi(int *argc, char **argv[]) {
+
+//---------------------------------------------------------------------
+//     Initializes RCCE and derives the number of active nodes.
+//
+//     argc, argv : forwarded unchanged to RCCE_init.
+//     returns    : the non-zero status of RCCE_init if it fails,
+//                  otherwise 0.
+//
+//     On success total_nodes, node, no_nodes and root are assigned.
+//---------------------------------------------------------------------
+
+//---------------------------------------------------------------------
+// set up RCCE
+//---------------------------------------------------------------------
+      int error, nc;
+
+      error = RCCE_init(argc, argv);
+      if (error != 0) return(error);
+
+      total_nodes = RCCE_num_ues();
+      node = RCCE_ue();
+
+//---------------------------------------------------------------------
+//     compute square root; add small number to allow for roundoff
+//---------------------------------------------------------------------
+      nc = (int)(sqrt((double)(total_nodes) + 0.00001e0));
+
+//---------------------------------------------------------------------
+// We handle a non-square number of nodes by making the excess nodes
+// inactive. However, we can never handle more cells than were compiled
+// in.
+//---------------------------------------------------------------------
+
+      if (nc > MAXCELLS) nc = MAXCELLS;
+      no_nodes = nc*nc;
+
+//---------------------------------------------------------------------
+//     let node 0 be the root for the group (there is only one)
+//---------------------------------------------------------------------
+      root = 0;
+
+      return(0);
+}
+

+ 647 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/solve_subs.c.svn-base

@@ -0,0 +1,647 @@
+
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+//     1-based accessors for the flat arrays passed to the routines in
+//     this file.  A two-index accessor maps (m,n) to element
+//     (m-1)+5*(n-1), i.e. the first index varies fastest; a one-index
+//     accessor maps m to element m-1.  Parameters are parenthesized so
+//     that any expression may be used as an index.
+//---------------------------------------------------------------------
+#define ablock(m,n) ablock[((m)-1)+5*((n)-1)]
+#define bblock(m,n) bblock[((m)-1)+5*((n)-1)]
+#define cblock(m,n) cblock[((m)-1)+5*((n)-1)]
+#define avec(m) avec[(m)-1]
+#define bvec(m) bvec[(m)-1]
+#define lhs(m,n) lhs[((m)-1)+5*((n)-1)]
+#define c(m,n) c[((m)-1)+5*((n)-1)]
+#define r(m) r[(m)-1]
+
+void matvec_sub(double ablock[],double avec[],double bvec[]) {
+
+//---------------------------------------------------------------------
+//     bvec = bvec - ablock*avec
+//
+//     ablock : 5x5 block, read through the ablock(m,n) accessor
+//     avec   : 5-element vector, read only
+//     bvec   : 5-element vector, updated in place
+//
+//     Entries of bvec are updated in increasing index order, each one
+//     with a single five-term expression.
+//---------------------------------------------------------------------
+      int row;
+
+      for (row = 1; row <= 5; row++) {
+         bvec(row) = bvec(row) - ablock(row,1)*avec(1)
+                               - ablock(row,2)*avec(2)
+                               - ablock(row,3)*avec(3)
+                               - ablock(row,4)*avec(4)
+                               - ablock(row,5)*avec(5);
+      }
+
+      return;
+}
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+void matmul_sub(double ablock[], double bblock[], double cblock[]) {
+
+//---------------------------------------------------------------------
+//     cblock = cblock - ablock*bblock
+//
+//     ablock, bblock : 5x5 blocks, read only
+//     cblock         : 5x5 block, updated in place
+//
+//     All three are addressed through their (m,n) accessors.  cblock
+//     is updated one column at a time, first index running 1..5
+//     within each column, each entry with a single five-term
+//     expression.
+//---------------------------------------------------------------------
+      int row, col;
+
+      for (col = 1; col <= 5; col++) {
+         for (row = 1; row <= 5; row++) {
+            cblock(row,col) = cblock(row,col)
+                                 - ablock(row,1)*bblock(1,col)
+                                 - ablock(row,2)*bblock(2,col)
+                                 - ablock(row,3)*bblock(3,col)
+                                 - ablock(row,4)*bblock(4,col)
+                                 - ablock(row,5)*bblock(5,col);
+         }
+      }
+
+      return;
+}
+
+
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+void binvcrhs( double lhs[],double c[],double r[] ) {
+
+//---------------------------------------------------------------------
+//     Gauss-Jordan elimination, without any pivot search, on the 5x5
+//     block lhs.  Every row operation is applied to the 5x5 block c
+//     and to the 5-vector r as well.
+//
+//     lhs : 5x5 block; entries to the right of each pivot column are
+//           overwritten, the pivot columns themselves are not written
+//     c   : 5x5 block, updated in place
+//     r   : 5-vector, updated in place
+//
+//     No check is made for a zero diagonal entry.
+//---------------------------------------------------------------------
+
+      double pivot, coeff;
+      int p, q, n;
+
+      for (p = 1; p <= 5; p++) {
+
+//---------------------------------------------------------------------
+//     scale row p by the reciprocal of its diagonal entry
+//---------------------------------------------------------------------
+         pivot = 1.00e0/lhs(p,p);
+         for (n = p+1; n <= 5; n++) {
+            lhs(p,n) = lhs(p,n)*pivot;
+         }
+         for (n = 1; n <= 5; n++) {
+            c(p,n) = c(p,n)*pivot;
+         }
+         r(p) = r(p)*pivot;
+
+//---------------------------------------------------------------------
+//     subtract coeff * (row p) from every other row q, in increasing q
+//---------------------------------------------------------------------
+         for (q = 1; q <= 5; q++) {
+            if (q == p) continue;
+
+            coeff = lhs(q,p);
+            for (n = p+1; n <= 5; n++) {
+               lhs(q,n) = lhs(q,n) - coeff*lhs(p,n);
+            }
+            for (n = 1; n <= 5; n++) {
+               c(q,n) = c(q,n) - coeff*c(p,n);
+            }
+            r(q) = r(q) - coeff*r(p);
+         }
+      }
+
+      return;
+}
+
+
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+void binvrhs( double lhs[],double r[] ) {
+
+//---------------------------------------------------------------------
+//     Gauss-Jordan elimination, without any pivot search, on the 5x5
+//     block lhs, with every row operation applied to the 5-vector r.
+//
+//     lhs : 5x5 block; entries to the right of each pivot column are
+//           overwritten, the pivot columns themselves are not written
+//     r   : 5-vector, updated in place
+//
+//     No check is made for a zero diagonal entry.
+//---------------------------------------------------------------------
+
+      double pivot, coeff;
+      int p, q, n;
+
+      for (p = 1; p <= 5; p++) {
+
+//---------------------------------------------------------------------
+//     scale row p by the reciprocal of its diagonal entry
+//---------------------------------------------------------------------
+         pivot = 1.00e0/lhs(p,p);
+         for (n = p+1; n <= 5; n++) {
+            lhs(p,n) = lhs(p,n)*pivot;
+         }
+         r(p) = r(p)*pivot;
+
+//---------------------------------------------------------------------
+//     subtract coeff * (row p) from every other row q, in increasing q
+//---------------------------------------------------------------------
+         for (q = 1; q <= 5; q++) {
+            if (q == p) continue;
+
+            coeff = lhs(q,p);
+            for (n = p+1; n <= 5; n++) {
+               lhs(q,n) = lhs(q,n) - coeff*lhs(p,n);
+            }
+            r(q) = r(q) - coeff*r(p);
+         }
+      }
+
+      return;
+}
+
+
+

+ 59 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/timers.c.svn-base

@@ -0,0 +1,59 @@
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+#include "RCCE.h" 
+#ifdef _OPENMP
+#include "omp.h"
+#endif
+#include "timers.h"
+#define elapsed(n) elapsed[n-1]   // 1-based view of elapsed[] (declared in timers.h): timer n -> elapsed[n-1]
+#define start_time(n)   start_time[n-1]   // 1-based view of start_time[]: timer n -> start_time[n-1]
+      
+void timer_clear(int np){
+      // Reset the accumulated time of timer np to zero (np is 1-based; no range check is done).
+      int n = np;
+      elapsed(n) = 0.0;   // expands to elapsed[n-1]
+      return;
+}
+
+
+void timer_start(int np) {
+      // Start timer np (1-based): remember the current RCCE_wtime() reading.
+      int n = np;
+
+      start_time(n) = RCCE_wtime();   // overwrites any earlier start stamp of this timer
+
+      return;
+}
+
+void timer_stop(int np) {
+      // Stop timer np (1-based): add the time since its start stamp to its running total.
+      int n = np;
+
+      double t, now;
+      now = RCCE_wtime();
+      t = now - start_time(n);        // interval since start_time(n) was last written by timer_start()
+      elapsed(n) = elapsed(n) + t;    // accumulates, so repeated start/stop pairs add up
+
+      return;
+}
+
+
+double timer_read(int np) {
+      // Return the total accumulated so far by timer np (1-based), in the units of RCCE_wtime().
+      int n = np;      
+      return( elapsed(n));
+}
+

+ 4 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/timers.h.svn-base

@@ -0,0 +1,4 @@
+double start_time[64], elapsed[64];   // per-timer start stamp and accumulated time; room for 64 timers
+#ifdef _OPENMP   // under OpenMP, threadprivate gives each thread its own copy of both arrays
+#pragma omp threadprivate (start_time, elapsed)
+#endif   // NOTE(review): no include guard and no `extern` - each including file gets a tentative definition; confirm the build relies on that

+ 380 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/verify.c.svn-base

@@ -0,0 +1,380 @@
+
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include <stdio.h>
+#include "header.h"
+#include "mpinpb.h"
+
+#define FABS(x) ((x)>0 ? (x) : -(x))   // absolute value; x is evaluated twice, so pass side-effect-free expressions
+void verify(int no_time_steps, char *class_r, int *verified_r) {
+//---------------------------------------------------------------------
+//  verification routine
+//  Computes the solution-error norms and the residual norms (the latter
+//  divided by dt); the root node then compares them with stored references.
+//  no_time_steps: with grid_points(1..3), selects the reference data set
+//  class_r:    out, class letter S/W/A/B/C/D/E, or U (no match, or DT differs)
+//  verified_r: out, 1 if the class is known and all ten norms pass, else 0
+//---------------------------------------------------------------------
+        double xcrref[5],xceref[5],xcrdif[5],xcedif[5], 
+               epsilon, xce[5], xcr[5], dtref;
+        int m;
+        char class;
+        int verified;
+#define xcrref(m) xcrref[m-1]   // 1-based accessors for the local arrays declared above
+#define xceref(m) xceref[m-1]
+#define xcrdif(m) xcrdif[m-1]
+#define xcedif(m) xcedif[m-1]
+#define xce(m) xce[m-1]
+#define xcr(m) xcr[m-1]
+
+//---------------------------------------------------------------------
+//   tolerance level: bound on each relative difference and on |dt-dtref|
+//---------------------------------------------------------------------
+        epsilon = 1.0e-08;
+        verified = 1;
+
+//---------------------------------------------------------------------
+//   compute the error norm and the residual norm (done on every node),
+//   and exit if not printing, i.e. if this node is not root
+//---------------------------------------------------------------------
+
+        error_norm(xce);
+
+        copy_faces();
+
+        rhs_norm(xcr);
+
+        for (m = 1; m <= 5; m++) {
+           xcr(m) = xcr(m) / dt;   // residual norms are compared after scaling by 1/dt
+        }
+        // only root continues; other nodes return with *class_r and *verified_r unwritten
+        if (node != root) return;
+
+        class = 'U';
+        // references default to 1.0 so the divisions further down stay defined for class 'U'
+        for (m = 1; m <= 5; m++) {
+           xcrref(m) = 1.0;
+           xceref(m) = 1.0;
+        }
+
+//---------------------------------------------------------------------
+//    reference data for 12X12X12 grids after 60 time steps, with DT = 1.0e-02
+//---------------------------------------------------------------------
+        if ( (grid_points(1)  == 12     ) && 
+             (grid_points(2)  == 12     ) &&
+             (grid_points(3)  == 12     ) &&
+             (no_time_steps   == 60    )) {
+
+           class = 'S';
+           dtref = 1.0e-2;
+
+//---------------------------------------------------------------------
+//  Reference values of RMS-norms of residual.
+//---------------------------------------------------------------------
+         xcrref(1) = 1.7034283709541311e-01;
+         xcrref(2) = 1.2975252070034097e-02;
+         xcrref(3) = 3.2527926989486055e-02;
+         xcrref(4) = 2.6436421275166801e-02;
+         xcrref(5) = 1.9211784131744430e-01;
+
+//---------------------------------------------------------------------
+//  Reference values of RMS-norms of solution error.
+//---------------------------------------------------------------------
+
+           xceref(1) = 4.9976913345811579e-04;
+           xceref(2) = 4.5195666782961927e-05;
+           xceref(3) = 7.3973765172921357e-05;
+           xceref(4) = 7.3821238632439731e-05;
+           xceref(5) = 8.9269630987491446e-04;
+
+//---------------------------------------------------------------------
+//    reference data for 24X24X24 grids after 200 time steps, with DT = 0.8e-3
+//---------------------------------------------------------------------
+        } else if ( (grid_points(1) == 24) && 
+                 (grid_points(2) == 24) &&
+                 (grid_points(3) == 24) &&
+                 (no_time_steps == 200) ) {
+
+           class = 'W';
+           dtref = 0.8e-3;
+//---------------------------------------------------------------------
+//  Reference values of RMS-norms of residual.
+//---------------------------------------------------------------------
+           xcrref(1) = 0.1125590409344e+03;
+           xcrref(2) = 0.1180007595731e+02;
+           xcrref(3) = 0.2710329767846e+02;
+           xcrref(4) = 0.2469174937669e+02;
+           xcrref(5) = 0.2638427874317e+03;
+
+//---------------------------------------------------------------------
+//  Reference values of RMS-norms of solution error.
+//---------------------------------------------------------------------
+
+           xceref(1) = 0.4419655736008e+01;
+           xceref(2) = 0.4638531260002e+00;
+           xceref(3) = 0.1011551749967e+01;
+           xceref(4) = 0.9235878729944e+00;
+           xceref(5) = 0.1018045837718e+02;
+
+//---------------------------------------------------------------------
+//    reference data for 64X64X64 grids after 200 time steps, with DT = 0.8e-3
+//---------------------------------------------------------------------
+        } else if ( (grid_points(1) == 64) && 
+                 (grid_points(2) == 64) &&
+                 (grid_points(3) == 64) &&
+                 (no_time_steps == 200) ) {
+
+           class = 'A';
+           dtref = 0.8e-3;
+//---------------------------------------------------------------------
+//  Reference values of RMS-norms of residual.
+//---------------------------------------------------------------------
+         xcrref(1) = 1.0806346714637264e+02;
+         xcrref(2) = 1.1319730901220813e+01;
+         xcrref(3) = 2.5974354511582465e+01;
+         xcrref(4) = 2.3665622544678910e+01;
+         xcrref(5) = 2.5278963211748344e+02;
+
+//---------------------------------------------------------------------
+//  Reference values of RMS-norms of solution error.
+//---------------------------------------------------------------------
+
+           xceref(1) = 4.2348416040525025e+00;
+           xceref(2) = 4.4390282496995698e-01;
+           xceref(3) = 9.6692480136345650e-01;
+           xceref(4) = 8.8302063039765474e-01;
+           xceref(5) = 9.7379901770829278e+00;
+
+//---------------------------------------------------------------------
+//    reference data for 102X102X102 grids after 200 time steps,
+//    with DT = 3.0e-04
+//---------------------------------------------------------------------
+        } else if ( (grid_points(1) == 102) && 
+                 (grid_points(2) == 102) &&
+                 (grid_points(3) == 102) &&
+                 (no_time_steps == 200) ) {
+
+           class = 'B';
+           dtref = 3.0e-4;
+
+//---------------------------------------------------------------------
+//  Reference values of RMS-norms of residual.
+//---------------------------------------------------------------------
+         xcrref(1) = 1.4233597229287254e+03;
+         xcrref(2) = 9.9330522590150238e+01;
+         xcrref(3) = 3.5646025644535285e+02;
+         xcrref(4) = 3.2485447959084092e+02;
+         xcrref(5) = 3.2707541254659363e+03;
+
+//---------------------------------------------------------------------
+//  Reference values of RMS-norms of solution error.
+//---------------------------------------------------------------------
+
+           xceref(1) = 5.2969847140936856e+01;
+           xceref(2) = 4.4632896115670668e+00;
+           xceref(3) = 1.3122573342210174e+01;
+           xceref(4) = 1.2006925323559144e+01;
+           xceref(5) = 1.2459576151035986e+02;
+
+//---------------------------------------------------------------------
+//    reference data for 162X162X162 grids after 200 time steps,
+//    with DT = 1.0e-04
+//---------------------------------------------------------------------
+        } else if ( (grid_points(1) == 162) && 
+                 (grid_points(2) == 162) &&
+                 (grid_points(3) == 162) &&
+                 (no_time_steps == 200) ) {
+
+           class = 'C';
+           dtref = 1.0e-4;
+
+//---------------------------------------------------------------------
+//  Reference values of RMS-norms of residual.
+//---------------------------------------------------------------------
+         xcrref(1) = 0.62398116551764615e+04;
+         xcrref(2) = 0.50793239190423964e+03;
+         xcrref(3) = 0.15423530093013596e+04;
+         xcrref(4) = 0.13302387929291190e+04;
+         xcrref(5) = 0.11604087428436455e+05;
+
+//---------------------------------------------------------------------
+//  Reference values of RMS-norms of solution error.
+//---------------------------------------------------------------------
+
+           xceref(1) = 0.16462008369091265e+03;
+           xceref(2) = 0.11497107903824313e+02;
+           xceref(3) = 0.41207446207461508e+02;
+           xceref(4) = 0.37087651059694167e+02;
+           xceref(5) = 0.36211053051841265e+03;
+
+//---------------------------------------------------------------------
+//    reference data for 408x408x408 grids after 250 time steps,
+//    with DT = 0.2e-04
+//---------------------------------------------------------------------
+        } else if ( (grid_points(1) == 408) && 
+                 (grid_points(2) == 408) &&
+                 (grid_points(3) == 408) &&
+                 (no_time_steps == 250) ) {
+
+           class = 'D';
+           dtref = 0.2e-4;
+
+//---------------------------------------------------------------------
+//  Reference values of RMS-norms of residual.
+//---------------------------------------------------------------------
+         xcrref(1) = 0.2533188551738e+05;
+         xcrref(2) = 0.2346393716980e+04;
+         xcrref(3) = 0.6294554366904e+04;
+         xcrref(4) = 0.5352565376030e+04;
+         xcrref(5) = 0.3905864038618e+05;
+
+//---------------------------------------------------------------------
+//  Reference values of RMS-norms of solution error.
+//---------------------------------------------------------------------
+
+           xceref(1) = 0.3100009377557e+03;
+           xceref(2) = 0.2424086324913e+02;
+           xceref(3) = 0.7782212022645e+02;
+           xceref(4) = 0.6835623860116e+02;
+           xceref(5) = 0.6065737200368e+03;
+
+//---------------------------------------------------------------------
+//    reference data for 1020x1020x1020 grids after 250 time steps,
+//    with DT = 0.4e-05
+//---------------------------------------------------------------------
+        } else if ( (grid_points(1) == 1020) && 
+                 (grid_points(2) == 1020) &&
+                 (grid_points(3) == 1020) &&
+                 (no_time_steps == 250) ) {
+
+           class = 'E';
+           dtref = 0.4e-5;
+
+//---------------------------------------------------------------------
+//  Reference values of RMS-norms of residual.
+//---------------------------------------------------------------------
+         xcrref(1) = 0.9795372484517e+05;
+         xcrref(2) = 0.9739814511521e+04;
+         xcrref(3) = 0.2467606342965e+05;
+         xcrref(4) = 0.2092419572860e+05;
+         xcrref(5) = 0.1392138856939e+06;
+
+//---------------------------------------------------------------------
+//  Reference values of RMS-norms of solution error.
+//---------------------------------------------------------------------
+
+           xceref(1) = 0.4327562208414e+03;
+           xceref(2) = 0.3699051964887e+02;
+           xceref(3) = 0.1089845040954e+03;
+           xceref(4) = 0.9462517622043e+02;
+           xceref(5) = 0.7765512765309e+03;
+
+        } else {
+           verified = 0;   // grid size / step count matches no stored case; class stays 'U'
+        }
+
+//---------------------------------------------------------------------
+//    verification test for residuals if gridsize is one of 
+//    the defined grid sizes above (class != 'U')
+//---------------------------------------------------------------------
+
+//---------------------------------------------------------------------
+//    Compute the difference of solution values and the known reference 
+//    values (relative difference; also evaluated, but unused, for class 'U').
+//---------------------------------------------------------------------
+        for (m = 1; m <= 5; m++) {
+           
+           xcrdif(m) = FABS((xcr(m)-xcrref(m))/xcrref(m)) ;
+           xcedif(m) = FABS((xce(m)-xceref(m))/xceref(m));
+           
+        }
+
+//---------------------------------------------------------------------
+//    Output the comparison of computed results to known cases.
+//---------------------------------------------------------------------
+
+        if (class != 'U') {
+           printf(" Verification being performed for class %c\n", class);
+           printf(" accuracy setting for epsilon = %20.13e\n", epsilon);
+           verified = (FABS(dt-dtref) <= epsilon);   // the run must have used the reference time step
+           if (!verified) {
+              class = 'U';   // demote to unknown: norms below are printed without comparison
+              printf(" DT does not match the reference value of %15.8e\n", 
+                       dtref);
+           }
+        } else {
+           printf(" Unknown class\n");
+        }
+
+
+        if (class != 'U') {
+           printf(" Comparison of RMS-norms of residual\n");
+        } else {
+           printf(" RMS-norms of residual\n");
+        }
+
+        for (m = 1; m <= 5; m++) {
+           if (class == 'U') {
+              printf("          %2d %20.13e\n",
+                      m, xcr(m));
+           } else if (xcrdif(m) <= epsilon) {
+              printf("          %2d %20.13e %20.13e %20.13e\n",
+                      m,xcr(m),xcrref(m),xcrdif(m));
+           } else {
+              verified = 0;   // one residual norm out of tolerance fails the whole run
+              printf(" FAILURE: %2d %20.13e %20.13e %20.13e\n",
+                      m,xcr(m),xcrref(m),xcrdif(m));
+           }
+        }
+
+        if (class != 'U') {
+           printf(" Comparison of RMS-norms of solution error\n");
+        } else {
+           printf(" RMS-norms of solution error\n");
+        }
+        
+        for (m = 1; m <= 5; m++) {
+           if (class == 'U') {
+              printf("          %2d %20.13e\n",
+                      m, xce(m));
+           } else if (xcedif(m) <= epsilon) {
+              printf("          %2d %20.13e %20.13e %20.13e\n",
+                      m,xce(m),xceref(m),xcedif(m));
+           } else {
+              verified = 0;   // one solution-error norm out of tolerance fails the whole run
+              printf(" FAILURE: %2d %20.13e %20.13e %20.13e\n",
+                      m,xce(m),xceref(m),xcedif(m));
+           }
+        }
+        
+        
+        if (class == 'U') {
+           printf(" No reference values provided\n");
+           printf(" No verification performed\n");
+        } else if (verified) {
+           printf(" Verification Successful\n");
+        } else {
+           printf(" Verification failed\n");
+        }
+
+        *class_r = class;         // results are handed back on the root node only
+        *verified_r = verified;
+
+        return;
+
+
+}

+ 33 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/work_lhs.h.svn-base

@@ -0,0 +1,33 @@
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+//
+//  work_lhs.h
+//
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+#ifndef __WORK_LHS_H   // NOTE(review): identifiers containing "__" are reserved for the implementation
+#define __WORK_LHS_H
+// Accessors into the flat arrays below: m and n are 1-based (1..5) and m varies fastest.
+#define fjac(m,n,i) fjac[(m-1)+5*((n-1)+5*(i+2))]   // i is shifted by 2: the array holds i = -2 .. MAX_CELL_DIM+1
+#define njac(m,n,i) njac[(m-1)+5*((n-1)+5*(i+2))]   // same layout as fjac
+#define lhsa(m,n,i) lhsa[(m-1)+5*((n-1)+5*(i+1))]   // i is shifted by 1: the array holds i = -1 .. MAX_CELL_DIM
+#define lhsb(m,n,i) lhsb[(m-1)+5*((n-1)+5*(i+1))]   // same layout as lhsa
+// Storage is defined where G_MAIN is set before inclusion (x_solve.c does so) and declared extern otherwise.
+#ifdef G_MAIN
+      double fjac[5*5*(MAX_CELL_DIM+4)],
+                       njac[5*5*(MAX_CELL_DIM+4)],
+                       lhsa[5*5*(MAX_CELL_DIM+2)],
+                       lhsb[5*5*(MAX_CELL_DIM+2)],
+                       tmp1, tmp2, tmp3;
+//      (Fortran-style original kept for reference: common /work_lhs/ fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3)
+#else
+extern double fjac[5*5*(MAX_CELL_DIM+4)],
+                       njac[5*5*(MAX_CELL_DIM+4)],
+                       lhsa[5*5*(MAX_CELL_DIM+2)],
+                       lhsb[5*5*(MAX_CELL_DIM+2)],
+                       tmp1, tmp2, tmp3;
+#endif /*G_MAIN*/
+#ifdef _OPENMP   // under OpenMP, threadprivate gives each thread its own copy of these work variables
+#pragma omp threadprivate (fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3)
+#endif
+#endif

+ 632 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/x_solve.c.svn-base

@@ -0,0 +1,632 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include "header.h"
+#include "mpinpb.h"
+#define G_MAIN
+#include "work_lhs.h"
+#undef G_MAIN
+// Forward declarations of the helpers used by x_solve(); all four are defined later in this file.
+extern void x_sendrecv_solve(int c, int cprev);   // pass the last block/rhs plane of cprev on to cell c
+extern void x_sendrecv_back(int c, int cprev);    // pass plane 0 of rhs in cprev into backsub_info of cell c
+extern void x_backsubstitute(int first, int last, int c);   // back substitution within cell c
+extern void x_solve_cell(int first, int last, int c);       // elimination within cell c
+
+void x_solve() {
+
+//---------------------------------------------------------------------
+//     x_solve: forward sweep over cells slice(1,1..ncells), followed by
+//     a backward sweep over the same cells in reverse order.
+//---------------------------------------------------------------------
+//     Performs line solves in X direction by first factoring
+//     the block-tridiagonal matrix into an upper triangular matrix, 
+//     and then performing back substitution to solve for the unknown
+//     vectors of each line.  
+//     
+//     Make sure we treat elements zero to cell_size in the direction
+//     of the sweep.
+//---------------------------------------------------------------------
+
+      int  c, cprev, stage, first, last, error;   // NOTE(review): error is never used in this function
+
+//---------------------------------------------------------------------
+//     in our terminology stage is the number of the cell in the x-direction
+//     i.e. stage = 1 means the start of the line stage=ncells means end
+//---------------------------------------------------------------------
+      for (stage = 1; stage <= ncells; stage++) {
+         c = slice(1,stage);
+//---------------------------------------------------------------------
+//     set first/last-cell flags
+//---------------------------------------------------------------------
+         first = (stage == 1);
+         last =  (stage == ncells);
+        // every stage after the first first exchanges the boundary data of the preceding cell
+        if (stage >1) {
+           cprev = slice(1,stage-1);
+           x_sendrecv_solve(c, cprev);
+        }
+        x_solve_cell(first,last,c);
+      }
+
+//---------------------------------------------------------------------
+//     now perform backsubstitution in reverse direction
+//---------------------------------------------------------------------
+      for (stage = ncells; stage >= 1; stage--) {
+         c = slice(1,stage);
+         first = (stage == 1);
+         last =  (stage == ncells);
+         // every stage but the first of this sweep exchanges data of the cell handled just before (stage+1)
+         if (stage <ncells) {
+            cprev = slice(1,stage+1);
+            x_sendrecv_back(c, cprev);
+         }
+
+         x_backsubstitute(first,last,c);
+      }
+
+      return;
+}
+      
+      
+void x_sendrecv_solve(int c, int cprev) {
+
+//---------------------------------------------------------------------
+//     pack up and send C'(iend) and rhs'(iend) for all j and k of the
+//     previous cell (cprev); the received data fills plane istart-1 of cell c
+//---------------------------------------------------------------------
+
+      int j,k,m,n,isize,ptr, istart;
+      int phase;
+      int error, buffer_size;   // NOTE(review): error is never used in this function
+
+      isize = cell_size(1,cprev)-1;   // last i index of the previous cell
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+           (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE);   // doubles per message: one block plus one rhs vector per (j,k)
+
+//---------------------------------------------------------------------
+//     pack up buffer: per (j,k) one BLOCK_SIZE x BLOCK_SIZE block of lhsc, then the rhs vector
+//---------------------------------------------------------------------
+      ptr = 0;
+      for (k = 0; k <= KMAX-1; k++) {
+         for (j = 0; j <= JMAX-1; j++) {
+            for (m = 1; m <= BLOCK_SIZE; m++) {
+               for (n = 1; n <= BLOCK_SIZE; n++) {
+                  in_buffer(ptr+n) = lhsc(m,n,isize,j,k,cprev);
+               }
+               ptr = ptr+BLOCK_SIZE;
+            }
+            for (n = 1; n <= BLOCK_SIZE; n++) {
+               in_buffer(ptr+n) = rhs(n,isize,j,k,cprev);
+            }
+            ptr = ptr+BLOCK_SIZE;
+         }
+      }
+
+//---------------------------------------------------------------------
+//     send and receive buffer: in phase p a node sends if send_color[EASTDIR]==p
+//     and receives if recv_color[EASTDIR]==p (presumably staggers the pairs - confirm)
+//---------------------------------------------------------------------
+
+      for (phase = 0; phase < 3; phase++) {
+        // the return values of RCCE_send/RCCE_recv are not checked
+        if (send_color[EASTDIR]==phase) 
+          RCCE_send((char*)in_buffer, buffer_size*sizeof(double), successor(1));
+        if (recv_color[EASTDIR]==phase) 
+          RCCE_recv((char*)out_buffer, buffer_size*sizeof(double), predecessor(1));
+      }
+
+//---------------------------------------------------------------------
+//     unpack buffer in the same order into index istart-1 (= -1) of cell c
+//---------------------------------------------------------------------
+      istart = 0;
+      ptr = 0;
+      for (k = 0; k <= KMAX-1; k++) {
+         for (j = 0; j <= JMAX-1; j++) {
+            for (m = 1; m <= BLOCK_SIZE; m++) {
+               for (n = 1; n <= BLOCK_SIZE; n++) {
+                  lhsc(m,n,istart-1,j,k,c) = out_buffer(ptr+n);
+               }
+               ptr = ptr+BLOCK_SIZE;
+            }
+            for (n = 1; n <= BLOCK_SIZE; n++) {
+               rhs(n,istart-1,j,k,c) = out_buffer(ptr+n);
+            }
+            ptr = ptr+BLOCK_SIZE;
+         }
+      }
+
+      return;
+}
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+void x_sendrecv_back(int c, int cprev) {
+
+//---------------------------------------------------------------------
+//     pack up and send U(istart) for all j and k of cell cprev (the cell
+//     handled in the preceding back-substitution stage); fills backsub_info of c
+//---------------------------------------------------------------------
+      int j,k,n,ptr,istart,jp,kp;   // NOTE(review): jp and kp are never used
+      int phase;
+      int error, buffer_size;       // NOTE(review): error is never used
+
+//---------------------------------------------------------------------
+//     Send element 0 to previous processor
+//---------------------------------------------------------------------
+      istart = 0;
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE;   // doubles per message: one rhs vector per (j,k)
+      ptr = 0;
+      for (k = 0; k <= KMAX-1; k++) {
+         for (j = 0; j <= JMAX-1; j++) {
+            for (n = 1; n <= BLOCK_SIZE; n++) {
+               in_buffer(ptr+n) = rhs(n,istart,j,k,cprev);
+            }
+            ptr = ptr+BLOCK_SIZE;
+         }
+      }
+
+//---------------------------------------------------------------------
+//     send and receive buffer: in phase p a node sends if send_color[WESTDIR]==p
+//     and receives if recv_color[WESTDIR]==p (presumably staggers the pairs - confirm)
+//---------------------------------------------------------------------
+
+      for (phase = 0; phase < 3; phase++) {
+        // the return values of RCCE_send/RCCE_recv are not checked
+        if (send_color[WESTDIR]==phase) 
+          RCCE_send((char*)in_buffer, buffer_size*sizeof(double), predecessor(1));
+        if (recv_color[WESTDIR]==phase) 
+          RCCE_recv((char*)out_buffer, buffer_size*sizeof(double), successor(1));
+      }
+
+//---------------------------------------------------------------------
+//     unpack U(isize) for all j and k into backsub_info of cell c
+//---------------------------------------------------------------------
+
+      ptr = 0;
+      for (k = 0; k <= KMAX-1; k++) {
+         for (j = 0; j <= JMAX-1; j++) {
+            for (n = 1; n <= BLOCK_SIZE; n++) {
+               backsub_info(n,j,k,c) = out_buffer(ptr+n);
+            }
+            ptr = ptr+BLOCK_SIZE;
+         }
+      }
+
+      return;
+}
+      
+void x_backsubstitute(int first, int last, int c) {
+
+//---------------------------------------------------------------------
+//     first, last: nonzero when c is the first / last cell of the line
+//     (first is not used in this routine);  c: cell to back-substitute
+//---------------------------------------------------------------------
+//     back solve: if last cell, then generate U(isize)=rhs(isize)
+//     else assume U(isize) has been loaded into backsub_info by the
+//     unpack step, so just use it
+//     after call u(istart) will be sent to next cell
+//---------------------------------------------------------------------
+
+      int i, j, k;
+      int m,n,isize,jsize,ksize,istart;
+      
+      istart = 0;
+      isize = cell_size(1,c)-1;
+      jsize = cell_size(2,c)-end(2,c)-1      ;
+      ksize = cell_size(3,c)-end(3,c)-1;
+      if (last == 0) {
+         for (k = start(3,c); k <= ksize; k++) {
+            for (j = start(2,c); j <= jsize; j++) {
+//---------------------------------------------------------------------
+//     U(isize) uses info from previous cell if not last cell
+//---------------------------------------------------------------------
+               for (m = 1; m <= BLOCK_SIZE; m++) {
+                  for (n = 1; n <= BLOCK_SIZE; n++) {
+                     rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) 
+                          - lhsc(m,n,isize,j,k,c)*
+                          backsub_info(n,j,k,c);
+                  }
+               }
+            }
+         }
+      }
+      for (k = start(3,c); k <= ksize; k++) {
+         for (j = start(2,c); j <= jsize; j++) {
+            for (i = isize-1; i >= istart; i--) {   // walk i downwards: each plane uses the one just solved at i+1
+               for (m = 1; m <= BLOCK_SIZE; m++) {
+                  for (n = 1; n <= BLOCK_SIZE; n++) {
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
+                          - lhsc(m,n,i,j,k,c)*rhs(n,i+1,j,k,c);
+                  }
+               }
+            }
+         }
+      }
+
+      return;
+}
+
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+void x_solve_cell(int first, int last, int c) {
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+//---------------------------------------------------------------------
+//     performs Gaussian elimination on this cell.
+//     
+//     assumes that unpacking routines for non-first cells 
+//     preload C' and rhs' from previous cell.
+//     
+//     assumed send happens outside this routine, but that
+//     c'(IMAX) and rhs'(IMAX) will be sent to next cell
+//---------------------------------------------------------------------
+
+      int i,j,k,isize,ksize,jsize,istart;
+
+      istart = 0;
+      isize = cell_size(1,c)-1;
+      jsize = cell_size(2,c)-end(2,c)-1;
+      ksize = cell_size(3,c)-end(3,c)-1;
+
+      lhsabinit(lhsa, lhsb, isize);
+
+      for (k = start(3,c); k <= ksize; k++) {
+         for (j = start(2,c); j <= jsize; j++) {
+
+//---------------------------------------------------------------------
+//     This function computes the left hand side in the xi-direction
+//---------------------------------------------------------------------
+
+//---------------------------------------------------------------------
+//     determine a (labeled f) and n jacobians for cell c
+//---------------------------------------------------------------------
+            for (i = start(1,c)-1; i <= cell_size(1,c) - end(1,c); i++) {
+
+               tmp1 = rho_i(i,j,k,c);
+               tmp2 = tmp1 * tmp1;
+               tmp3 = tmp1 * tmp2;
+//---------------------------------------------------------------------
+//     
+//---------------------------------------------------------------------
+               fjac(1,1,i) = 0.0e+00;
+               fjac(1,2,i) = 1.0e+00;
+               fjac(1,3,i) = 0.0e+00;
+               fjac(1,4,i) = 0.0e+00;
+               fjac(1,5,i) = 0.0e+00;
+
+               fjac(2,1,i) = -(u(2,i,j,k,c) * tmp2 * 
+                    u(2,i,j,k,c))
+                    + c2 * qs(i,j,k,c);
+               fjac(2,2,i) = ( 2.0e+00 - c2 )
+                    * ( u(2,i,j,k,c) * tmp1 );
+               fjac(2,3,i) = - c2 * ( u(3,i,j,k,c) * tmp1 );
+               fjac(2,4,i) = - c2 * ( u(4,i,j,k,c) * tmp1 );
+               fjac(2,5,i) = c2;
+
+               fjac(3,1,i) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) * tmp2;
+               fjac(3,2,i) = u(3,i,j,k,c) * tmp1;
+               fjac(3,3,i) = u(2,i,j,k,c) * tmp1;
+               fjac(3,4,i) = 0.0e+00;
+               fjac(3,5,i) = 0.0e+00;
+
+               fjac(4,1,i) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) * tmp2;
+               fjac(4,2,i) = u(4,i,j,k,c) * tmp1;
+               fjac(4,3,i) = 0.0e+00;
+               fjac(4,4,i) = u(2,i,j,k,c) * tmp1;
+               fjac(4,5,i) = 0.0e+00;
+
+               fjac(5,1,i) = ( c2 * 2.0e0 * qs(i,j,k,c)
+                    - c1 * ( u(5,i,j,k,c) * tmp1 ) )
+                    * ( u(2,i,j,k,c) * tmp1 );
+               fjac(5,2,i) = c1 *  u(5,i,j,k,c) * tmp1 
+                    - c2
+                    * ( u(2,i,j,k,c)*u(2,i,j,k,c) * tmp2
+                    + qs(i,j,k,c) );
+               fjac(5,3,i) = - c2 * ( u(3,i,j,k,c)*u(2,i,j,k,c) )
+                    * tmp2;
+               fjac(5,4,i) = - c2 * ( u(4,i,j,k,c)*u(2,i,j,k,c) )
+                    * tmp2;
+               fjac(5,5,i) = c1 * ( u(2,i,j,k,c) * tmp1 );
+
+               njac(1,1,i) = 0.0e+00;
+               njac(1,2,i) = 0.0e+00;
+               njac(1,3,i) = 0.0e+00;
+               njac(1,4,i) = 0.0e+00;
+               njac(1,5,i) = 0.0e+00;
+
+               njac(2,1,i) = - con43 * c3c4 * tmp2 * u(2,i,j,k,c);
+               njac(2,2,i) =   con43 * c3c4 * tmp1;
+               njac(2,3,i) =   0.0e+00;
+               njac(2,4,i) =   0.0e+00;
+               njac(2,5,i) =   0.0e+00;
+
+               njac(3,1,i) = - c3c4 * tmp2 * u(3,i,j,k,c);
+               njac(3,2,i) =   0.0e+00;
+               njac(3,3,i) =   c3c4 * tmp1;
+               njac(3,4,i) =   0.0e+00;
+               njac(3,5,i) =   0.0e+00;
+
+               njac(4,1,i) = - c3c4 * tmp2 * u(4,i,j,k,c);
+               njac(4,2,i) =   0.0e+00 ;
+               njac(4,3,i) =   0.0e+00;
+               njac(4,4,i) =   c3c4 * tmp1;
+               njac(4,5,i) =   0.0e+00;
+
+               njac(5,1,i) = - ( con43 * c3c4
+                    - c1345 ) * tmp3 * SQR(u(2,i,j,k,c))
+                    - ( c3c4 - c1345 ) * tmp3 * SQR(u(3,i,j,k,c))
+                    - ( c3c4 - c1345 ) * tmp3 * SQR(u(4,i,j,k,c))
+                    - c1345 * tmp2 * u(5,i,j,k,c);
+
+               njac(5,2,i) = ( con43 * c3c4
+                    - c1345 ) * tmp2 * u(2,i,j,k,c);
+               njac(5,3,i) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c);
+               njac(5,4,i) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c);
+               njac(5,5,i) = ( c1345 ) * tmp1;
+
+            }
+//---------------------------------------------------------------------
+//     now jacobians set, so form left hand side in x direction
+//---------------------------------------------------------------------
+            for (i = start(1,c); i <= isize - end(1,c); i++) {
+
+               tmp1 = dt * tx1;
+               tmp2 = dt * tx2;
+
+               lhsa(1,1,i) = - tmp2 * fjac(1,1,i-1)
+                    - tmp1 * njac(1,1,i-1)
+                    - tmp1 * dx1 ;
+               lhsa(1,2,i) = - tmp2 * fjac(1,2,i-1)
+                    - tmp1 * njac(1,2,i-1);
+               lhsa(1,3,i) = - tmp2 * fjac(1,3,i-1)
+                    - tmp1 * njac(1,3,i-1);
+               lhsa(1,4,i) = - tmp2 * fjac(1,4,i-1)
+                    - tmp1 * njac(1,4,i-1);
+               lhsa(1,5,i) = - tmp2 * fjac(1,5,i-1)
+                    - tmp1 * njac(1,5,i-1);
+
+               lhsa(2,1,i) = - tmp2 * fjac(2,1,i-1)
+                    - tmp1 * njac(2,1,i-1);
+               lhsa(2,2,i) = - tmp2 * fjac(2,2,i-1)
+                    - tmp1 * njac(2,2,i-1)
+                    - tmp1 * dx2;
+               lhsa(2,3,i) = - tmp2 * fjac(2,3,i-1)
+                    - tmp1 * njac(2,3,i-1);
+               lhsa(2,4,i) = - tmp2 * fjac(2,4,i-1)
+                    - tmp1 * njac(2,4,i-1);
+               lhsa(2,5,i) = - tmp2 * fjac(2,5,i-1)
+                    - tmp1 * njac(2,5,i-1);
+
+               lhsa(3,1,i) = - tmp2 * fjac(3,1,i-1)
+                    - tmp1 * njac(3,1,i-1);
+               lhsa(3,2,i) = - tmp2 * fjac(3,2,i-1)
+                    - tmp1 * njac(3,2,i-1);
+               lhsa(3,3,i) = - tmp2 * fjac(3,3,i-1)
+                    - tmp1 * njac(3,3,i-1)
+                    - tmp1 * dx3 ;
+               lhsa(3,4,i) = - tmp2 * fjac(3,4,i-1)
+                    - tmp1 * njac(3,4,i-1);
+               lhsa(3,5,i) = - tmp2 * fjac(3,5,i-1)
+                    - tmp1 * njac(3,5,i-1);
+
+               lhsa(4,1,i) = - tmp2 * fjac(4,1,i-1)
+                    - tmp1 * njac(4,1,i-1);
+               lhsa(4,2,i) = - tmp2 * fjac(4,2,i-1)
+                    - tmp1 * njac(4,2,i-1);
+               lhsa(4,3,i) = - tmp2 * fjac(4,3,i-1)
+                    - tmp1 * njac(4,3,i-1);
+               lhsa(4,4,i) = - tmp2 * fjac(4,4,i-1)
+                    - tmp1 * njac(4,4,i-1)
+                    - tmp1 * dx4;
+               lhsa(4,5,i) = - tmp2 * fjac(4,5,i-1)
+                    - tmp1 * njac(4,5,i-1);
+
+               lhsa(5,1,i) = - tmp2 * fjac(5,1,i-1)
+                    - tmp1 * njac(5,1,i-1);
+               lhsa(5,2,i) = - tmp2 * fjac(5,2,i-1)
+                    - tmp1 * njac(5,2,i-1);
+               lhsa(5,3,i) = - tmp2 * fjac(5,3,i-1)
+                    - tmp1 * njac(5,3,i-1);
+               lhsa(5,4,i) = - tmp2 * fjac(5,4,i-1)
+                    - tmp1 * njac(5,4,i-1);
+               lhsa(5,5,i) = - tmp2 * fjac(5,5,i-1)
+                    - tmp1 * njac(5,5,i-1)
+                    - tmp1 * dx5;
+
+               lhsb(1,1,i) = 1.0e+00
+                    + tmp1 * 2.0e+00 * njac(1,1,i)
+                    + tmp1 * 2.0e+00 * dx1;
+               lhsb(1,2,i) = tmp1 * 2.0e+00 * njac(1,2,i);
+               lhsb(1,3,i) = tmp1 * 2.0e+00 * njac(1,3,i);
+               lhsb(1,4,i) = tmp1 * 2.0e+00 * njac(1,4,i);
+               lhsb(1,5,i) = tmp1 * 2.0e+00 * njac(1,5,i);
+
+               lhsb(2,1,i) = tmp1 * 2.0e+00 * njac(2,1,i);
+               lhsb(2,2,i) = 1.0e+00
+                    + tmp1 * 2.0e+00 * njac(2,2,i)
+                    + tmp1 * 2.0e+00 * dx2;
+               lhsb(2,3,i) = tmp1 * 2.0e+00 * njac(2,3,i);
+               lhsb(2,4,i) = tmp1 * 2.0e+00 * njac(2,4,i);
+               lhsb(2,5,i) = tmp1 * 2.0e+00 * njac(2,5,i);
+
+               lhsb(3,1,i) = tmp1 * 2.0e+00 * njac(3,1,i);
+               lhsb(3,2,i) = tmp1 * 2.0e+00 * njac(3,2,i);
+               lhsb(3,3,i) = 1.0e+00
+                    + tmp1 * 2.0e+00 * njac(3,3,i)
+                    + tmp1 * 2.0e+00 * dx3;
+               lhsb(3,4,i) = tmp1 * 2.0e+00 * njac(3,4,i);
+               lhsb(3,5,i) = tmp1 * 2.0e+00 * njac(3,5,i);
+
+               lhsb(4,1,i) = tmp1 * 2.0e+00 * njac(4,1,i);
+               lhsb(4,2,i) = tmp1 * 2.0e+00 * njac(4,2,i);
+               lhsb(4,3,i) = tmp1 * 2.0e+00 * njac(4,3,i);
+               lhsb(4,4,i) = 1.0e+00
+                    + tmp1 * 2.0e+00 * njac(4,4,i)
+                    + tmp1 * 2.0e+00 * dx4;
+               lhsb(4,5,i) = tmp1 * 2.0e+00 * njac(4,5,i);
+
+               lhsb(5,1,i) = tmp1 * 2.0e+00 * njac(5,1,i);
+               lhsb(5,2,i) = tmp1 * 2.0e+00 * njac(5,2,i);
+               lhsb(5,3,i) = tmp1 * 2.0e+00 * njac(5,3,i);
+               lhsb(5,4,i) = tmp1 * 2.0e+00 * njac(5,4,i);
+               lhsb(5,5,i) = 1.0e+00
+                    + tmp1 * 2.0e+00 * njac(5,5,i)
+                    + tmp1 * 2.0e+00 * dx5;
+
+               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,i+1)
+                    - tmp1 * njac(1,1,i+1)
+                    - tmp1 * dx1;
+               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,i+1)
+                    - tmp1 * njac(1,2,i+1);
+               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,i+1)
+                    - tmp1 * njac(1,3,i+1);
+               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,i+1)
+                    - tmp1 * njac(1,4,i+1);
+               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,i+1)
+                    - tmp1 * njac(1,5,i+1);
+
+               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,i+1)
+                    - tmp1 * njac(2,1,i+1);
+               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,i+1)
+                    - tmp1 * njac(2,2,i+1)
+                    - tmp1 * dx2;
+               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,i+1)
+                    - tmp1 * njac(2,3,i+1);
+               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,i+1)
+                    - tmp1 * njac(2,4,i+1);
+               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,i+1)
+                    - tmp1 * njac(2,5,i+1);
+
+               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,i+1)
+                    - tmp1 * njac(3,1,i+1);
+               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,i+1)
+                    - tmp1 * njac(3,2,i+1);
+               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,i+1)
+                    - tmp1 * njac(3,3,i+1)
+                    - tmp1 * dx3;
+               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,i+1)
+                    - tmp1 * njac(3,4,i+1);
+               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,i+1)
+                    - tmp1 * njac(3,5,i+1);
+
+               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,i+1)
+                    - tmp1 * njac(4,1,i+1);
+               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,i+1)
+                    - tmp1 * njac(4,2,i+1);
+               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,i+1)
+                    - tmp1 * njac(4,3,i+1);
+               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,i+1)
+                    - tmp1 * njac(4,4,i+1)
+                    - tmp1 * dx4;
+               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,i+1)
+                    - tmp1 * njac(4,5,i+1);
+
+               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,i+1)
+                    - tmp1 * njac(5,1,i+1);
+               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,i+1)
+                    - tmp1 * njac(5,2,i+1);
+               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,i+1)
+                    - tmp1 * njac(5,3,i+1);
+               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,i+1)
+                    - tmp1 * njac(5,4,i+1);
+               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,i+1)
+                    - tmp1 * njac(5,5,i+1)
+                    - tmp1 * dx5;
+
+            }
+
+
+//---------------------------------------------------------------------
+//     outer most do loops - sweeping in i direction
+//---------------------------------------------------------------------
+            if (first == 1) {
+
+//---------------------------------------------------------------------
+//     multiply c(istart,j,k) by b_inverse and copy back to c
+//     multiply rhs(istart) by b_inverse(istart) and copy to rhs
+//---------------------------------------------------------------------
+               binvcrhs( &lhsb(1,1,istart),
+                              &lhsc(1,1,istart,j,k,c),
+                              &rhs(1,istart,j,k,c) );
+
+            }
+
+//---------------------------------------------------------------------
+//     begin inner most do loop
+//     do all the elements of the cell unless last 
+//---------------------------------------------------------------------
+            for (i = istart+first; i <= isize-last; i++) {
+
+//---------------------------------------------------------------------
+//     rhs(i) = rhs(i) - A*rhs(i-1)
+//---------------------------------------------------------------------
+               matvec_sub(&lhsa(1,1,i),
+                               &rhs(1,i-1,j,k,c),&rhs(1,i,j,k,c));
+
+//---------------------------------------------------------------------
+//     B(i) = B(i) - C(i-1)*A(i)
+//---------------------------------------------------------------------
+               matmul_sub(&lhsa(1,1,i),
+                               &lhsc(1,1,i-1,j,k,c),
+                               &lhsb(1,1,i));
+
+
+//---------------------------------------------------------------------
+//     multiply c(i,j,k) by b_inverse and copy back to c
+//     multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs
+//---------------------------------------------------------------------
+               binvcrhs( &lhsb(1,1,i),
+                              &lhsc(1,1,i,j,k,c),
+                              &rhs(1,i,j,k,c) );
+
+            }
+
+//---------------------------------------------------------------------
+//     Now finish up special cases for last cell
+//---------------------------------------------------------------------
+            if (last == 1) {
+
+//---------------------------------------------------------------------
+//     rhs(isize) = rhs(isize) - A*rhs(isize-1)
+//---------------------------------------------------------------------
+               matvec_sub(&lhsa(1,1,isize),
+                               &rhs(1,isize-1,j,k,c),&rhs(1,isize,j,k,c));
+
+//---------------------------------------------------------------------
+//     B(isize) = B(isize) - C(isize-1)*A(isize)
+//---------------------------------------------------------------------
+               matmul_sub(&lhsa(1,1,isize),
+                               &lhsc(1,1,isize-1,j,k,c),
+                               &lhsb(1,1,isize));
+
+//---------------------------------------------------------------------
+//     multiply rhs() by b_inverse() and copy to rhs
+//---------------------------------------------------------------------
+               binvrhs( &lhsb(1,1,isize),
+                             &rhs(1,isize,j,k,c) );
+
+            }
+         }
+      }
+
+
+      return;
+}
+      

+ 646 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/y_solve.c.svn-base

@@ -0,0 +1,646 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include "header.h"
+#include "mpinpb.h"
+#include "work_lhs.h"
+
+//---------------------------------------------------------------------
+//     prototypes of the helpers that y_solve() drives
+//---------------------------------------------------------------------
+// forward-sweep exchange between cell c and the previous stage's cell
+extern void y_sendrecv_solve(int c, int cprev);
+// backward-sweep exchange between cell c and the following stage's cell
+extern void y_sendrecv_back(int c, int cprev);
+// back substitution inside cell c; first/last flag the end stages
+extern void y_backsubstitute(int first, int last, int c);
+// elimination inside cell c; first/last flag the end stages
+extern void y_solve_cell(int first, int last, int c);
+
+void y_solve() {
+
+//---------------------------------------------------------------------
+//     Performs line solves in Y direction by first factoring
+//     the block-tridiagonal matrix into an upper triangular matrix,
+//     and then performing back substitution to solve for the unknown
+//     vectors of each line.
+//
+//     Make sure we treat elements zero to cell_size in the direction
+//     of the sweep.
+//
+//     Takes no arguments and returns nothing.  The ncells stages are
+//     walked twice: forward through y_sendrecv_solve / y_solve_cell,
+//     then backward through y_sendrecv_back / y_backsubstitute.
+//---------------------------------------------------------------------
+
+      int  c, stage, first, last;
+
+//---------------------------------------------------------------------
+//     in our terminology stage is the number of the cell in the
+//     y-direction, i.e. stage = 1 means the start of the line and
+//     stage = ncells means the end
+//---------------------------------------------------------------------
+      for (stage = 1; stage <= ncells; stage++) {
+         c = slice(2,stage);
+//---------------------------------------------------------------------
+//     set first-cell and last-cell flags
+//---------------------------------------------------------------------
+         first = (stage == 1);
+         last =  (stage == ncells);
+
+//---------------------------------------------------------------------
+//     every stage but the first exchanges data before eliminating;
+//     the cell of the preceding stage is handed over as cprev
+//---------------------------------------------------------------------
+         if (stage > 1) {
+            int cprev = slice(2,stage-1);
+            y_sendrecv_solve(c, cprev);
+         }
+         y_solve_cell(first,last,c);
+      }
+
+//---------------------------------------------------------------------
+//     now perform backsubstitution in reverse direction
+//---------------------------------------------------------------------
+      for (stage = ncells; stage >= 1; stage--) {
+         c = slice(2,stage);
+         first = (stage == 1);
+         last =  (stage == ncells);
+
+//---------------------------------------------------------------------
+//     every stage but the last exchanges data before substituting;
+//     here cprev is the cell of the following stage
+//---------------------------------------------------------------------
+         if (stage < ncells) {
+            int cprev = slice(2,stage+1);
+            y_sendrecv_back(c, cprev);
+         }
+
+         y_backsubstitute(first,last,c);
+      }
+
+      return;
+}
+      
+      
+void y_sendrecv_solve(int c, int cprev) {
+
+//---------------------------------------------------------------------
+//     pack up and send C'(jend) and rhs'(jend) for
+//     all i and k, then receive the matching data and unpack it
+//
+//     c     - cell whose j = -1 plane of lhsc and rhs is filled from
+//             the received buffer
+//     cprev - cell whose plane j = cell_size(2,cprev)-1 is packed
+//             into the outgoing buffer
+//---------------------------------------------------------------------
+
+      int i,k,m,n,jsize,ptr,jstart;
+      int phase;
+      int buffer_size;
+
+      jsize = cell_size(2,cprev)-1;
+//---------------------------------------------------------------------
+//     per (i,k) point: one BLOCK_SIZE x BLOCK_SIZE block of lhsc
+//     followed by BLOCK_SIZE entries of rhs
+//---------------------------------------------------------------------
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+           (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE);
+
+//---------------------------------------------------------------------
+//     pack up buffer
+//---------------------------------------------------------------------
+      ptr = 0;
+      for (k = 0; k <= KMAX-1; k++) {
+         for (i = 0; i <= IMAX-1; i++) {
+            for (m = 1; m <= BLOCK_SIZE; m++) {
+               for (n = 1; n <= BLOCK_SIZE; n++) {
+                  in_buffer(ptr+n) = lhsc(m,n,i,jsize,k,cprev);
+               }
+               ptr = ptr+BLOCK_SIZE;
+            }
+            for (n = 1; n <= BLOCK_SIZE; n++) {
+               in_buffer(ptr+n) = rhs(n,i,jsize,k,cprev);
+            }
+            ptr = ptr+BLOCK_SIZE;
+         }
+      }
+
+//---------------------------------------------------------------------
+//     send and receive buffer 
+//     a core sends to successor(2) in the phase named by
+//     send_color[NORTHDIR] and receives from predecessor(2) in the
+//     phase named by recv_color[NORTHDIR]; the full buffer_size is
+//     always transferred
+//     NOTE(review): presumably the colouring keeps paired send/recv
+//     calls from blocking each other - confirm where the colour
+//     arrays are initialised
+//---------------------------------------------------------------------
+
+      for (phase = 0; phase < 3; phase++) {
+
+        if (send_color[NORTHDIR]==phase) 
+          RCCE_send((char*)in_buffer, buffer_size*sizeof(double), successor(2));
+        if (recv_color[NORTHDIR]==phase) 
+          RCCE_recv((char*)out_buffer, buffer_size*sizeof(double), predecessor(2));
+      }
+
+//---------------------------------------------------------------------
+//     unpack buffer
+//     same layout as the packing loop, stored one plane below jstart
+//---------------------------------------------------------------------
+      jstart = 0;
+      ptr = 0;
+      for (k = 0; k <= KMAX-1; k++) {
+         for (i = 0; i <= IMAX-1; i++) {
+            for (m = 1; m <= BLOCK_SIZE; m++) {
+               for (n = 1; n <= BLOCK_SIZE; n++) {
+                  lhsc(m,n,i,jstart-1,k,c) = out_buffer(ptr+n);
+               }
+               ptr = ptr+BLOCK_SIZE;
+            }
+            for (n = 1; n <= BLOCK_SIZE; n++) {
+               rhs(n,i,jstart-1,k,c) = out_buffer(ptr+n);
+            }
+            ptr = ptr+BLOCK_SIZE;
+         }
+      }
+
+      return;
+}
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+void y_sendrecv_back(int c, int cprev) {
+
+//---------------------------------------------------------------------
+//     pack up and send U(jstart) for all i and k, then receive the
+//     matching data and store it in backsub_info
+//
+//     c     - cell whose backsub_info entries are filled from the
+//             received buffer
+//     cprev - cell whose rhs plane j = 0 is packed into the outgoing
+//             buffer
+//---------------------------------------------------------------------
+
+      int i,k,n,ptr,jstart;
+      int phase;
+      int buffer_size;
+
+//---------------------------------------------------------------------
+//     Send element 0 to previous processor
+//     (BLOCK_SIZE entries of rhs per (i,k) point)
+//---------------------------------------------------------------------
+      jstart = 0;
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE;
+      ptr = 0;
+      for (k = 0; k <= KMAX-1; k++) {
+         for (i = 0; i <= IMAX-1; i++) {
+            for (n = 1; n <= BLOCK_SIZE; n++) {
+               in_buffer(ptr+n) = rhs(n,i,jstart,k,cprev);
+            }
+            ptr = ptr+BLOCK_SIZE;
+         }
+      }
+
+//---------------------------------------------------------------------
+//     send and receive buffer 
+//     a core sends to predecessor(2) in the phase named by
+//     send_color[SOUTHDIR] and receives from successor(2) in the
+//     phase named by recv_color[SOUTHDIR]; the full buffer_size is
+//     always transferred
+//     NOTE(review): presumably the colouring keeps paired send/recv
+//     calls from blocking each other - confirm where the colour
+//     arrays are initialised
+//---------------------------------------------------------------------
+
+      for (phase = 0; phase < 3; phase++) {
+
+        if (send_color[SOUTHDIR]==phase) 
+          RCCE_send((char*)in_buffer, buffer_size*sizeof(double), predecessor(2));
+        if (recv_color[SOUTHDIR]==phase) 
+          RCCE_recv((char*)out_buffer, buffer_size*sizeof(double), successor(2));
+      }
+
+//---------------------------------------------------------------------
+//     unpack U(jsize) for all i and k
+//---------------------------------------------------------------------
+
+      ptr = 0;
+      for (k = 0; k <= KMAX-1; k++) {
+         for (i = 0; i <= IMAX-1; i++) {
+            for (n = 1; n <= BLOCK_SIZE; n++) {
+               backsub_info(n,i,k,c) = out_buffer(ptr+n);
+            }
+            ptr = ptr+BLOCK_SIZE;
+         }
+      }
+
+      return;
+}
+
+void y_backsubstitute(int first, int last, int c) {
+
+//---------------------------------------------------------------------
+//     Back solve along y inside cell c.
+//
+//     last  - when zero, the top plane rhs(jsize) is first corrected
+//             with lhsc(jsize) times backsub_info; when non-zero the
+//             top plane is used as it stands
+//     first - not referenced by this routine
+//     c     - index of the cell being processed
+//
+//     Afterwards every plane from jsize-1 down to 0 is updated from
+//     the plane directly above it.
+//---------------------------------------------------------------------
+
+      int i, j, k, row, col;
+      const int i_lo  = start(1,c);
+      const int i_hi  = cell_size(1,c)-end(1,c)-1;
+      const int k_lo  = start(3,c);
+      const int k_hi  = cell_size(3,c)-end(3,c)-1;
+      const int j_top = cell_size(2,c)-1;
+
+//---------------------------------------------------------------------
+//     top plane: fold in the values held in backsub_info unless this
+//     is the last cell
+//---------------------------------------------------------------------
+      if (!last) {
+         for (k = k_lo; k <= k_hi; ++k) {
+            for (i = i_lo; i <= i_hi; ++i) {
+               for (row = 1; row <= BLOCK_SIZE; ++row) {
+                  for (col = 1; col <= BLOCK_SIZE; ++col) {
+                     rhs(row,i,j_top,k,c) -=
+                          lhsc(row,col,i,j_top,k,c)*backsub_info(col,i,k,c);
+                  }
+               }
+            }
+         }
+      }
+
+//---------------------------------------------------------------------
+//     remaining planes, walking downwards: each one subtracts its
+//     lhsc block applied to the plane above (j+1)
+//---------------------------------------------------------------------
+      for (k = k_lo; k <= k_hi; ++k) {
+         for (j = j_top-1; j >= 0; --j) {
+            for (i = i_lo; i <= i_hi; ++i) {
+               for (row = 1; row <= BLOCK_SIZE; ++row) {
+                  for (col = 1; col <= BLOCK_SIZE; ++col) {
+                     rhs(row,i,j,k,c) -=
+                          lhsc(row,col,i,j,k,c)*rhs(col,i,j+1,k,c);
+                  }
+               }
+            }
+         }
+      }
+
+      return;
+}
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+void y_solve_cell(int first,int last,int c) {
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+//---------------------------------------------------------------------
+//     performs guaussian elimination on this cell.
+//     
+//     assumes that unpacking routines for non-first cells 
+//     preload C' and rhs' from previous cell.
+//     
+//     assumed send happens outside this routine, but that
+//     c'(JMAX) and rhs'(JMAX) will be sent to next cell
+//---------------------------------------------------------------------
+
+      int i,j,k,isize,ksize,jsize,jstart;
+      double utmp[6*(JMAX+4)];
+#define utmp(m,i) utmp[(m-1)+6*(i+2)]
+
+      jstart = 0;
+      isize = cell_size(1,c)-end(1,c)-1;
+      jsize = cell_size(2,c)-1;
+      ksize = cell_size(3,c)-end(3,c)-1;
+
+      lhsabinit(lhsa, lhsb, jsize);
+
+      for (k = start(3,c); k <= ksize; k++) {
+         for (i = start(1,c); i <= isize; i++) {
+
+//---------------------------------------------------------------------
+//     This function computes the left hand side for the three y-factors 
+//---------------------------------------------------------------------
+
+//---------------------------------------------------------------------
+//     Compute the indices for storing the tri-diagonal matrix;
+//     determine a (labeled f) and n jacobians for cell c
+//---------------------------------------------------------------------
+            for (j = start(2,c)-1; j <= cell_size(2,c)-end(2,c); j++) {
+               utmp(1,j) = 1.0e0 / u(1,i,j,k,c);
+               utmp(2,j) = u(2,i,j,k,c);
+               utmp(3,j) = u(3,i,j,k,c);
+               utmp(4,j) = u(4,i,j,k,c);
+               utmp(5,j) = u(5,i,j,k,c);
+               utmp(6,j) = qs(i,j,k,c);
+            }
+
+            for (j = start(2,c)-1; j <= cell_size(2,c)-end(2,c); j++) {
+
+               tmp1 = utmp(1,j);
+               tmp2 = tmp1 * tmp1;
+               tmp3 = tmp1 * tmp2;
+
+               fjac(1,1,j) = 0.0e+00;
+               fjac(1,2,j) = 0.0e+00;
+               fjac(1,3,j) = 1.0e+00;
+               fjac(1,4,j) = 0.0e+00;
+               fjac(1,5,j) = 0.0e+00;
+
+               fjac(2,1,j) = - ( utmp(2,j)*utmp(3,j) )
+                    * tmp2;
+               fjac(2,2,j) = utmp(3,j) * tmp1;
+               fjac(2,3,j) = utmp(2,j) * tmp1;
+               fjac(2,4,j) = 0.0e+00;
+               fjac(2,5,j) = 0.0e+00;
+
+               fjac(3,1,j) = - ( utmp(3,j)*utmp(3,j)*tmp2)
+                    + c2 * utmp(6,j);
+               fjac(3,2,j) = - c2 *  utmp(2,j) * tmp1;
+               fjac(3,3,j) = ( 2.0e+00 - c2 )
+                    *  utmp(3,j) * tmp1 ;
+               fjac(3,4,j) = - c2 * utmp(4,j) * tmp1 ;
+               fjac(3,5,j) = c2;
+
+               fjac(4,1,j) = - ( utmp(3,j)*utmp(4,j) )
+                    * tmp2;
+               fjac(4,2,j) = 0.0e+00;
+               fjac(4,3,j) = utmp(4,j) * tmp1;
+               fjac(4,4,j) = utmp(3,j) * tmp1;
+               fjac(4,5,j) = 0.0e+00;
+
+               fjac(5,1,j) = ( c2 * 2.0e0 * utmp(6,j)
+                    - c1 * utmp(5,j) * tmp1 ) 
+                    * utmp(3,j) * tmp1 ;
+               fjac(5,2,j) = - c2 * utmp(2,j)*utmp(3,j) 
+                    * tmp2;
+               fjac(5,3,j) = c1 * utmp(5,j) * tmp1 
+                    - c2 * ( utmp(6,j)
+                    + utmp(3,j)*utmp(3,j) * tmp2 );
+               fjac(5,4,j) = - c2 * ( utmp(3,j)*utmp(4,j) )
+                    * tmp2;
+               fjac(5,5,j) = c1 * utmp(3,j) * tmp1 ;
+
+               njac(1,1,j) = 0.0e+00;
+               njac(1,2,j) = 0.0e+00;
+               njac(1,3,j) = 0.0e+00;
+               njac(1,4,j) = 0.0e+00;
+               njac(1,5,j) = 0.0e+00;
+
+               njac(2,1,j) = - c3c4 * tmp2 * utmp(2,j);
+               njac(2,2,j) =   c3c4 * tmp1;
+               njac(2,3,j) =   0.0e+00;
+               njac(2,4,j) =   0.0e+00;
+               njac(2,5,j) =   0.0e+00;
+
+               njac(3,1,j) = - con43 * c3c4 * tmp2 * utmp(3,j);
+               njac(3,2,j) =   0.0e+00;
+               njac(3,3,j) =   con43 * c3c4 * tmp1;
+               njac(3,4,j) =   0.0e+00;
+               njac(3,5,j) =   0.0e+00;
+
+               njac(4,1,j) = - c3c4 * tmp2 * utmp(4,j);
+               njac(4,2,j) =   0.0e+00;
+               njac(4,3,j) =   0.0e+00;
+               njac(4,4,j) =   c3c4 * tmp1;
+               njac(4,5,j) =   0.0e+00;
+
+               njac(5,1,j) = - (  c3c4
+                    - c1345 ) * tmp3 * SQR(utmp(2,j))
+                    - ( con43 * c3c4
+                    - c1345 ) * tmp3 * SQR(utmp(3,j))
+                    - ( c3c4 - c1345 ) * tmp3 * SQR(utmp(4,j))
+                    - c1345 * tmp2 * utmp(5,j);
+
+               njac(5,2,j) = (  c3c4 - c1345 ) * tmp2 * utmp(2,j);
+               njac(5,3,j) = ( con43 * c3c4
+                    - c1345 ) * tmp2 * utmp(3,j);
+               njac(5,4,j) = ( c3c4 - c1345 ) * tmp2 * utmp(4,j);
+               njac(5,5,j) = ( c1345 ) * tmp1;
+
+            }
+
+//---------------------------------------------------------------------
+//     now joacobians set, so form left hand side in y direction
+//---------------------------------------------------------------------
+            for (j = start(2,c); j <= jsize-end(2,c); j++) {
+
+               tmp1 = dt * ty1;
+               tmp2 = dt * ty2;
+
+               lhsa(1,1,j) = - tmp2 * fjac(1,1,j-1)
+                    - tmp1 * njac(1,1,j-1)
+                    - tmp1 * dy1 ;
+               lhsa(1,2,j) = - tmp2 * fjac(1,2,j-1)
+                    - tmp1 * njac(1,2,j-1);
+               lhsa(1,3,j) = - tmp2 * fjac(1,3,j-1)
+                    - tmp1 * njac(1,3,j-1);
+               lhsa(1,4,j) = - tmp2 * fjac(1,4,j-1)
+                    - tmp1 * njac(1,4,j-1);
+               lhsa(1,5,j) = - tmp2 * fjac(1,5,j-1)
+                    - tmp1 * njac(1,5,j-1);
+
+               lhsa(2,1,j) = - tmp2 * fjac(2,1,j-1)
+                    - tmp1 * njac(2,1,j-1);
+               lhsa(2,2,j) = - tmp2 * fjac(2,2,j-1)
+                    - tmp1 * njac(2,2,j-1)
+                    - tmp1 * dy2;
+               lhsa(2,3,j) = - tmp2 * fjac(2,3,j-1)
+                    - tmp1 * njac(2,3,j-1);
+               lhsa(2,4,j) = - tmp2 * fjac(2,4,j-1)
+                    - tmp1 * njac(2,4,j-1);
+               lhsa(2,5,j) = - tmp2 * fjac(2,5,j-1)
+                    - tmp1 * njac(2,5,j-1);
+
+               lhsa(3,1,j) = - tmp2 * fjac(3,1,j-1)
+                    - tmp1 * njac(3,1,j-1);
+               lhsa(3,2,j) = - tmp2 * fjac(3,2,j-1)
+                    - tmp1 * njac(3,2,j-1);
+               lhsa(3,3,j) = - tmp2 * fjac(3,3,j-1)
+                    - tmp1 * njac(3,3,j-1)
+                    - tmp1 * dy3 ;
+               lhsa(3,4,j) = - tmp2 * fjac(3,4,j-1)
+                    - tmp1 * njac(3,4,j-1);
+               lhsa(3,5,j) = - tmp2 * fjac(3,5,j-1)
+                    - tmp1 * njac(3,5,j-1);
+
+               lhsa(4,1,j) = - tmp2 * fjac(4,1,j-1)
+                    - tmp1 * njac(4,1,j-1);
+               lhsa(4,2,j) = - tmp2 * fjac(4,2,j-1)
+                    - tmp1 * njac(4,2,j-1);
+               lhsa(4,3,j) = - tmp2 * fjac(4,3,j-1)
+                    - tmp1 * njac(4,3,j-1);
+               lhsa(4,4,j) = - tmp2 * fjac(4,4,j-1)
+                    - tmp1 * njac(4,4,j-1)
+                    - tmp1 * dy4;
+               lhsa(4,5,j) = - tmp2 * fjac(4,5,j-1)
+                    - tmp1 * njac(4,5,j-1);
+
+               lhsa(5,1,j) = - tmp2 * fjac(5,1,j-1)
+                    - tmp1 * njac(5,1,j-1);
+               lhsa(5,2,j) = - tmp2 * fjac(5,2,j-1)
+                    - tmp1 * njac(5,2,j-1);
+               lhsa(5,3,j) = - tmp2 * fjac(5,3,j-1)
+                    - tmp1 * njac(5,3,j-1);
+               lhsa(5,4,j) = - tmp2 * fjac(5,4,j-1)
+                    - tmp1 * njac(5,4,j-1);
+               lhsa(5,5,j) = - tmp2 * fjac(5,5,j-1)
+                    - tmp1 * njac(5,5,j-1)
+                    - tmp1 * dy5;
+
+               lhsb(1,1,j) = 1.0e+00
+                    + tmp1 * 2.0e+00 * njac(1,1,j)
+                    + tmp1 * 2.0e+00 * dy1;
+               lhsb(1,2,j) = tmp1 * 2.0e+00 * njac(1,2,j);
+               lhsb(1,3,j) = tmp1 * 2.0e+00 * njac(1,3,j);
+               lhsb(1,4,j) = tmp1 * 2.0e+00 * njac(1,4,j);
+               lhsb(1,5,j) = tmp1 * 2.0e+00 * njac(1,5,j);
+
+               lhsb(2,1,j) = tmp1 * 2.0e+00 * njac(2,1,j);
+               lhsb(2,2,j) = 1.0e+00
+                    + tmp1 * 2.0e+00 * njac(2,2,j)
+                    + tmp1 * 2.0e+00 * dy2;
+               lhsb(2,3,j) = tmp1 * 2.0e+00 * njac(2,3,j);
+               lhsb(2,4,j) = tmp1 * 2.0e+00 * njac(2,4,j);
+               lhsb(2,5,j) = tmp1 * 2.0e+00 * njac(2,5,j);
+
+               lhsb(3,1,j) = tmp1 * 2.0e+00 * njac(3,1,j);
+               lhsb(3,2,j) = tmp1 * 2.0e+00 * njac(3,2,j);
+               lhsb(3,3,j) = 1.0e+00
+                    + tmp1 * 2.0e+00 * njac(3,3,j)
+                    + tmp1 * 2.0e+00 * dy3;
+               lhsb(3,4,j) = tmp1 * 2.0e+00 * njac(3,4,j);
+               lhsb(3,5,j) = tmp1 * 2.0e+00 * njac(3,5,j);
+
+               lhsb(4,1,j) = tmp1 * 2.0e+00 * njac(4,1,j);
+               lhsb(4,2,j) = tmp1 * 2.0e+00 * njac(4,2,j);
+               lhsb(4,3,j) = tmp1 * 2.0e+00 * njac(4,3,j);
+               lhsb(4,4,j) = 1.0e+00
+                    + tmp1 * 2.0e+00 * njac(4,4,j)
+                    + tmp1 * 2.0e+00 * dy4;
+               lhsb(4,5,j) = tmp1 * 2.0e+00 * njac(4,5,j);
+
+               lhsb(5,1,j) = tmp1 * 2.0e+00 * njac(5,1,j);
+               lhsb(5,2,j) = tmp1 * 2.0e+00 * njac(5,2,j);
+               lhsb(5,3,j) = tmp1 * 2.0e+00 * njac(5,3,j);
+               lhsb(5,4,j) = tmp1 * 2.0e+00 * njac(5,4,j);
+               lhsb(5,5,j) = 1.0e+00
+                    + tmp1 * 2.0e+00 * njac(5,5,j) 
+                    + tmp1 * 2.0e+00 * dy5;
+
+               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,j+1)
+                    - tmp1 * njac(1,1,j+1)
+                    - tmp1 * dy1;
+               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,j+1)
+                    - tmp1 * njac(1,2,j+1);
+               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,j+1)
+                    - tmp1 * njac(1,3,j+1);
+               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,j+1)
+                    - tmp1 * njac(1,4,j+1);
+               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,j+1)
+                    - tmp1 * njac(1,5,j+1);
+
+               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,j+1)
+                    - tmp1 * njac(2,1,j+1);
+               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,j+1)
+                    - tmp1 * njac(2,2,j+1)
+                    - tmp1 * dy2;
+               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,j+1)
+                    - tmp1 * njac(2,3,j+1);
+               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,j+1)
+                    - tmp1 * njac(2,4,j+1);
+               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,j+1)
+                    - tmp1 * njac(2,5,j+1);
+
+               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,j+1)
+                    - tmp1 * njac(3,1,j+1);
+               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,j+1)
+                    - tmp1 * njac(3,2,j+1);
+               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,j+1)
+                    - tmp1 * njac(3,3,j+1)
+                    - tmp1 * dy3;
+               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,j+1)
+                    - tmp1 * njac(3,4,j+1);
+               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,j+1)
+                    - tmp1 * njac(3,5,j+1);
+
+               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,j+1)
+                    - tmp1 * njac(4,1,j+1);
+               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,j+1)
+                    - tmp1 * njac(4,2,j+1);
+               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,j+1)
+                    - tmp1 * njac(4,3,j+1);
+               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,j+1)
+                    - tmp1 * njac(4,4,j+1)
+                    - tmp1 * dy4;
+               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,j+1)
+                    - tmp1 * njac(4,5,j+1);
+
+               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,j+1)
+                    - tmp1 * njac(5,1,j+1);
+               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,j+1)
+                    - tmp1 * njac(5,2,j+1);
+               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,j+1)
+                    - tmp1 * njac(5,3,j+1);
+               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,j+1)
+                    - tmp1 * njac(5,4,j+1);
+               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,j+1)
+                    - tmp1 * njac(5,5,j+1)
+                    - tmp1 * dy5;
+
+            }
+
+
+//---------------------------------------------------------------------
+//     outer most do loops - sweeping in i direction
+//---------------------------------------------------------------------
+            if (first == 1) {
+
+//---------------------------------------------------------------------
+//     multiply c(i,jstart,k) by b_inverse and copy back to c
+//     multiply rhs(jstart) by b_inverse(jstart) and copy to rhs
+//---------------------------------------------------------------------
+               binvcrhs( &lhsb(1,1,jstart),
+                              &lhsc(1,1,i,jstart,k,c),
+                              &rhs(1,i,jstart,k,c) );
+
+            }
+
+//---------------------------------------------------------------------
+//     begin inner most do loop
+//     do all the elements of the cell unless last 
+//---------------------------------------------------------------------
+            for (j = jstart+first; j <= jsize-last; j++) {
+
+//---------------------------------------------------------------------
+//     subtract A*lhs_vector(j-1) from lhs_vector(j)
+//     
+//     rhs(j) = rhs(j) - A*rhs(j-1)
+//---------------------------------------------------------------------
+               matvec_sub(&lhsa(1,1,j),
+                               &rhs(1,i,j-1,k,c),&rhs(1,i,j,k,c));
+
+//---------------------------------------------------------------------
+//     B(j) = B(j) - C(j-1)*A(j)
+//---------------------------------------------------------------------
+               matmul_sub(&lhsa(1,1,j),
+                               &lhsc(1,1,i,j-1,k,c),
+                               &lhsb(1,1,j));
+
+//---------------------------------------------------------------------
+//     multiply c(i,j,k) by b_inverse and copy back to c
+//     multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs
+//---------------------------------------------------------------------
+               binvcrhs( &lhsb(1,1,j),
+                              &lhsc(1,1,i,j,k,c),
+                              &rhs(1,i,j,k,c) );
+
+            }
+
+//---------------------------------------------------------------------
+//     Now finish up special cases for last cell
+//---------------------------------------------------------------------
+            if (last == 1) {
+
+//---------------------------------------------------------------------
+//     rhs(jsize) = rhs(jsize) - A*rhs(jsize-1)
+//---------------------------------------------------------------------
+               matvec_sub(&lhsa(1,1,jsize),
+                               &rhs(1,i,jsize-1,k,c),&rhs(1,i,jsize,k,c));
+
+//---------------------------------------------------------------------
+//     B(jsize) = B(jsize) - C(jsize-1)*A(jsize)
+//     call matmul_sub(aa,i,jsize,k,c,
+//     $              cc,i,jsize-1,k,c,bb,i,jsize,k,c)
+//---------------------------------------------------------------------
+               matmul_sub(&lhsa(1,1,jsize),
+                               &lhsc(1,1,i,jsize-1,k,c),
+                               &lhsb(1,1,jsize));
+
+//---------------------------------------------------------------------
+//     multiply rhs(jsize) by b_inverse(jsize) and copy to rhs
+//---------------------------------------------------------------------
+               binvrhs( &lhsb(1,1,jsize),
+                             &rhs(1,i,jsize,k,c) );
+
+            }
+         }
+      }
+
+
+      return;
+}
+      
+
+

+ 689 - 0
RCCE_V2.0/apps/NPB/BT/.svn/text-base/z_solve.c.svn-base

@@ -0,0 +1,689 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include "header.h"
+#include "mpinpb.h"
+#include "work_lhs.h"
+
+extern void z_sendrecv_solve(int c, int cprev);
+extern void z_sendrecv_back(int c, int cprev);
+extern void z_backsubstitute(int first, int last, int c);
+extern void z_solve_cell(int first, int last, int c);
+
+//---------------------------------------------------------------------
+//     z_solve
+//
+//     Performs line solves in Z direction by first factoring
+//     the block-tridiagonal matrix into an upper triangular matrix,
+//     and then performing back substitution to solve for the unknown
+//     vectors of each line.
+//
+//     Make sure we treat elements zero to cell_size in the direction
+//     of the sweep.
+//
+//     Takes no arguments and returns nothing.
+//---------------------------------------------------------------------
+void z_solve() {
+
+      int  c, cprev, stage, first, last;
+
+//---------------------------------------------------------------------
+//     forward sweep: in our terminology stage is the number of the cell
+//     in the z-direction, i.e. stage = 1 means the start of the line,
+//     stage = ncells means the end
+//---------------------------------------------------------------------
+      for (stage = 1; stage <= ncells; stage++) {
+         c = slice(3,stage);
+//---------------------------------------------------------------------
+//     set first-cell and last-cell flags
+//---------------------------------------------------------------------
+         first = (stage == 1);
+         last =  (stage == ncells);
+
+//---------------------------------------------------------------------
+//     every stage after the first calls z_sendrecv_solve with the cell
+//     of the previous stage before this cell is factored
+//---------------------------------------------------------------------
+         if (stage > 1) {
+            cprev = slice(3,stage-1);
+            z_sendrecv_solve(c, cprev);
+         }
+         z_solve_cell(first,last,c);
+      }
+
+//---------------------------------------------------------------------
+//     now perform backsubstitution in reverse direction
+//---------------------------------------------------------------------
+      for (stage = ncells; stage >= 1; stage--) {
+         c = slice(3,stage);
+         first = (stage == 1);
+         last =  (stage == ncells);
+
+//---------------------------------------------------------------------
+//     every stage before the last calls z_sendrecv_back with the cell
+//     of the following stage before back substitution on this cell
+//---------------------------------------------------------------------
+         if (stage < ncells) {
+            cprev = slice(3,stage+1);
+            z_sendrecv_back(c, cprev);
+         }
+
+         z_backsubstitute(first,last,c);
+      }
+
+      return;
+}
+      
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+      
+void z_unpack_solve_info(int c) {
+//---------------------------------------------------------------------
+//     Copies out_buffer into the k = kstart-1 plane of cell c.
+//     For each (i,j) pair, taken with i varying fastest, the buffer
+//     holds BLOCK_SIZE rows of BLOCK_SIZE values for lhsc (C'(-1))
+//     followed by BLOCK_SIZE values for rhs (rhs'(-1)).
+//---------------------------------------------------------------------
+
+      int icol, jrow, mrow, ncol;
+      int kstart = 0;
+      int bufpos = 0;
+
+      for (jrow = 0; jrow < JMAX; jrow++) {
+         for (icol = 0; icol < IMAX; icol++) {
+
+//---------------------------------------------------------------------
+//     the block of C', one row of BLOCK_SIZE values at a time
+//---------------------------------------------------------------------
+            for (mrow = 1; mrow <= BLOCK_SIZE; mrow++) {
+               for (ncol = 1; ncol <= BLOCK_SIZE; ncol++) {
+                  lhsc(mrow,ncol,icol,jrow,kstart-1,c) =
+                       out_buffer(bufpos+ncol);
+               }
+               bufpos += BLOCK_SIZE;
+            }
+
+//---------------------------------------------------------------------
+//     the rhs' vector that follows the block
+//---------------------------------------------------------------------
+            for (ncol = 1; ncol <= BLOCK_SIZE; ncol++) {
+               rhs(ncol,icol,jrow,kstart-1,c) = out_buffer(bufpos+ncol);
+            }
+            bufpos += BLOCK_SIZE;
+         }
+      }
+
+      return;
+}
+
+      
+//---------------------------------------------------------------------
+//     z_sendrecv_solve(c, cprev)
+//
+//     Packs C'(ksize) and rhs'(ksize) of cell cprev, for all i and j,
+//     into in_buffer, sends that buffer to successor(3), receives
+//     out_buffer from predecessor(3), and unpacks the received data
+//     into the k = -1 plane of cell c.
+//---------------------------------------------------------------------
+void z_sendrecv_solve(int c, int cprev) {
+
+      int i,j,m,n,ksize,ptr;
+      int phase;
+      int buffer_size;
+
+      ksize = cell_size(3,cprev)-1;
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+           (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE);
+
+//---------------------------------------------------------------------
+//     pack up buffer: per (i,j), a BLOCK_SIZE x BLOCK_SIZE block of
+//     lhsc followed by BLOCK_SIZE values of rhs
+//---------------------------------------------------------------------
+      ptr = 0;
+      for (j = 0; j <= JMAX-1; j++) {
+         for (i = 0; i <= IMAX-1; i++) {
+            for (m = 1; m <= BLOCK_SIZE; m++) {
+               for (n = 1; n <= BLOCK_SIZE; n++) {
+                  in_buffer(ptr+n) = lhsc(m,n,i,j,ksize,cprev);
+               }
+               ptr = ptr+BLOCK_SIZE;
+            }
+            for (n = 1; n <= BLOCK_SIZE; n++) {
+               in_buffer(ptr+n) = rhs(n,i,j,ksize,cprev);
+            }
+            ptr = ptr+BLOCK_SIZE;
+         }
+      }
+
+//---------------------------------------------------------------------
+//     send and receive buffer: three phases; this rank sends in the
+//     phase equal to send_color[TOPDIR] and receives in the phase equal
+//     to recv_color[TOPDIR].
+//     NOTE(review): presumably the coloring orders the RCCE calls so
+//     that neighbours cannot block each other -- confirm.
+//---------------------------------------------------------------------
+
+      for (phase = 0; phase < 3; phase++) {
+
+        if (send_color[TOPDIR]==phase) {
+          RCCE_send((char*)in_buffer, buffer_size*sizeof(double), successor(3));
+        }
+        if (recv_color[TOPDIR]==phase) {
+          RCCE_recv((char*)out_buffer, buffer_size*sizeof(double), predecessor(3));
+        }
+      }
+
+//---------------------------------------------------------------------
+//     unpack buffer: z_unpack_solve_info performs exactly the copy of
+//     out_buffer into lhsc/rhs at k = -1 of cell c that used to be
+//     duplicated here
+//---------------------------------------------------------------------
+      z_unpack_solve_info(c);
+
+      return;
+}
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+//---------------------------------------------------------------------
+//     z_sendrecv_back(c, cprev)
+//
+//     Packs rhs at k = kstart of cell cprev, for all i and j, into
+//     in_buffer, sends that buffer to predecessor(3), receives
+//     out_buffer from successor(3), and unpacks the received values
+//     into backsub_info of cell c.
+//---------------------------------------------------------------------
+void z_sendrecv_back(int c, int cprev) {
+
+      int i,j,n,ptr,kstart;
+      int phase;
+      int buffer_size;
+
+//---------------------------------------------------------------------
+//     pack up U(kstart) for all i and j:
+//     element 0 goes to the previous processor
+//---------------------------------------------------------------------
+      kstart = 0;
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE;
+      ptr = 0;
+      for (j = 0; j <= JMAX-1; j++) {
+         for (i = 0; i <= IMAX-1; i++) {
+            for (n = 1; n <= BLOCK_SIZE; n++) {
+               in_buffer(ptr+n) = rhs(n,i,j,kstart,cprev);
+            }
+            ptr = ptr+BLOCK_SIZE;
+         }
+      }
+
+//---------------------------------------------------------------------
+//     send and receive buffer: three phases; this rank sends in the
+//     phase equal to send_color[BOTTOMDIR] and receives in the phase
+//     equal to recv_color[BOTTOMDIR]
+//---------------------------------------------------------------------
+
+      for (phase = 0; phase < 3; phase++) {
+
+        if (send_color[BOTTOMDIR]==phase) {
+          RCCE_send((char*)in_buffer, buffer_size*sizeof(double), predecessor(3));
+        }
+        if (recv_color[BOTTOMDIR]==phase) {
+          RCCE_recv((char*)out_buffer, buffer_size*sizeof(double), successor(3));
+        }
+      }
+
+//---------------------------------------------------------------------
+//     unpack U(ksize) for all i and j into backsub_info
+//---------------------------------------------------------------------
+
+      ptr = 0;
+      for (j = 0; j <= JMAX-1; j++) {
+         for (i = 0; i <= IMAX-1; i++) {
+            for (n = 1; n <= BLOCK_SIZE; n++) {
+               backsub_info(n,i,j,c) = out_buffer(ptr+n);
+            }
+            ptr = ptr+BLOCK_SIZE;
+         }
+      }
+
+      return;
+}
+
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+void z_backsubstitute(int first, int last, int c) {
+
+//---------------------------------------------------------------------
+//     Back substitution over cell c in the z direction.
+//
+//     When last is 0, the plane k = ksize is first updated with the
+//     values held in backsub_info for this cell; otherwise that plane
+//     is used as it stands.  Every plane from ksize-1 down to kstart is
+//     then updated from the plane above it.
+//     The parameter first is not referenced in this routine.
+//---------------------------------------------------------------------
+
+      int icol, jrow, kpln, mrow, ncol;
+      int kstart = 0;
+      int isize = cell_size(1,c)-end(1,c)-1;
+      int jsize = cell_size(2,c)-end(2,c)-1;
+      int ksize = cell_size(3,c)-1;
+
+//---------------------------------------------------------------------
+//     U(ksize) uses info from previous cell if not last cell
+//---------------------------------------------------------------------
+      if (!last) {
+         for (jrow = start(2,c); jrow <= jsize; jrow++) {
+            for (icol = start(1,c); icol <= isize; icol++) {
+               for (mrow = 1; mrow <= BLOCK_SIZE; mrow++) {
+                  for (ncol = 1; ncol <= BLOCK_SIZE; ncol++) {
+                     rhs(mrow,icol,jrow,ksize,c) -=
+                          lhsc(mrow,ncol,icol,jrow,ksize,c)*
+                          backsub_info(ncol,icol,jrow,c);
+                  }
+               }
+            }
+         }
+      }
+
+//---------------------------------------------------------------------
+//     remaining planes, top down: rhs(k) = rhs(k) - C(k)*rhs(k+1)
+//---------------------------------------------------------------------
+      for (kpln = ksize-1; kpln >= kstart; kpln--) {
+         for (jrow = start(2,c); jrow <= jsize; jrow++) {
+            for (icol = start(1,c); icol <= isize; icol++) {
+               for (mrow = 1; mrow <= BLOCK_SIZE; mrow++) {
+                  for (ncol = 1; ncol <= BLOCK_SIZE; ncol++) {
+                     rhs(mrow,icol,jrow,kpln,c) -=
+                          lhsc(mrow,ncol,icol,jrow,kpln,c)*
+                          rhs(ncol,icol,jrow,kpln+1,c);
+                  }
+               }
+            }
+         }
+      }
+
+      return;
+}
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+void z_solve_cell(int first,int last,int c) {
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+//---------------------------------------------------------------------
+//     performs gaussian elimination on cell c in the z direction.
+//     first/last are tested against 1 and offset the k loop bounds.
+//     assumes that unpacking routines for non-first cells 
+//     preload C' and rhs' from previous cell.
+//     
+//     assumed send happens outside this routine, but that
+//     c'(KMAX) and rhs'(KMAX) will be sent to next cell.
+//---------------------------------------------------------------------
+
+      int i,j,k,isize,ksize,jsize,kstart;
+      double utmp[6*(KMAX+4)];   // 6 values per k index; accessed through the utmp(m,i) macro below
+#define utmp(m,i) utmp[(m-1)+6*(i+2)]
+
+      kstart = 0;
+      isize = cell_size(1,c)-end(1,c)-1;
+      jsize = cell_size(2,c)-end(2,c)-1;
+      ksize = cell_size(3,c)-1;
+
+      lhsabinit(lhsa, lhsb, ksize);
+
+      for (j = start(2,c); j <= jsize; j++) {
+         for (i = start(1,c); i <= isize; i++) {
+
+//---------------------------------------------------------------------
+//     This function computes the left hand side for the three z-factors 
+//---------------------------------------------------------------------
+
+//---------------------------------------------------------------------
+//     Compute the indices for storing the block-diagonal matrix;
+//     determine c (labeled f) and s jacobians for cell c
+//---------------------------------------------------------------------
+            for (k = start(3,c)-1; k <= cell_size(3,c)-end(3,c); k++) {   // cache 1/u(1), u(2..5) and qs along this (i,j) line
+               utmp(1,k) = 1.0e0 / u(1,i,j,k,c);
+               utmp(2,k) = u(2,i,j,k,c);
+               utmp(3,k) = u(3,i,j,k,c);
+               utmp(4,k) = u(4,i,j,k,c);
+               utmp(5,k) = u(5,i,j,k,c);
+               utmp(6,k) = qs(i,j,k,c);
+            }
+
+            for (k = start(3,c)-1; k <= cell_size(3,c)-end(3,c); k++) {   // fill fjac and njac for every k of the line
+
+               tmp1 = utmp(1,k);
+               tmp2 = tmp1 * tmp1;
+               tmp3 = tmp1 * tmp2;
+
+               fjac(1,1,k) = 0.0e+00;
+               fjac(1,2,k) = 0.0e+00;
+               fjac(1,3,k) = 0.0e+00;
+               fjac(1,4,k) = 1.0e+00;
+               fjac(1,5,k) = 0.0e+00;
+
+               fjac(2,1,k) = - ( utmp(2,k)*utmp(4,k) ) 
+                    * tmp2 ;
+               fjac(2,2,k) = utmp(4,k) * tmp1;
+               fjac(2,3,k) = 0.0e+00;
+               fjac(2,4,k) = utmp(2,k) * tmp1;
+               fjac(2,5,k) = 0.0e+00;
+
+               fjac(3,1,k) = - ( utmp(3,k)*utmp(4,k) )
+                    * tmp2 ;
+               fjac(3,2,k) = 0.0e+00;
+               fjac(3,3,k) = utmp(4,k) * tmp1;
+               fjac(3,4,k) = utmp(3,k) * tmp1;
+               fjac(3,5,k) = 0.0e+00;
+
+               fjac(4,1,k) = - (utmp(4,k)*utmp(4,k) * tmp2 ) 
+                    + c2 * utmp(6,k);
+               fjac(4,2,k) = - c2 *  utmp(2,k) * tmp1 ;
+               fjac(4,3,k) = - c2 *  utmp(3,k) * tmp1;
+               fjac(4,4,k) = ( 2.0e+00 - c2 )
+                    *  utmp(4,k) * tmp1 ;
+               fjac(4,5,k) = c2;
+
+               fjac(5,1,k) = ( c2 * 2.0e0 * utmp(6,k)
+                    - c1 * ( utmp(5,k) * tmp1 ) )
+                    * ( utmp(4,k) * tmp1 );
+               fjac(5,2,k) = - c2 * ( utmp(2,k)*utmp(4,k) )
+                    * tmp2 ;
+               fjac(5,3,k) = - c2 * ( utmp(3,k)*utmp(4,k) )
+                    * tmp2;
+               fjac(5,4,k) = c1 * ( utmp(5,k) * tmp1 )
+                    - c2 * ( utmp(6,k)
+                    + utmp(4,k)*utmp(4,k) * tmp2 );
+               fjac(5,5,k) = c1 * utmp(4,k) * tmp1;
+
+               njac(1,1,k) = 0.0e+00;
+               njac(1,2,k) = 0.0e+00;
+               njac(1,3,k) = 0.0e+00;
+               njac(1,4,k) = 0.0e+00;
+               njac(1,5,k) = 0.0e+00;
+
+               njac(2,1,k) = - c3c4 * tmp2 * utmp(2,k);
+               njac(2,2,k) =   c3c4 * tmp1;
+               njac(2,3,k) =   0.0e+00;
+               njac(2,4,k) =   0.0e+00;
+               njac(2,5,k) =   0.0e+00;
+
+               njac(3,1,k) = - c3c4 * tmp2 * utmp(3,k);
+               njac(3,2,k) =   0.0e+00;
+               njac(3,3,k) =   c3c4 * tmp1;
+               njac(3,4,k) =   0.0e+00;
+               njac(3,5,k) =   0.0e+00;
+
+               njac(4,1,k) = - con43 * c3c4 * tmp2 * utmp(4,k);
+               njac(4,2,k) =   0.0e+00;
+               njac(4,3,k) =   0.0e+00;
+               njac(4,4,k) =   con43 * c3 * c4 * tmp1;
+               njac(4,5,k) =   0.0e+00;
+
+               njac(5,1,k) = - (  c3c4
+                    - c1345 ) * tmp3 * SQR(utmp(2,k))
+                    - ( c3c4 - c1345 ) * tmp3 * SQR(utmp(3,k))
+                    - ( con43 * c3c4
+                    - c1345 ) * tmp3 * SQR(utmp(4,k))
+                    - c1345 * tmp2 * utmp(5,k);
+
+               njac(5,2,k) = (  c3c4 - c1345 ) * tmp2 * utmp(2,k);
+               njac(5,3,k) = (  c3c4 - c1345 ) * tmp2 * utmp(3,k);
+               njac(5,4,k) = ( con43 * c3c4
+                    - c1345 ) * tmp2 * utmp(4,k);
+               njac(5,5,k) = ( c1345 )* tmp1;
+
+
+            }
+
+//---------------------------------------------------------------------
+//     now jacobians set, so form left hand side in z direction
+//---------------------------------------------------------------------
+            for (k = start(3,c); k <= ksize-end(3,c); k++) {   // lhsa uses k-1, lhsb uses k, lhsc uses k+1 jacobians
+
+               tmp1 = dt * tz1;
+               tmp2 = dt * tz2;
+
+               lhsa(1,1,k) = - tmp2 * fjac(1,1,k-1)
+                    - tmp1 * njac(1,1,k-1)
+                    - tmp1 * dz1 ;
+               lhsa(1,2,k) = - tmp2 * fjac(1,2,k-1)
+                    - tmp1 * njac(1,2,k-1);
+               lhsa(1,3,k) = - tmp2 * fjac(1,3,k-1)
+                    - tmp1 * njac(1,3,k-1);
+               lhsa(1,4,k) = - tmp2 * fjac(1,4,k-1)
+                    - tmp1 * njac(1,4,k-1);
+               lhsa(1,5,k) = - tmp2 * fjac(1,5,k-1)
+                    - tmp1 * njac(1,5,k-1);
+
+               lhsa(2,1,k) = - tmp2 * fjac(2,1,k-1)
+                    - tmp1 * njac(2,1,k-1);
+               lhsa(2,2,k) = - tmp2 * fjac(2,2,k-1)
+                    - tmp1 * njac(2,2,k-1)
+                    - tmp1 * dz2;
+               lhsa(2,3,k) = - tmp2 * fjac(2,3,k-1)
+                    - tmp1 * njac(2,3,k-1);
+               lhsa(2,4,k) = - tmp2 * fjac(2,4,k-1)
+                    - tmp1 * njac(2,4,k-1);
+               lhsa(2,5,k) = - tmp2 * fjac(2,5,k-1)
+                    - tmp1 * njac(2,5,k-1);
+
+               lhsa(3,1,k) = - tmp2 * fjac(3,1,k-1)
+                    - tmp1 * njac(3,1,k-1);
+               lhsa(3,2,k) = - tmp2 * fjac(3,2,k-1)
+                    - tmp1 * njac(3,2,k-1);
+               lhsa(3,3,k) = - tmp2 * fjac(3,3,k-1)
+                    - tmp1 * njac(3,3,k-1)
+                    - tmp1 * dz3 ;
+               lhsa(3,4,k) = - tmp2 * fjac(3,4,k-1)
+                    - tmp1 * njac(3,4,k-1);
+               lhsa(3,5,k) = - tmp2 * fjac(3,5,k-1)
+                    - tmp1 * njac(3,5,k-1);
+
+               lhsa(4,1,k) = - tmp2 * fjac(4,1,k-1)
+                    - tmp1 * njac(4,1,k-1);
+               lhsa(4,2,k) = - tmp2 * fjac(4,2,k-1)
+                    - tmp1 * njac(4,2,k-1);
+               lhsa(4,3,k) = - tmp2 * fjac(4,3,k-1)
+                    - tmp1 * njac(4,3,k-1);
+               lhsa(4,4,k) = - tmp2 * fjac(4,4,k-1)
+                    - tmp1 * njac(4,4,k-1)
+                    - tmp1 * dz4;
+               lhsa(4,5,k) = - tmp2 * fjac(4,5,k-1)
+                    - tmp1 * njac(4,5,k-1);
+
+               lhsa(5,1,k) = - tmp2 * fjac(5,1,k-1)
+                    - tmp1 * njac(5,1,k-1);
+               lhsa(5,2,k) = - tmp2 * fjac(5,2,k-1)
+                    - tmp1 * njac(5,2,k-1);
+               lhsa(5,3,k) = - tmp2 * fjac(5,3,k-1)
+                    - tmp1 * njac(5,3,k-1);
+               lhsa(5,4,k) = - tmp2 * fjac(5,4,k-1)
+                    - tmp1 * njac(5,4,k-1);
+               lhsa(5,5,k) = - tmp2 * fjac(5,5,k-1)
+                    - tmp1 * njac(5,5,k-1)
+                    - tmp1 * dz5;
+
+               lhsb(1,1,k) = 1.0e+00
+                    + tmp1 * 2.0e+00 * njac(1,1,k)
+                    + tmp1 * 2.0e+00 * dz1;
+               lhsb(1,2,k) = tmp1 * 2.0e+00 * njac(1,2,k);
+               lhsb(1,3,k) = tmp1 * 2.0e+00 * njac(1,3,k);
+               lhsb(1,4,k) = tmp1 * 2.0e+00 * njac(1,4,k);
+               lhsb(1,5,k) = tmp1 * 2.0e+00 * njac(1,5,k);
+
+               lhsb(2,1,k) = tmp1 * 2.0e+00 * njac(2,1,k);
+               lhsb(2,2,k) = 1.0e+00
+                    + tmp1 * 2.0e+00 * njac(2,2,k)
+                    + tmp1 * 2.0e+00 * dz2;
+               lhsb(2,3,k) = tmp1 * 2.0e+00 * njac(2,3,k);
+               lhsb(2,4,k) = tmp1 * 2.0e+00 * njac(2,4,k);
+               lhsb(2,5,k) = tmp1 * 2.0e+00 * njac(2,5,k);
+
+               lhsb(3,1,k) = tmp1 * 2.0e+00 * njac(3,1,k);
+               lhsb(3,2,k) = tmp1 * 2.0e+00 * njac(3,2,k);
+               lhsb(3,3,k) = 1.0e+00
+                    + tmp1 * 2.0e+00 * njac(3,3,k)
+                    + tmp1 * 2.0e+00 * dz3;
+               lhsb(3,4,k) = tmp1 * 2.0e+00 * njac(3,4,k);
+               lhsb(3,5,k) = tmp1 * 2.0e+00 * njac(3,5,k);
+
+               lhsb(4,1,k) = tmp1 * 2.0e+00 * njac(4,1,k);
+               lhsb(4,2,k) = tmp1 * 2.0e+00 * njac(4,2,k);
+               lhsb(4,3,k) = tmp1 * 2.0e+00 * njac(4,3,k);
+               lhsb(4,4,k) = 1.0e+00
+                    + tmp1 * 2.0e+00 * njac(4,4,k)
+                    + tmp1 * 2.0e+00 * dz4;
+               lhsb(4,5,k) = tmp1 * 2.0e+00 * njac(4,5,k);
+
+               lhsb(5,1,k) = tmp1 * 2.0e+00 * njac(5,1,k);
+               lhsb(5,2,k) = tmp1 * 2.0e+00 * njac(5,2,k);
+               lhsb(5,3,k) = tmp1 * 2.0e+00 * njac(5,3,k);
+               lhsb(5,4,k) = tmp1 * 2.0e+00 * njac(5,4,k);
+               lhsb(5,5,k) = 1.0e+00
+                    + tmp1 * 2.0e+00 * njac(5,5,k) 
+                    + tmp1 * 2.0e+00 * dz5;
+
+               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,k+1)
+                    - tmp1 * njac(1,1,k+1)
+                    - tmp1 * dz1;
+               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,k+1)
+                    - tmp1 * njac(1,2,k+1);
+               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,k+1)
+                    - tmp1 * njac(1,3,k+1);
+               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,k+1)
+                    - tmp1 * njac(1,4,k+1);
+               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,k+1)
+                    - tmp1 * njac(1,5,k+1);
+
+               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,k+1)
+                    - tmp1 * njac(2,1,k+1);
+               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,k+1)
+                    - tmp1 * njac(2,2,k+1)
+                    - tmp1 * dz2;
+               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,k+1)
+                    - tmp1 * njac(2,3,k+1);
+               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,k+1)
+                    - tmp1 * njac(2,4,k+1);
+               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,k+1)
+                    - tmp1 * njac(2,5,k+1);
+
+               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,k+1)
+                    - tmp1 * njac(3,1,k+1);
+               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,k+1)
+                    - tmp1 * njac(3,2,k+1);
+               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,k+1)
+                    - tmp1 * njac(3,3,k+1)
+                    - tmp1 * dz3;
+               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,k+1)
+                    - tmp1 * njac(3,4,k+1);
+               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,k+1)
+                    - tmp1 * njac(3,5,k+1);
+
+               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,k+1)
+                    - tmp1 * njac(4,1,k+1);
+               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,k+1)
+                    - tmp1 * njac(4,2,k+1);
+               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,k+1)
+                    - tmp1 * njac(4,3,k+1);
+               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,k+1)
+                    - tmp1 * njac(4,4,k+1)
+                    - tmp1 * dz4;
+               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,k+1)
+                    - tmp1 * njac(4,5,k+1);
+
+               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,k+1)
+                    - tmp1 * njac(5,1,k+1);
+               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,k+1)
+                    - tmp1 * njac(5,2,k+1);
+               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,k+1)
+                    - tmp1 * njac(5,3,k+1);
+               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,k+1)
+                    - tmp1 * njac(5,4,k+1);
+               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,k+1)
+                    - tmp1 * njac(5,5,k+1)
+                    - tmp1 * dz5;
+
+            }
+
+
+//---------------------------------------------------------------------
+//     elimination sweep in the k direction for this (i,j) line
+//---------------------------------------------------------------------
+            if (first == 1) {
+
+//---------------------------------------------------------------------
+//     multiply c(i,j,kstart) by b_inverse and copy back to c
+//     multiply rhs(kstart) by b_inverse(kstart) and copy to rhs
+//---------------------------------------------------------------------
+               binvcrhs( &lhsb(1,1,kstart),
+                              &lhsc(1,1,i,j,kstart,c),
+                              &rhs(1,i,j,kstart,c) );
+
+            }
+
+//---------------------------------------------------------------------
+//     begin inner most do loop
+//     do all the elements of the cell unless last 
+//---------------------------------------------------------------------
+            for (k = kstart+first; k <= ksize-last; k++) {
+
+//---------------------------------------------------------------------
+//     subtract A*lhs_vector(k-1) from lhs_vector(k)
+//     
+//     rhs(k) = rhs(k) - A*rhs(k-1)
+//---------------------------------------------------------------------
+               matvec_sub(&lhsa(1,1,k),
+                               &rhs(1,i,j,k-1,c),&rhs(1,i,j,k,c));
+
+//---------------------------------------------------------------------
+//     B(k) = B(k) - C(k-1)*A(k)
+//     call matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k,c)
+//---------------------------------------------------------------------
+               matmul_sub(&lhsa(1,1,k),
+                               &lhsc(1,1,i,j,k-1,c),
+                               &lhsb(1,1,k));
+
+//---------------------------------------------------------------------
+//     multiply c(i,j,k) by b_inverse and copy back to c
+//     multiply rhs(i,j,k) by b_inverse(i,j,k) and copy to rhs
+//---------------------------------------------------------------------
+               binvcrhs( &lhsb(1,1,k),
+                              &lhsc(1,1,i,j,k,c),
+                              &rhs(1,i,j,k,c) );
+
+            }
+
+//---------------------------------------------------------------------
+//     Now finish up special cases for last cell
+//---------------------------------------------------------------------
+            if (last == 1) {
+
+//---------------------------------------------------------------------
+//     rhs(ksize) = rhs(ksize) - A*rhs(ksize-1)
+//---------------------------------------------------------------------
+               matvec_sub(&lhsa(1,1,ksize),
+                               &rhs(1,i,j,ksize-1,c),&rhs(1,i,j,ksize,c));
+
+//---------------------------------------------------------------------
+//     B(ksize) = B(ksize) - C(ksize-1)*A(ksize)
+//     call matmul_sub(aa,i,j,ksize,c,
+//     $              cc,i,j,ksize-1,c,bb,i,j,ksize,c)
+//---------------------------------------------------------------------
+               matmul_sub(&lhsa(1,1,ksize),
+                               &lhsc(1,1,i,j,ksize-1,c),
+                               &lhsb(1,1,ksize));
+
+//---------------------------------------------------------------------
+//     multiply rhs(ksize) by b_inverse(ksize) and copy to rhs
+//---------------------------------------------------------------------
+               binvrhs( &lhsb(1,1,ksize),
+                             &rhs(1,i,j,ksize,c) );
+
+            }
+         }
+      }
+
+
+      return;
+}
+      
+
+
+
+
+

+ 65 - 0
RCCE_V2.0/apps/NPB/BT/Makefile

@@ -0,0 +1,65 @@
+SHELL=/bin/sh
+BENCHMARK=bt
+BENCHMARKU=BT
+# Executable name is bt.<CLASS>.<NPROCS>. CLASS, NPROCS, CCOMPILE, CFLAGS and ARCHIVE are not set in this file.
+PROGRAM  = $(BENCHMARK).$(CLASS).$(NPROCS)
+
+default:: ${PROGRAM}
+
+# This makes sure the configuration utility setparams
+# is up to date.
+# Note that this must be run every time, which is why the
+# target does not exist and is not created.
+# If you create a file called "config" you will break things.
+config:
+	cd ../sys; ${MAKE} all
+	../sys/setparams ${BENCHMARK} ${NPROCS} ${CLASS}
+
+# Normally setparams updates npbparams.h only if the settings (CLASS/NPROCS)
+# have changed. However, we also want to update if the compile options
+# may have changed (set in ../config/make.def).
+npbparams.h: ../config/make.def
+	@ echo make.def modified. Rebuilding npbparams.h just in case
+	rm -f npbparams.h
+	../sys/setparams ${BENCHMARK} ${NPROCS} ${CLASS}
+
+# Aliases so that "make bt" and "make BT" both build the default target
+${BENCHMARK}:  default
+${BENCHMARKU}: default
+# Header dependencies of each object file; the objects are compiled by the .c.o rule below.
+bt.o:             bt.c  header.h npbparams.h  mpinpb.h
+make_set.o:       make_set.c  header.h npbparams.h  mpinpb.h
+initialize.o:     initialize.c  header.h npbparams.h
+exact_solution.o: exact_solution.c  header.h npbparams.h
+exact_rhs.o:      exact_rhs.c  header.h npbparams.h
+set_constants.o:  set_constants.c  header.h npbparams.h
+adi.o:            adi.c  header.h npbparams.h
+define.o:         define.c  header.h npbparams.h
+copy_faces.o:     copy_faces.c  header.h npbparams.h  mpinpb.h
+rhs.o:            rhs.c  header.h npbparams.h
+x_solve.o:        x_solve.c  header.h work_lhs.h npbparams.h  mpinpb.h
+y_solve.o:        y_solve.c  header.h work_lhs.h npbparams.h  mpinpb.h
+z_solve.o:        z_solve.c  header.h work_lhs.h npbparams.h  mpinpb.h
+solve_subs.o:     solve_subs.c  npbparams.h
+add.o:            add.c  header.h npbparams.h
+error.o:          error.c  header.h npbparams.h  mpinpb.h
+verify.o:         verify.c  header.h npbparams.h  mpinpb.h
+setup_mpi.o:      setup_mpi.c mpinpb.h npbparams.h 
+
+# Everything linked into the program; print_results.o and timers.o have no dependency line above.
+OBJS = bt.o make_set.o initialize.o exact_solution.o \
+       exact_rhs.o set_constants.o adi.o define.o copy_faces.o  \
+       rhs.o x_solve.o y_solve.o z_solve.o add.o solve_subs.o   \
+       error.o verify.o setup_mpi.o print_results.o timers.o $(ARCHIVE) 
+
+$(PROGRAM): ${OBJS} 
+	${CCOMPILE} ${CFLAGS} -o ${PROGRAM} ${OBJS} 
+# use line below for gcc, which does not link libm by default
+#	${CCOMPILE} ${CFLAGS} -o ${PROGRAM} ${OBJS} -lm
+# Suffix rule: build each .o from the .c file of the same name.
+.c.o:
+	${CCOMPILE} -c $(CFLAGS)  $<
+# Delete objects, backup files, mputil* files, npbparams.h and any file named core.
+clean:
+	- rm -f *.o *~ mputil*
+	- rm -f  npbparams.h core

+ 44 - 0
RCCE_V2.0/apps/NPB/BT/add.c

@@ -0,0 +1,44 @@
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+#include "header.h"
+
+void  add() {
+
+//---------------------------------------------------------------------
+//     add(): addition of the update rhs to the vector u, in place.
+//     Takes no arguments and returns nothing.
+//     For each cell c = 1..ncells, every point whose index runs from
+//     start(d,c) to cell_size(d,c)-end(d,c)-1 in each direction d gets
+//     u(m,i,j,k,c) = u(m,i,j,k,c) + rhs(m,i,j,k,c) for m = 1..5.
+//---------------------------------------------------------------------
+      int  c, i, j, k, m;
+      // NOTE(review): start()/end() presumably trim points at the grid boundary - confirm in header.h
+      for (c = 1; c <= ncells; c++) {
+         for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     u(m,i,j,k,c) = u(m,i,j,k,c) + rhs(m,i,j,k,c);
+                  }
+               }
+            }
+         }
+      }
+
+      return;
+}

+ 34 - 0
RCCE_V2.0/apps/NPB/BT/adi.c

@@ -0,0 +1,34 @@
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+#include "header.h"
+#include "RCCE.h"
+
+void  adi() {
+
+//---------------------------------------------------------------------
+//     adi(): one solver step - calls the five routines below in order.
+//     Takes no arguments and returns nothing; bt.c calls it per time step.
+      copy_faces();   // exchanges face values, then runs compute_rhs()
+      x_solve();
+      y_solve();
+      z_solve();
+      add();          // adds rhs to u
+
+      return;
+}
+

+ 8 - 0
RCCE_V2.0/apps/NPB/BT/applu_macros.h

@@ -0,0 +1,8 @@
+/* PAD32byte(n): byte count n rounded up to the next multiple of 32 (cache-line padding); n is evaluated more than once */
+#define  PAD32byte(n) ((n)%32==0 ? (n) : (n) + 32 - (n)%32)
+/* PAD32dbl(n): count of doubles n rounded up to a multiple of 32/sizeof(double); sizeof makes the result an unsigned type */
+#define  PAD32dbl(n)  ((n)%(32/sizeof(double))==0 ? (n) : (n) + (32/sizeof(double)) \
+                      - (n)%(32/sizeof(double)))
+/* max/min: larger/smaller of x and y; one argument is evaluated twice, so avoid side effects such as max(i++, j) */
+#define max(x,y)      ((x)>(y)? (x) : (y))
+#define min(x,y)      ((x)<(y)? (x) : (y))

+ 38 - 0
RCCE_V2.0/apps/NPB/BT/applu_protos.h

@@ -0,0 +1,38 @@
+void blts(int);                 /* function prototypes; this header is included by applu_share.h */
+void buts(int, double *);
+void erhs();                    /* empty () leaves the parameter list unspecified before C23 */
+void error();
+void exact(int, int, int, double *);
+void exchange_1(double *, int, int);
+void exchange_3(double *, int);
+void exchange_4(double *, double *, int, int, int, int);
+void exchange_5(double *, int, int);
+void exchange_6(double *, int, int);
+void RCCE_allreduce_d(double *, double *, int, int);
+void init_comm(int *, char ***);
+void jacld(int);
+void jacu(int);
+void l2norm(int, int, int, double *, double *);
+void neighbors();
+void pintgr();
+void print_results(char *, char *, int *,  int *, int *, int *,
+                    int *, int *, double *, double *, char *,
+                    int *, char *, char *, char *, char *, char *,
+                    char *, char *, char *, char *);   /* NOTE(review): bt.c declares print_results with by-value arguments - the two conflict if both are visible */
+void proc_grid();
+void bcast_inputs();
+void read_input();
+void rhs();
+void setbv();
+void setcoeff();
+void setiv();
+void ssor(int);
+void subdomain();
+void timer_clear(int *);        /* NOTE(review): bt.c calls timer_clear/timer_start/timer_stop/timer_read with a plain int */
+void timer_start(int *);
+void timer_stop(int *);
+void verify(double *, double *, double *, char *);   /* NOTE(review): bt.c calls verify(niter, &class, &verified) */
+int  nodedim();
+double timer_read(int *);
+double test_rsd();
+/* NOTE(review): these signatures do not match the calls in bt.c; presumably this header belongs to another benchmark (applu) - confirm */

+ 60 - 0
RCCE_V2.0/apps/NPB/BT/applu_share.h

@@ -0,0 +1,60 @@
+#include "npbparams.h"
+#include "applu_protos.h"
+#include "RCCE.h"
+
+extern double u[5*(isiz1+4)*(isiz2+4)*isiz3],   /* flat arrays; isiz1/isiz2/isiz3 are not defined in this header */
+              rsd[5*(isiz1+4)*(isiz2+4)*isiz3],
+              frct[5*(isiz1+4)*(isiz2+4)*isiz3],
+              flux[5*(isiz1+2)*(isiz2+2)*isiz3];
+extern double a[5*5*isiz1*isiz2],
+              b[5*5*isiz1*isiz2],
+              c[5*5*isiz1*isiz2],
+              d[5*5*isiz1*isiz2];
+/* Every variable in this header is an extern declaration only; the definitions are not in this header. */
+extern double dt, omega, tolrsd[5], rsdnm[5], errnm[5], frc, ttotal;
+extern double tolrsd1_def, tolrsd2_def, tolrsd3_def, tolrsd4_def, tolrsd5_def,
+              omega_default;
+extern double ce[5*13];
+
+extern int ndim, id, num, xdim, ydim, row, col;
+extern int ii1, ii2, ji1, ji2, ki1, ki2;
+extern int itmax, invert; 
+extern int ipr, ipr_default, inorm;
+extern int north,south,east,west;
+extern int nx0, ny0, nz0;
+extern int nx, ny, nz;
+extern int ist, iend, jst, jend, ipt, jpt;
+extern int dp_type;
+extern double tx1, ty1, tz1, 
+              dx1, dy1, dz1, 
+              tx2, ty2, tz2, 
+              dx2, dy2, dz2, 
+              tx3, ty3, tz3, 
+              dx3, dy3, dz3, 
+              dx4, dy4, dz4, 
+              dx5, dy5, dz5, 
+              dssp, c1,  c2,  
+              c3,  c4,  c5;
+extern double dxi, deta, dzeta;
+extern double npmax, maxtime;
+extern double *buf1_exch_1;
+/* With OpenMP the variables named below become threadprivate; tolrsd1_def..tolrsd5_def, omega_default, ipr_default, dp_type and c1..c5 are not named. */
+#ifdef _OPENMP
+#pragma omp threadprivate (nx, ny, nz, nx0, ny0, nz0, \
+                     ipt, ist, iend, jpt, jst, jend, \
+                     ii1, ii2, ji1, ji2, ki1, ki2, \
+                     dxi, deta, dzeta, \
+                     tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3)
+#pragma omp threadprivate (dx1, dx2, dx3, dx4, dx5, \
+                     dy1, dy2, dy3, dy4, dy5, \
+                     dz1, dz2, dz3, dz4, dz5, \
+                     dssp)
+#pragma omp threadprivate(u, rsd, frct, flux)
+#pragma omp threadprivate(ipr, inorm)
+#pragma omp threadprivate(itmax, invert, \
+                    dt, omega, tolrsd, rsdnm, errnm, frc, ttotal, \
+                    a, b, c, d)
+#pragma omp threadprivate(ce)
+#pragma omp threadprivate (id, ndim, num, xdim, ydim, row, col, \
+                     north,south,east,west, buf1_exch_1, npmax, maxtime)
+#endif

+ 216 - 0
RCCE_V2.0/apps/NPB/BT/bt.c

@@ -0,0 +1,216 @@
+//-------------------------------------------------------------------------!
+//                                                                         !
+//        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+//                                                                         !
+//                                   B T                                   !
+//                                                                         !
+//-------------------------------------------------------------------------!
+//                                                                         !
+//    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
+//    It is described in NAS Technical Reports 95-020 and 02-007.          !
+//                                                                         !
+//    Permission to use, copy, distribute and modify this software         !
+//    for any purpose with or without fee is hereby granted.  We           !
+//    request, however, that all derived work reference the NAS            !
+//    Parallel Benchmarks 3.3. This software is provided "as is"           !
+//    without express or implied warranty.                                 !
+//                                                                         !
+//    Information on NPB 3.3, including the technical report, the          !
+//    original specifications, source code, results and information        !
+//    on how to submit new results, is available at:                       !
+//                                                                         !
+//           http://www.nas.nasa.gov/Software/NPB/                         !
+//                                                                         !
+//    Send comments or suggestions to  npb@nas.nasa.gov                    !
+//                                                                         !
+//          NAS Parallel Benchmarks Group                                  !
+//          NASA Ames Research Center                                      !
+//          Mail Stop: T27A-1                                              !
+//          Moffett Field, CA   94035-1000                                 !
+//                                                                         !
+//          E-mail:  npb@nas.nasa.gov                                      !
+//          Fax:     (650) 604-3957                                        !
+//                                                                         !
+//-------------------------------------------------------------------------!
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+//
+// Authors: R. F. Van der Wijngaart
+//          T. Harris
+//          M. Yarrow
+//
+//---------------------------------------------------------------------
+#include <stdio.h>
+#include <string.h>
+#include "RCCE.h"
+#include "applu_macros.h"
+#define G_MAIN
+#include "header.h"
+#include "mpinpb.h"
+
+#define BSIZE 132
+void make_color(void);
+void print_results(char*, char, int, int, int, int, int, int, double,
+                   double, char*, int, char*, char*, char*, char*, 
+                   char*, char*, char*, char*);
+
+//---------------------------------------------------------------------
+//      program MPBT;
+//---------------------------------------------------------------------
+int RCCE_APP(int argc, char **argv) {
+
+//---------------------------------------------------------------------
+//      Entry point of the BT benchmark; every participating core runs it.
+//      argc/argv are passed on to setup_mpi(). Returns 0, or 1 when the
+//      problem does not fit the compiled array sizes.
+//      Flow: set up, one untimed adi() step, re-initialize, niter timed
+//      adi() steps, verify, and the root node prints the results.
+//---------------------------------------------------------------------
+       int niter, step, c;
+       int verified;
+       char class;
+       double navg, mflops, n3;
+       double t, tmax;
+
+       if (setup_mpi(&argc, &argv)) {
+          RCCE_finalize();
+          return 0;
+       }
+
+//       RCCE_debug_set(RCCE_DEBUG_ALL);
+
+//---------------------------------------------------------------------
+//      Every node takes the compiled-in defaults; no input file is read.
+//      Only the root node prints the banner and the run parameters.
+//---------------------------------------------------------------------
+       if (node == root) {
+          printf("\n\n NAS Parallel Benchmarks 3.3 -- BT Benchmark\n");
+       }
+       niter = NITER_DEFAULT;
+       dt    = dt_default;
+       grid_points(1) = PROBLEM_SIZE;
+       grid_points(2) = PROBLEM_SIZE;
+       grid_points(3) = PROBLEM_SIZE;
+
+       if (node == root) {
+          printf(" Size: %4dx%4dx%4d\n",
+                 grid_points(1), grid_points(2), grid_points(3));
+          printf(" Iterations: %4d    dt: %11.7f\n", niter, dt);
+          if (no_nodes != total_nodes)
+              printf(" Total number of processes: %5d\n", total_nodes);
+          if (no_nodes != MAXCELLS*MAXCELLS)
+              printf(" WARNING: compiled for %5d processes\n",
+                     MAXCELLS*MAXCELLS);
+          printf(" Number of active processes: %5d\n\n", no_nodes);
+       }
+
+       make_set();
+       make_color();
+
+//---------------------------------------------------------------------
+//      Stop when a cell is larger than the compiled array sizes instead
+//      of carrying on with a problem that does not fit.
+//---------------------------------------------------------------------
+       for (c = 1; c <= MAXCELLS; c++) {
+          if ( (cell_size(1,c) > IMAX) ||
+               (cell_size(2,c) > JMAX) ||
+               (cell_size(3,c) > KMAX) ) {
+             printf(" %d %d %d %d %d\n", node, c, cell_size(1,c),
+                     cell_size(2,c), cell_size(3,c));
+             printf(" Problem size too big for compiled array sizes\n");
+             RCCE_finalize();
+             return 1;
+          }
+       }
+
+       set_constants();
+       initialize();
+       lhsinit();
+       exact_rhs();
+       compute_buffer_size(5);
+
+//---------------------------------------------------------------------
+//      do one time step to touch all code, and reinitialize
+//---------------------------------------------------------------------
+       adi();
+
+       initialize();
+
+       timer_clear(2);
+
+//---------------------------------------------------------------------
+//      Synchronize before placing time stamp
+//---------------------------------------------------------------------
+       RCCE_barrier(&RCCE_COMM_WORLD);
+
+       timer_clear(1);
+       timer_start(1);
+
+       for (step = 1; step <= niter; step++) {
+
+          if (node == root) {
+             if ((step%20) == 0 || step == niter ||
+                 step == 1) {
+                printf(" Time step %4d\n", step);
+                fflush(stdout);
+             }
+          }
+          adi();
+       }
+
+       timer_stop(1);
+       t = timer_read(1);
+
+       verify(niter, &class, &verified);
+//      tmax = largest t over all nodes; it is only used on the root node
+       RCCE_reduce((char*)(&t), (char*)(&tmax), 1, RCCE_DOUBLE, RCCE_MAX, root, RCCE_COMM_WORLD);
+
+       if( node == root ) {
+          n3 = 1.0e0*grid_points(1)*grid_points(2)*grid_points(3);
+          navg = (grid_points(1)+grid_points(2)+grid_points(3))/3.0;
+          if( tmax != 0. ) {
+             mflops = 1.0e-6*(double)(niter)*
+               (3478.8*(double)n3-17655.7*navg*navg+28023.7*navg)
+               / tmax;
+          } else {
+             mflops = 0.0;
+          }
+
+         print_results("BT", class, grid_points[0],
+           grid_points[1], grid_points[2], niter, MAXCELLS*MAXCELLS,
+           total_nodes, tmax, mflops, "          floating point",
+           verified, NPBVERSION,COMPILETIME, CS1, CS2, CS3, CS4, CS5,
+           CS6);
+
+
+//         FILE *perf_file;
+//         char name[50] = "/shared/DEMOS/RCCE/NPB_BT/perf."; 
+//         char postfix[50]; 
+//         sprintf(postfix, "%d", total_nodes); 
+//         strcat(name, postfix); 
+//         perf_file = fopen(name,"w"); 
+//         fprintf(perf_file, "%d", (int)mflops); 
+//         fclose(perf_file); 
+       }
+
+
+       RCCE_finalize();
+
+       return 0;
+
+}
+

+ 338 - 0
RCCE_V2.0/apps/NPB/BT/copy_faces.c

@@ -0,0 +1,338 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include "header.h"
+#include "mpinpb.h"
+
+void copy_faces() {
+
+//---------------------------------------------------------------------
+//     copy_faces(): takes no arguments and returns nothing.
+
+//---------------------------------------------------------------------
+//     Packs the face values of u, exchanges them and unpacks what arrived:
+// This function copies the face values of a variable defined on a set 
+// of cells to the overlap locations of the adjacent sets of cells. 
+// Because a set of cells interfaces in each direction with exactly one 
+// other set, we only need to fill six different buffers. We could try to
+// overlap communication with computation, by computing
+// some internal values while communicating boundary values, but this
+// adds so much overhead that it's not clearly useful. 
+//---------------------------------------------------------------------
+
+      int i, j, k, c, m, p0, p1, phase,
+           p2, p3, p4, p5, b_size[6], ss[6], 
+           sr[6], error;   // NOTE(review): error is never used in this function
+//     index into b_size/ss/sr: 0 east, 1 west, 2 north, 3 south, 4 top, 5 bottom
+#define b_size(m) b_size[m]
+#define ss(m) ss[m]
+#define sr(m) sr[m]
+
+//---------------------------------------------------------------------
+//     single node: there are no faces to copy, only compute_rhs() is run
+//---------------------------------------------------------------------
+      if (no_nodes == 1) {
+         compute_rhs();
+         return;
+      }
+//     start offsets of the six send areas inside out_buffer
+      ss(0) = start_send_east;
+      ss(1) = start_send_west;
+      ss(2) = start_send_north;
+      ss(3) = start_send_south;
+      ss(4) = start_send_top;
+      ss(5) = start_send_bottom;
+//     start offsets of the six receive areas inside in_buffer
+      sr(0) = start_recv_east;
+      sr(1) = start_recv_west;
+      sr(2) = start_recv_north;
+      sr(3) = start_recv_south;
+      sr(4) = start_recv_top;
+      sr(5) = start_recv_bottom;
+//     number of doubles exchanged in each direction
+      b_size(0) = east_size   ;
+      b_size(1) = west_size   ;
+      b_size(2) = north_size  ;
+      b_size(3) = south_size  ;
+      b_size(4) = top_size    ;
+      b_size(5) = bottom_size ;
+
+//---------------------------------------------------------------------
+//     because the difference stencil for the diagonalized scheme is 
+//     orthogonal, we do not have to perform the staged copying of faces,
+//     but can send all face information simultaneously to the neighboring
+//     cells in all directions          
+//---------------------------------------------------------------------
+      p0 = 0;
+      p1 = 0;
+      p2 = 0;
+      p3 = 0;
+      p4 = 0;
+      p5 = 0;
+//     p0..p5 count the values packed so far into each of the six send areas
+      for (c = 1; c <= ncells; c++) {
+
+//---------------------------------------------------------------------
+//     fill the buffer to be sent to eastern neighbors (i-dir)
+//---------------------------------------------------------------------
+         if (cell_coord(1,c) != ncells) {
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = cell_size(1,c)-2; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        out_buffer(ss(0)+p0) = u(m,i,j,k,c);
+                        p0 = p0 + 1;
+                     }
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     fill the buffer to be sent to western neighbors 
+//---------------------------------------------------------------------
+         if (cell_coord(1,c) != 1) {
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = 0; i <= 1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        out_buffer(ss(1)+p1) = u(m,i,j,k,c);
+                        p1 = p1 + 1;
+                     }
+                  }
+               }
+            }
+
+         }
+
+//---------------------------------------------------------------------
+//     fill the buffer to be sent to northern neighbors (j_dir)
+//---------------------------------------------------------------------
+         if (cell_coord(2,c) != ncells) {
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = cell_size(2,c)-2; j <= cell_size(2,c)-1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        out_buffer(ss(2)+p2) = u(m,i,j,k,c);
+                        p2 = p2 + 1;
+                     }
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     fill the buffer to be sent to southern neighbors 
+//---------------------------------------------------------------------
+         if (cell_coord(2,c)!= 1) {
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = 0; j <= 1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        out_buffer(ss(3)+p3) = u(m,i,j,k,c);
+                        p3 = p3 + 1;
+                     }
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     fill the buffer to be sent to top neighbors (k-dir)
+//---------------------------------------------------------------------
+         if (cell_coord(3,c) != ncells) {
+            for (k = cell_size(3,c)-2; k <= cell_size(3,c)-1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        out_buffer(ss(4)+p4) = u(m,i,j,k,c);
+                        p4 = p4 + 1;
+                     }
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     fill the buffer to be sent to bottom neighbors
+//---------------------------------------------------------------------
+         if (cell_coord(3,c)!= 1) {
+            for (k = 0; k <= 1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        out_buffer(ss(5)+p5) = u(m,i,j,k,c);
+                        p5 = p5 + 1;
+                     }
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     cell loop
+//---------------------------------------------------------------------
+      }
+//     Exchange: phases 0, 1 and 2 are run in turn.
+      for (phase = 0; phase < 3; phase++) {
+//     A send/receive is issued in the phase that equals its send_color/recv_color entry.
+      if (send_color[WESTDIR]==phase)  {
+        RCCE_send((char*)(&out_buffer(ss(1))), b_size(1)*sizeof(double), predecessor(1));
+      }
+      if (recv_color[WESTDIR]==phase)  {
+        RCCE_recv((char*)(&in_buffer(sr(0))),  b_size(0)*sizeof(double), successor(1));
+      }
+//     Each direction pairs a send with a receive from the opposite side, e.g. WESTDIR sends to predecessor(1), receives from successor(1).
+      if (send_color[EASTDIR]==phase)  {
+        RCCE_send((char*)(&out_buffer(ss(0))), b_size(0)*sizeof(double), successor(1));
+      }
+      if (recv_color[EASTDIR]==phase)  {
+        RCCE_recv((char*)(&in_buffer(sr(1))),  b_size(1)*sizeof(double), predecessor(1));
+      }
+//     NOTE(review): the colors presumably come from make_color() and order the calls to avoid deadlock - confirm
+      if (send_color[SOUTHDIR]==phase)  {
+        RCCE_send((char*)(&out_buffer(ss(3))), b_size(3)*sizeof(double), predecessor(2));
+      }
+      if (recv_color[SOUTHDIR]==phase)  {
+        RCCE_recv((char*)(&in_buffer(sr(2))),  b_size(2)*sizeof(double), successor(2));
+      }
+
+      if (send_color[NORTHDIR]==phase)  {
+        RCCE_send((char*)(&out_buffer(ss(2))), b_size(2)*sizeof(double),successor(2));
+      }
+      if (recv_color[NORTHDIR]==phase)  {
+        RCCE_recv((char*)(&in_buffer(sr(3))),  b_size(3)*sizeof(double), predecessor(2));
+      }
+
+      if (send_color[BOTTOMDIR]==phase)  {
+        RCCE_send((char*)(&out_buffer(ss(5))), b_size(5)*sizeof(double),predecessor(3));
+      }
+      if (recv_color[BOTTOMDIR]==phase)  {
+        RCCE_recv((char*)(&in_buffer(sr(4))),  b_size(4)*sizeof(double), successor(3));
+      }
+
+      if (send_color[TOPDIR]==phase)  {
+        RCCE_send((char*)(&out_buffer(ss(4))), b_size(4)*sizeof(double),successor(3));
+      }
+      if (recv_color[TOPDIR]==phase)  {
+        RCCE_recv((char*)(&in_buffer(sr(5))),  b_size(5)*sizeof(double), predecessor(3));
+      }
+   }      
+
+//---------------------------------------------------------------------
+//     unpack the data that has just been received;             
+//---------------------------------------------------------------------
+      p0 = 0;
+      p1 = 0;
+      p2 = 0;
+      p3 = 0;
+      p4 = 0;
+      p5 = 0;
+//     NOTE: the counters are paired differently here: p0 reads sr(1), p1 sr(0), p2 sr(3), p3 sr(2), p4 sr(5), p5 sr(4)
+      for (c = 1; c <= ncells; c++) {
+//     data in the west receive area fills the layers i = -2, -1
+         if (cell_coord(1,c) != 1) {
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = -2; i <= -1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        u(m,i,j,k,c) = in_buffer(sr(1)+p0);
+                        p0 = p0 + 1;
+                     }
+                  }
+               }
+            }
+         }
+//     data in the east receive area fills i = cell_size(1,c), cell_size(1,c)+1
+         if (cell_coord(1,c) != ncells) {
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = cell_size(1,c); i <= cell_size(1,c)+1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        u(m,i,j,k,c) = in_buffer(sr(0)+p1);
+                        p1 = p1 + 1;
+                     }
+                  }
+               }
+            }
+         }
+//     data in the south receive area fills the layers j = -2, -1
+         if (cell_coord(2,c) != 1) {
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = -2; j <= -1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        u(m,i,j,k,c) = in_buffer(sr(3)+p2);
+                        p2 = p2 + 1;
+                     }
+                  }
+               }
+            }
+
+         }
+//     data in the north receive area fills j = cell_size(2,c), cell_size(2,c)+1
+         if (cell_coord(2,c) != ncells) {
+            for (k = 0; k <= cell_size(3,c)-1; k++) {
+               for (j = cell_size(2,c); j <= cell_size(2,c)+1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        u(m,i,j,k,c) = in_buffer(sr(2)+p3);
+                        p3 = p3 + 1;
+                     }
+                  }
+               }
+            }
+         }
+//     data in the bottom receive area fills the layers k = -2, -1
+         if (cell_coord(3,c) != 1) {
+            for (k = -2; k <= -1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        u(m,i,j,k,c) = in_buffer(sr(5)+p4);
+                        p4 = p4 + 1;
+                     }
+                  }
+               }
+            }
+         }
+//     data in the top receive area fills k = cell_size(3,c), cell_size(3,c)+1
+         if (cell_coord(3,c) != ncells) {
+            for (k = cell_size(3,c); k <= cell_size(3,c)+1; k++) {
+               for (j = 0; j <= cell_size(2,c)-1; j++) {
+                  for (i = 0; i <= cell_size(1,c)-1; i++) {
+                     for (m = 1; m <= 5; m++) {
+                        u(m,i,j,k,c) = in_buffer(sr(4)+p5);
+                        p5 = p5 + 1;
+                     }
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     cells loop
+//---------------------------------------------------------------------
+      }
+
+//---------------------------------------------------------------------
+//     do the rest of the rhs that uses the copied face values          
+//---------------------------------------------------------------------
+      compute_rhs();
+
+      return;
+}

+ 78 - 0
RCCE_V2.0/apps/NPB/BT/define.c

@@ -0,0 +1,78 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include "header.h"
+
+void compute_buffer_size(int dim) {
+
+//---------------------------------------------------------------------
+//     Sets the six directional buffer sizes (west, east, south, north,
+//     bottom, top) and the start offsets of the matching send and
+//     receive segments.
+//
+//     dim: multiplier applied to every face; one face contributes
+//          (product of the two in-face cell sizes) * dim * 2 entries.
+//---------------------------------------------------------------------
+
+      int cell, offset;
+      int x_face, y_face, z_face;
+
+//---------------------------------------------------------------------
+//     with a single cell nothing is computed; sizes and offsets are
+//     left exactly as they were
+//---------------------------------------------------------------------
+      if (ncells == 1) return;
+
+      west_size   = 0;
+      east_size   = 0;
+      south_size  = 0;
+      north_size  = 0;
+      bottom_size = 0;
+      top_size    = 0;
+
+//---------------------------------------------------------------------
+//     accumulate the actual buffer sizes cell by cell; a face is not
+//     counted when the cell's coordinate in that direction is 1 (low
+//     side) or ncells (high side), i.e. the original author's "one
+//     cell face that doesn't need buffer space, because it is at the
+//     boundary of the grid"
+//---------------------------------------------------------------------
+      for (cell = 1; cell <= ncells; cell++) {
+         x_face = cell_size(2,cell) * cell_size(3,cell) * dim * 2;
+         y_face = cell_size(1,cell) * cell_size(3,cell) * dim * 2;
+         z_face = cell_size(1,cell) * cell_size(2,cell) * dim * 2;
+
+         if (cell_coord(1,cell) != 1)      west_size   += x_face;
+         if (cell_coord(1,cell) != ncells) east_size   += x_face;
+
+         if (cell_coord(2,cell) != 1)      south_size  += y_face;
+         if (cell_coord(2,cell) != ncells) north_size  += y_face;
+
+         if (cell_coord(3,cell) != 1)      bottom_size += z_face;
+         if (cell_coord(3,cell) != ncells) top_size    += z_face;
+      }
+
+//---------------------------------------------------------------------
+//     segments are laid out back to back, starting at offset 1, in
+//     the order west, east, south, north, bottom, top; the send and
+//     receive layouts use the same offsets
+//---------------------------------------------------------------------
+      offset = 1;
+      start_send_west   = offset;
+      start_recv_west   = offset;
+
+      offset += west_size;
+      start_send_east   = offset;
+      start_recv_east   = offset;
+
+      offset += east_size;
+      start_send_south  = offset;
+      start_recv_south  = offset;
+
+      offset += south_size;
+      start_send_north  = offset;
+      start_recv_north  = offset;
+
+      offset += north_size;
+      start_send_bottom = offset;
+      start_recv_bottom = offset;
+
+      offset += bottom_size;
+      start_send_top    = offset;
+      start_recv_top    = offset;
+}
+

+ 121 - 0
RCCE_V2.0/apps/NPB/BT/error.c

@@ -0,0 +1,121 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include <math.h>
+#include "header.h"
+#include "mpinpb.h"
+#include "applu_macros.h"
+
+// 1-based accessors for the arrays named u_exact, rms and rms_work used in
+// the functions below: name(m) expands to name[m-1].
+// The argument is substituted without parentheses, so only pass simple
+// expressions (an identifier or a literal).
+#define u_exact(m) u_exact[m-1]
+#define rms(m) rms[m-1]
+#define rms_work(m) rms_work[m-1]
+
+void error_norm(double rms[]) {
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+//---------------------------------------------------------------------
+//     this function computes the norm of the difference between the
+//     computed solution and the exact solution
+//
+//     rms (output): 5 values, one per solution component m. Each is
+//     the square root of the RCCE_SUM reduction over RCCE_COMM_WORLD
+//     of (u - u_exact)^2, divided by (grid_points(d)-2) for d = 1,2,3.
+//---------------------------------------------------------------------
+
+      int c, i, j, k, m, ii, jj, kk, d;
+      double xi, eta, zeta, u_exact[5], rms_work[5],
+           add;
+
+      for (m = 1; m <= 5; m++) {
+         rms_work(m) = 0.0e0;
+      }
+
+//---------------------------------------------------------------------
+//     i, j, k run over cell_low..cell_high and are scaled by
+//     dnxm1, dnym1, dnzm1 to obtain the coordinates passed to
+//     exact_solution; ii, jj, kk are the matching indices into u and
+//     restart at 0 for every cell
+//---------------------------------------------------------------------
+      for (c = 1; c <= ncells; c++) {
+         kk = 0;
+         for (k = cell_low(3,c); k <= cell_high(3,c); k++) {
+            zeta = (double)(k) * dnzm1;
+            jj = 0;
+            for (j = cell_low(2,c); j <= cell_high(2,c); j++) {
+               eta = (double)(j) * dnym1;
+               ii = 0;
+               for (i = cell_low(1,c); i <= cell_high(1,c); i++) {
+                  xi = (double)(i) * dnxm1;
+                  exact_solution(xi, eta, zeta, u_exact);
+
+                  for (m = 1; m <= 5; m++) {
+                     add = u(m,ii,jj,kk,c)-u_exact(m);
+                     rms_work(m) = rms_work(m) + add*add;
+                  }
+                  ii = ii + 1;
+               }
+               jj = jj + 1;
+            }
+            kk = kk + 1;
+         }
+      }
+
+//---------------------------------------------------------------------
+//     combine the local partial sums into rms
+//     NOTE(review): any status returned by RCCE_allreduce is ignored
+//     here -- confirm that is intended
+//---------------------------------------------------------------------
+      RCCE_allreduce((char*)rms_work, (char*)rms, 5, RCCE_DOUBLE, RCCE_SUM, RCCE_COMM_WORLD);
+
+      for (m = 1; m <= 5; m++) {
+         for (d = 1; d <= 3; d++) {
+            rms(m) = rms(m) / (double)(grid_points(d)-2);
+         }
+         rms(m) = sqrt(rms(m));
+      }
+
+      return;
+}
+
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+void rhs_norm(double rms[]) {
+
+//---------------------------------------------------------------------
+//     computes a norm of the array rhs
+//
+//     rms (output): 5 values, one per component m. Each is the square
+//     root of the RCCE_SUM reduction over RCCE_COMM_WORLD of rhs^2,
+//     divided by (grid_points(d)-2) for d = 1,2,3.
+//---------------------------------------------------------------------
+
+      int c, i, j, k, d, m;
+      double rms_work[5], add;
+
+      for (m = 1; m <= 5; m++) {
+         rms_work(m) = 0.0e0;
+      }
+
+//---------------------------------------------------------------------
+//     in each direction only the points from start(.,c) up to
+//     cell_size(.,c)-end(.,c)-1 of every cell contribute
+//---------------------------------------------------------------------
+      for (c = 1; c <= ncells; c++) {
+         for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     add = rhs(m,i,j,k,c);
+                     rms_work(m) = rms_work(m) + add*add;
+                  }
+               }
+            }
+         }
+      }
+
+//---------------------------------------------------------------------
+//     combine the local partial sums into rms
+//     NOTE(review): any status returned by RCCE_allreduce is ignored
+//     here -- confirm that is intended
+//---------------------------------------------------------------------
+      RCCE_allreduce((char*)rms_work, (char*)rms, 5, RCCE_DOUBLE, RCCE_SUM, RCCE_COMM_WORLD);
+
+      for (m = 1; m <= 5; m++) {
+         for (d = 1; d <= 3; d++) {
+            rms(m) = rms(m) / (double)(grid_points(d)-2);
+         }
+         rms(m) = sqrt(rms(m));
+      }
+
+      return;
+}
+

+ 375 - 0
RCCE_V2.0/apps/NPB/BT/exact_rhs.c

@@ -0,0 +1,375 @@
+
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include "header.h"
+
+void exact_rhs() {
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+//---------------------------------------------------------------------
+//     compute the right hand side based on exact solution
+//
+//     For every cell c the function
+//       1. zeroes forcing over the whole cell,
+//       2. for each of the three directions (xi, eta, zeta) evaluates
+//          exact_solution along one grid line at a time, subtracts
+//          the flux differences from forcing and then subtracts the
+//          fourth-order dissipation terms scaled by dssp,
+//       3. negates forcing over the range start..cell_size-end-1.
+//
+//     ue, buf, cuf and q hold per-line scratch values and are defined
+//     outside this function (presumably in header.h -- TODO confirm).
+//---------------------------------------------------------------------
+
+      double dtemp[5], xi, eta, zeta, dtpp;
+      int          c, m, i, j, k, ip1, im1, jp1, 
+           jm1, km1, kp1;
+// 1-based accessor for the local array dtemp: dtemp(m) is dtemp[m-1]
+#define dtemp(m) dtemp[m-1]
+
+
+//---------------------------------------------------------------------
+//     loop over all cells owned by this node
+//---------------------------------------------------------------------
+      for (c = 1; c <= ncells; c++) {
+
+//---------------------------------------------------------------------
+//     initialize: clear forcing over the full extent of the cell
+//---------------------------------------------------------------------
+         for (k = 0; k <= cell_size(3,c)-1; k++) {
+            for (j = 0; j <= cell_size(2,c)-1; j++) {
+               for (i = 0; i <= cell_size(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     forcing(m,i,j,k,c) = 0.0e0;
+                  }
+               }
+            }
+         }
+
+//---------------------------------------------------------------------
+//     xi-direction flux differences
+//---------------------------------------------------------------------
+         for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+            zeta = (double)(k+cell_low(3,c)) * dnzm1;
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               eta = (double)(j+cell_low(2,c)) * dnym1;
+
+//             fill the line buffers; the range is wider than the
+//             update range below: the lower bound is -2 when
+//             start(1,c) is 0 and 0 when it is 1, the upper bound is
+//             cell_size(1,c)+1 when end(1,c) is 0 and
+//             cell_size(1,c)-1 when it is 1
+               for (i = -2*(1-start(1,c)); i <= cell_size(1,c)+1-2*end(1,c); i++) {
+                  xi = (double)(i+cell_low(1,c)) * dnxm1;
+
+                  exact_solution(xi, eta, zeta, dtemp);
+                  for (m = 1; m <= 5; m++) {
+                     ue(i,m) = dtemp(m);
+                  }
+
+//                reciprocal of the first solution component
+                  dtpp = 1.0e0 / dtemp(1);
+
+//                components 2..5 divided by component 1
+                  for (m = 2; m <= 5; m++) {
+                     buf(i,m) = dtpp * dtemp(m);
+                  }
+
+//                cuf: square of the xi-direction entry buf(i,2);
+//                buf(i,1): sum of squares of buf(i,2..4)
+                  cuf(i)   = buf(i,2) * buf(i,2);
+                  buf(i,1) = cuf(i) + buf(i,3) * buf(i,3) + 
+                       buf(i,4) * buf(i,4) ;
+                  q(i) = 0.5e0*(buf(i,2)*ue(i,2) + buf(i,3)*ue(i,3) +
+                       buf(i,4)*ue(i,4));
+
+               }
+               
+//             central differences using the neighbours i-1 and i+1
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  im1 = i-1;
+                  ip1 = i+1;
+
+                  forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
+                       tx2*( ue(ip1,2)-ue(im1,2) )+
+                       dx1tx1*(ue(ip1,1)-2.0e0*ue(i,1)+ue(im1,1));
+
+                  forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tx2 * (
+                       (ue(ip1,2)*buf(ip1,2)+c2*(ue(ip1,5)-q(ip1)))-
+                       (ue(im1,2)*buf(im1,2)+c2*(ue(im1,5)-q(im1))))+
+                       xxcon1*(buf(ip1,2)-2.0e0*buf(i,2)+buf(im1,2))+
+                       dx2tx1*( ue(ip1,2)-2.0e0* ue(i,2)+ue(im1,2));
+
+                  forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tx2 * (
+                       ue(ip1,3)*buf(ip1,2)-ue(im1,3)*buf(im1,2))+
+                       xxcon2*(buf(ip1,3)-2.0e0*buf(i,3)+buf(im1,3))+
+                       dx3tx1*( ue(ip1,3)-2.0e0*ue(i,3) +ue(im1,3));
+                  
+                  forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tx2*(
+                       ue(ip1,4)*buf(ip1,2)-ue(im1,4)*buf(im1,2))+
+                       xxcon2*(buf(ip1,4)-2.0e0*buf(i,4)+buf(im1,4))+
+                       dx4tx1*( ue(ip1,4)-2.0e0* ue(i,4)+ ue(im1,4));
+
+                  forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tx2*(
+                       buf(ip1,2)*(c1*ue(ip1,5)-c2*q(ip1))-
+                       buf(im1,2)*(c1*ue(im1,5)-c2*q(im1)))+
+                       0.5e0*xxcon3*(buf(ip1,1)-2.0e0*buf(i,1)+
+                       buf(im1,1))+
+                       xxcon4*(cuf(ip1)-2.0e0*cuf(i)+cuf(im1))+
+                       xxcon5*(buf(ip1,5)-2.0e0*buf(i,5)+buf(im1,5))+
+                       dx5tx1*( ue(ip1,5)-2.0e0* ue(i,5)+ ue(im1,5));
+               }
+
+//---------------------------------------------------------------------
+//     Fourth-order dissipation
+//     when start(1,c) > 0 the points i = 1 and i = 2 use stencils
+//     that omit the terms below i = 1; when end(1,c) > 0 the points
+//     cell_size(1,c)-3 and cell_size(1,c)-2 omit the terms above
+//     cell_size(1,c)-2; all points in between use the full
+//     five-point stencil
+//---------------------------------------------------------------------
+               if (start(1,c) > 0) {
+                  for (m = 1; m <= 5; m++) {
+                     i = 1;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (5.0e0*ue(i,m) - 4.0e0*ue(i+1,m) +ue(i+2,m));
+                     i = 2;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (-4.0e0*ue(i-1,m) + 6.0e0*ue(i,m) -
+                          4.0e0*ue(i+1,m) +       ue(i+2,m));
+                  }
+               }
+
+               for (i = start(1,c)*3; i <= cell_size(1,c)-3*end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
+                          (ue(i-2,m) - 4.0e0*ue(i-1,m) +
+                          6.0e0*ue(i,m) - 4.0e0*ue(i+1,m) + ue(i+2,m));
+                  }
+               }
+
+               if (end(1,c) > 0) {
+                  for (m = 1; m <= 5; m++) {
+                     i = cell_size(1,c)-3;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (ue(i-2,m) - 4.0e0*ue(i-1,m) +
+                          6.0e0*ue(i,m) - 4.0e0*ue(i+1,m));
+                     i = cell_size(1,c)-2;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (ue(i-2,m) - 4.0e0*ue(i-1,m) + 5.0e0*ue(i,m));
+                  }
+               }
+
+            }
+         }
+
+//---------------------------------------------------------------------
+//     eta-direction flux differences
+//     same scheme as the xi direction with j as the line index and
+//     buf(j,3) as the squared entry stored in cuf
+//---------------------------------------------------------------------
+         for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+            zeta = (double)(k+cell_low(3,c)) * dnzm1;
+            for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+               xi = (double)(i+cell_low(1,c)) * dnxm1;
+
+//             fill the line buffers over the widened j range
+               for (j = -2*(1-start(2,c)); j <= cell_size(2,c)+1-2*end(2,c); j++) {
+                  eta = (double)(j+cell_low(2,c)) * dnym1;
+
+                  exact_solution(xi, eta, zeta, dtemp);
+                  for (m = 1; m <= 5; m++) {
+                     ue(j,m) = dtemp(m);
+                  }
+                  
+                  dtpp = 1.0e0/dtemp(1);
+
+                  for (m = 2; m <= 5; m++) {
+                     buf(j,m) = dtpp * dtemp(m);
+                  }
+
+                  cuf(j)   = buf(j,3) * buf(j,3);
+                  buf(j,1) = cuf(j) + buf(j,2) * buf(j,2) + 
+                       buf(j,4) * buf(j,4);
+                  q(j) = 0.5e0*(buf(j,2)*ue(j,2) + buf(j,3)*ue(j,3) +
+                       buf(j,4)*ue(j,4));
+               }
+
+//             central differences using the neighbours j-1 and j+1
+               for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+                  jm1 = j-1;
+                  jp1 = j+1;
+                  
+                  forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
+                       ty2*( ue(jp1,3)-ue(jm1,3) )+
+                       dy1ty1*(ue(jp1,1)-2.0e0*ue(j,1)+ue(jm1,1));
+
+                  forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - ty2*(
+                       ue(jp1,2)*buf(jp1,3)-ue(jm1,2)*buf(jm1,3))+
+                       yycon2*(buf(jp1,2)-2.0e0*buf(j,2)+buf(jm1,2))+
+                       dy2ty1*( ue(jp1,2)-2.0* ue(j,2)+ ue(jm1,2));
+
+                  forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - ty2*(
+                       (ue(jp1,3)*buf(jp1,3)+c2*(ue(jp1,5)-q(jp1)))-
+                       (ue(jm1,3)*buf(jm1,3)+c2*(ue(jm1,5)-q(jm1))))+
+                       yycon1*(buf(jp1,3)-2.0e0*buf(j,3)+buf(jm1,3))+
+                       dy3ty1*( ue(jp1,3)-2.0e0*ue(j,3) +ue(jm1,3));
+
+                  forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - ty2*(
+                       ue(jp1,4)*buf(jp1,3)-ue(jm1,4)*buf(jm1,3))+
+                       yycon2*(buf(jp1,4)-2.0e0*buf(j,4)+buf(jm1,4))+
+                       dy4ty1*( ue(jp1,4)-2.0e0*ue(j,4)+ ue(jm1,4));
+
+                  forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - ty2*(
+                       buf(jp1,3)*(c1*ue(jp1,5)-c2*q(jp1))-
+                       buf(jm1,3)*(c1*ue(jm1,5)-c2*q(jm1)))+
+                       0.5e0*yycon3*(buf(jp1,1)-2.0e0*buf(j,1)+
+                       buf(jm1,1))+
+                       yycon4*(cuf(jp1)-2.0e0*cuf(j)+cuf(jm1))+
+                       yycon5*(buf(jp1,5)-2.0e0*buf(j,5)+buf(jm1,5))+
+                       dy5ty1*(ue(jp1,5)-2.0e0*ue(j,5)+ue(jm1,5));
+               }
+
+//---------------------------------------------------------------------
+//     Fourth-order dissipation
+//     shortened stencils at j = 1, 2 when start(2,c) > 0 and at
+//     cell_size(2,c)-3, cell_size(2,c)-2 when end(2,c) > 0; full
+//     five-point stencil in between
+//---------------------------------------------------------------------
+               if (start(2,c) > 0) {
+                  for (m = 1; m <= 5; m++) {
+                     j = 1;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (5.0e0*ue(j,m) - 4.0e0*ue(j+1,m) +ue(j+2,m));
+                     j = 2;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (-4.0e0*ue(j-1,m) + 6.0e0*ue(j,m) -
+                          4.0e0*ue(j+1,m) +       ue(j+2,m));
+                  }
+               }
+
+               for (j = start(2,c)*3; j <= cell_size(2,c)-3*end(2,c)-1; j++) {
+                  for (m = 1; m <= 5; m++) {
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
+                          (ue(j-2,m) - 4.0e0*ue(j-1,m) +
+                          6.0e0*ue(j,m) - 4.0e0*ue(j+1,m) + ue(j+2,m));
+                  }
+               }
+
+               if (end(2,c) > 0) {
+                  for (m = 1; m <= 5; m++) {
+                     j = cell_size(2,c)-3;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (ue(j-2,m) - 4.0e0*ue(j-1,m) +
+                          6.0e0*ue(j,m) - 4.0e0*ue(j+1,m));
+                     j = cell_size(2,c)-2;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (ue(j-2,m) - 4.0e0*ue(j-1,m) + 5.0e0*ue(j,m));
+
+                  }
+               }
+
+            }
+         }
+
+//---------------------------------------------------------------------
+//     zeta-direction flux differences
+//     same scheme again with k as the line index and buf(k,4) as the
+//     squared entry stored in cuf
+//---------------------------------------------------------------------
+         for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+            eta = (double)(j+cell_low(2,c)) * dnym1;
+            for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+               xi = (double)(i+cell_low(1,c)) * dnxm1;
+
+//             fill the line buffers over the widened k range
+               for (k = -2*(1-start(3,c)); k <= cell_size(3,c)+1-2*end(3,c); k++) {
+                  zeta = (double)(k+cell_low(3,c)) * dnzm1;
+
+                  exact_solution(xi, eta, zeta, dtemp);
+                  for (m = 1; m <= 5; m++) {
+                     ue(k,m) = dtemp(m);
+                  }
+
+                  dtpp = 1.0e0/dtemp(1);
+
+                  for (m = 2; m <= 5; m++) {
+                     buf(k,m) = dtpp * dtemp(m);
+                  }
+
+                  cuf(k)   = buf(k,4) * buf(k,4);
+                  buf(k,1) = cuf(k) + buf(k,2) * buf(k,2) + 
+                       buf(k,3) * buf(k,3);
+                  q(k) = 0.5e0*(buf(k,2)*ue(k,2) + buf(k,3)*ue(k,3) +
+                       buf(k,4)*ue(k,4));
+               }
+
+//             central differences using the neighbours k-1 and k+1
+               for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+                  km1 = k-1;
+                  kp1 = k+1;
+                  
+                  forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
+                       tz2*( ue(kp1,4)-ue(km1,4) )+
+                       dz1tz1*(ue(kp1,1)-2.0e0*ue(k,1)+ue(km1,1));
+
+                  forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tz2 * (
+                       ue(kp1,2)*buf(kp1,4)-ue(km1,2)*buf(km1,4))+
+                       zzcon2*(buf(kp1,2)-2.0e0*buf(k,2)+buf(km1,2))+
+                       dz2tz1*( ue(kp1,2)-2.0e0* ue(k,2)+ ue(km1,2));
+
+                  forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tz2 * (
+                       ue(kp1,3)*buf(kp1,4)-ue(km1,3)*buf(km1,4))+
+                       zzcon2*(buf(kp1,3)-2.0e0*buf(k,3)+buf(km1,3))+
+                       dz3tz1*(ue(kp1,3)-2.0e0*ue(k,3)+ue(km1,3));
+
+                  forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tz2 * (
+                       (ue(kp1,4)*buf(kp1,4)+c2*(ue(kp1,5)-q(kp1)))-
+                       (ue(km1,4)*buf(km1,4)+c2*(ue(km1,5)-q(km1))))+
+                       zzcon1*(buf(kp1,4)-2.0e0*buf(k,4)+buf(km1,4))+
+                       dz4tz1*( ue(kp1,4)-2.0e0*ue(k,4) +ue(km1,4));
+
+                  forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tz2 * (
+                       buf(kp1,4)*(c1*ue(kp1,5)-c2*q(kp1))-
+                       buf(km1,4)*(c1*ue(km1,5)-c2*q(km1)))+
+                       0.5e0*zzcon3*(buf(kp1,1)-2.0e0*buf(k,1)
+                       +buf(km1,1))+
+                       zzcon4*(cuf(kp1)-2.0e0*cuf(k)+cuf(km1))+
+                       zzcon5*(buf(kp1,5)-2.0e0*buf(k,5)+buf(km1,5))+
+                       dz5tz1*( ue(kp1,5)-2.0e0*ue(k,5)+ ue(km1,5));
+               }
+
+//---------------------------------------------------------------------
+//     Fourth-order dissipation
+//     shortened stencils at k = 1, 2 when start(3,c) > 0 and at
+//     cell_size(3,c)-3, cell_size(3,c)-2 when end(3,c) > 0; full
+//     five-point stencil in between
+//---------------------------------------------------------------------
+               if (start(3,c) > 0) {
+                  for (m = 1; m <= 5; m++) {
+                     k = 1;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (5.0e0*ue(k,m) - 4.0e0*ue(k+1,m) +ue(k+2,m));
+                     k = 2;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (-4.0e0*ue(k-1,m) + 6.0e0*ue(k,m) -
+                          4.0e0*ue(k+1,m) +       ue(k+2,m));
+                  }
+               }
+
+               for (k = start(3,c)*3; k <= cell_size(3,c)-3*end(3,c)-1; k++) {
+                  for (m = 1; m <= 5; m++) {
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
+                          (ue(k-2,m) - 4.0e0*ue(k-1,m) +
+                          6.0e0*ue(k,m) - 4.0e0*ue(k+1,m) + ue(k+2,m));
+                  }
+               }
+
+               if (end(3,c) > 0) {
+                  for (m = 1; m <= 5; m++) {
+                     k = cell_size(3,c)-3;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (ue(k-2,m) - 4.0e0*ue(k-1,m) +
+                          6.0e0*ue(k,m) - 4.0e0*ue(k+1,m));
+                     k = cell_size(3,c)-2;
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+                          (ue(k-2,m) - 4.0e0*ue(k-1,m) + 5.0e0*ue(k,m));
+                  }
+               }
+
+            }
+         }
+
+//---------------------------------------------------------------------
+//     now change the sign of the forcing function over the same
+//     start..cell_size-end-1 range that was updated above
+//---------------------------------------------------------------------
+         for (k = start(3,c); k <= cell_size(3,c)-end(3,c)-1; k++) {
+            for (j = start(2,c); j <= cell_size(2,c)-end(2,c)-1; j++) {
+               for (i = start(1,c); i <= cell_size(1,c)-end(1,c)-1; i++) {
+                  for (m = 1; m <= 5; m++) {
+                     forcing(m,i,j,k,c) = -1.e0 * forcing(m,i,j,k,c);
+                  }
+               }
+            }
+         }
+
+      }
+
+      return;
+}

+ 43 - 0
RCCE_V2.0/apps/NPB/BT/exact_solution.c

@@ -0,0 +1,43 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include "header.h"
+
+void exact_solution(double xi,double eta,double zeta,double dtemp[]) {
+
+//---------------------------------------------------------------------
+//     Evaluates the exact solution at the point (xi, eta, zeta).
+//
+//     dtemp (output): entries 1..5 (stored at dtemp[0..4]). Entry m is
+//     ce(m,1) plus one nested polynomial per coordinate:
+//        xi   uses ce(m,2), ce(m,5), ce(m,8),  ce(m,11)
+//        eta  uses ce(m,3), ce(m,6), ce(m,9),  ce(m,12)
+//        zeta uses ce(m,4), ce(m,7), ce(m,10), ce(m,13)
+//---------------------------------------------------------------------
+
+      int eq;
+#define dtemp(m) dtemp[m-1]
+
+      eq = 1;
+      while (eq <= 5) {
+//       the terms are added left to right: constant, xi, eta, zeta
+         dtemp(eq) = ce(eq,1)
+           + xi  *(ce(eq,2) + xi  *(ce(eq,5) + xi  *(ce(eq,8)  + xi  *ce(eq,11))))
+           + eta *(ce(eq,3) + eta *(ce(eq,6) + eta *(ce(eq,9)  + eta *ce(eq,12))))
+           + zeta*(ce(eq,4) + zeta*(ce(eq,7) + zeta*(ce(eq,10) + zeta*ce(eq,13))));
+         eq = eq + 1;
+      }
+}
+
+

+ 287 - 0
RCCE_V2.0/apps/NPB/BT/header.h

@@ -0,0 +1,287 @@
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+//
+//  header.h
+//
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+#ifndef __HEADER_H
+#define __HEADER_H
+
+//---------------------------------------------------------------------
+// The following include file is generated automatically by the
+// "setparams" utility. It defines 
+//      maxcells:      the square root of the maximum number of processors
+//      problem_size:  12, 64, 102, 162 (for class T, A, B, C)
+//      dt_default:    default time step for this problem size if no
+//                     config file
+//      niter_default: default number of iterations for this problem size
+//---------------------------------------------------------------------
+
+#include "npbparams.h"
+#include "RCCE.h"
+//we introduce the next definition to avoid confusing the compiler, which
+//sometimes thinks the variable class is a reserved word
+#define class _class_
+#include "../common/common.h"
+
+// NOTE(review): AA/BB/CC and BLOCK_SIZE are not referenced in the code
+// visible here; presumably block selectors and the block dimension --
+// confirm at their use sites.
+#define AA 0
+#define BB 1
+#define CC 2
+#define BLOCK_SIZE 5
+
+// NOTE(review): presumably one message tag per face direction -- confirm
+// against the communication code.
+#define EAST   2000
+#define WEST   3000
+#define NORTH  4000
+#define SOUTH  5000
+#define BOTTOM 6000
+#define TOP    7000
+
+// Direction indices 0..5; make_color() iterates over them and uses them
+// to index send_color[] and recv_color[].
+#define WESTDIR   0
+#define EASTDIR   1
+#define SOUTHDIR  2
+#define NORTHDIR  3
+#define BOTTOMDIR 4
+#define TOPDIR    5
+
+// Upper bound on the extent of one cell; IMAX/JMAX/KMAX size the
+// per-cell arrays declared below.
+#define MAX_CELL_DIM ((PROBLEM_SIZE/MAXCELLS)+1)
+#define IMAX MAX_CELL_DIM
+#define JMAX MAX_CELL_DIM
+#define KMAX MAX_CELL_DIM
+
+// Number of doubles in in_buffer and out_buffer.
+#define BUF_SIZE (MAX_CELL_DIM*MAX_CELL_DIM*(MAXCELLS-1)*60+1)
+
+// Square of x. The whole expansion is parenthesized so that it composes
+// with neighbouring operators (e.g. a/SQR(b)). x is evaluated twice, so
+// it must not have side effects.
+#define SQR(x) ((x)*(x))
+
+//---------------------------------------------------------------------
+// Index macros: each maps Fortran-style subscripts onto the flat C
+// array of the same name. Every argument is parenthesized so that any
+// expression (including ?: or shifts) can be passed as a subscript.
+// Arguments appear exactly once in each expansion.
+//---------------------------------------------------------------------
+#define grid_points(m) grid_points[(m)-1]
+#define ce(m,n) ce[((m)-1)+5*((n)-1)]
+#define cell_coord(m,n) cell_coord[((m)-1)+3*((n)-1)]
+#define cell_low(m,n) cell_low[((m)-1)+3*((n)-1)]
+#define cell_high(m,n) cell_high[((m)-1)+3*((n)-1)]
+#define cell_size(m,n) cell_size[((m)-1)+3*((n)-1)]
+#define predecessor(m) predecessor[(m)-1]
+#define slice(m,n) slice[((m)-1)+3*((n)-1)]
+#define grid_size(m) grid_size[(m)-1]
+#define successor(m) successor[(m)-1]
+#define start(m,n) start[((m)-1)+3*((n)-1)]
+#define end(m,n) end[((m)-1)+3*((n)-1)]
+#define us(i,j,k,c) us[((i)+1)+(IMAX+2)*(((j)+1)+(JMAX+2)*(((k)+1)+(KMAX+2)*((c)-1)))]
+#define vs(i,j,k,c) vs[((i)+1)+(IMAX+2)*(((j)+1)+(JMAX+2)*(((k)+1)+(KMAX+2)*((c)-1)))]
+#define ws(i,j,k,c) ws[((i)+1)+(IMAX+2)*(((j)+1)+(JMAX+2)*(((k)+1)+(KMAX+2)*((c)-1)))]
+#define qs(i,j,k,c) qs[((i)+1)+(IMAX+2)*(((j)+1)+(JMAX+2)*(((k)+1)+(KMAX+2)*((c)-1)))]
+#define rho_i(i,j,k,c) rho_i[((i)+1)+(IMAX+2)*(((j)+1)+(JMAX+2)*(((k)+1)+(KMAX+2)*((c)-1)))]
+#define square(i,j,k,c) square[((i)+1)+(IMAX+2)*(((j)+1)+(JMAX+2)*(((k)+1)+(KMAX+2)*((c)-1)))]
+#define forcing(m,i,j,k,c) forcing[((m)-1)+5*((i)+IMAX*((j)+JMAX*((k)+KMAX*((c)-1))))]
+#define u(m,i,j,k,c) u[((m)-1)+5*(((i)+2)+(IMAX+4)*(((j)+2)+(JMAX+4)*(((k)+2)+(KMAX+4)*((c)-1))))]
+#define rhs(m,i,j,k,c) rhs[((m)-1)+5*(((i)+1)+(IMAX+1)*(((j)+1)+(JMAX+1)*(((k)+1)+(KMAX+1)*((c)-1))))]
+#define lhsc(m,n,i,j,k,c) lhsc[((m)-1)+5*(((n)-1)+5*(((i)+1)+(IMAX+1)*(((j)+1)+(JMAX+1)*(((k)+1)+(KMAX+1)*((c)-1)))))]
+#define backsub_info(m,i,j,c) backsub_info[((m)-1)+5*((i)+(IMAX+1)*((j)+(JMAX+1)*((c)-1)))]
+#define in_buffer(i) in_buffer[(i)-1]
+#define out_buffer(i) out_buffer[(i)-1]
+#define cv(m) cv[(m)+2]
+#define rhon(m) rhon[(m)+2]
+#define rhos(m) rhos[(m)+2]
+#define rhoq(m) rhoq[(m)+2]
+#define cuf(m) cuf[(m)+2]
+#define q(m) q[(m)+2]
+#define ue(m,n) ue[((m)+2)+(MAX_CELL_DIM+4)*((n)-1)]
+#define buf(m,n) buf[((m)+2)+(MAX_CELL_DIM+4)*((n)-1)]
+#define sum(m) sum[(m)-1]
+#define xce_sub(m) xce_sub[(m)-1]
+
+
+//---------------------------------------------------------------------
+// Global state of the benchmark. A translation unit that defines G_MAIN
+// before including this header gets the actual definitions; every other
+// translation unit gets the matching extern declarations from the #else
+// branch. The two lists must be kept identical apart from "extern".
+//---------------------------------------------------------------------
+#ifdef G_MAIN
+      int     ncells, grid_points[3];
+      double  elapsed_time;
+
+      double  tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, 
+                        dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, 
+                        dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, 
+                        ce[5*13], dxmax, dymax, dzmax, xxcon1, xxcon2, 
+                        xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
+                        dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
+                        yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
+                        zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, 
+                        dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, 
+                        dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, 
+                        c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
+                        dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, 
+                        c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, 
+                        c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16;
+
+// Per-cell bookkeeping: arrays sized MAXCELLS*3 hold 3 entries per cell.
+      int     cell_coord[MAXCELLS*3], cell_low[MAXCELLS*3], 
+              cell_high[MAXCELLS*3],  cell_size[MAXCELLS*3],
+              predecessor[3],         slice[MAXCELLS*3],
+              grid_size[3],           successor[3],
+              start[MAXCELLS*3],      end[MAXCELLS*3];
+
+// Field arrays; their extents match the strides used by the index
+// macros of the same names earlier in this header.
+      double 
+         us      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         vs      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         ws      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         qs      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         rho_i   [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         square  [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         forcing [5*IMAX*JMAX*KMAX*MAXCELLS],
+         u       [5*(IMAX+4)*(JMAX+4)*(KMAX+4)*MAXCELLS],
+         rhs     [5*(IMAX+1)*(JMAX+1)*(KMAX+1)*MAXCELLS],
+         lhsc    [5*5*(IMAX+1)*(JMAX+1)*(KMAX+1)*MAXCELLS],
+         backsub_info [5*(MAX_CELL_DIM+1)*(MAX_CELL_DIM+1)*MAXCELLS],
+         in_buffer[BUF_SIZE], out_buffer[BUF_SIZE];
+
+// One-dimensional work arrays, MAX_CELL_DIM+4 entries per line.
+      double cv[MAX_CELL_DIM+4],   rhon[MAX_CELL_DIM+4],
+             rhos[MAX_CELL_DIM+4], rhoq[MAX_CELL_DIM+4],
+             cuf[MAX_CELL_DIM+4],  q[MAX_CELL_DIM+4],
+             ue[(MAX_CELL_DIM+4)*5], buf[(MAX_CELL_DIM+4)*5];
+
+      int  west_size, east_size, bottom_size, top_size,
+               north_size, south_size, start_send_west, 
+               start_send_east, start_send_south, start_send_north,
+               start_send_bottom, start_send_top, start_recv_west,
+               start_recv_east, start_recv_south, start_recv_north,
+               start_recv_bottom, start_recv_top;
+//
+//     These are used by btio
+//
+      int collbuf_nodes, collbuf_size, iosize,
+              idump, record_length,
+              idump_sub, rd_interval;
+      double sum[NITER_DEFAULT], xce_sub[5];
+      long int iseek;
+// Communication colors per direction (0..5); set by make_color().
+      int    send_color[6], recv_color[6];
+#else
+// extern declarations mirroring the definitions in the G_MAIN branch
+extern int     ncells, grid_points[3];
+extern double  elapsed_time;
+
+extern double  tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, 
+                        dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, 
+                        dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, 
+                        ce[5*13], dxmax, dymax, dzmax, xxcon1, xxcon2, 
+                        xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
+                        dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
+                        yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
+                        zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, 
+                        dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, 
+                        dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, 
+                        c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
+                        dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, 
+                        c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, 
+                        c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16;
+
+extern int    cell_coord[MAXCELLS*3], cell_low[MAXCELLS*3], 
+              cell_high[MAXCELLS*3],  cell_size[MAXCELLS*3],
+              predecessor[3],         slice[MAXCELLS*3],
+              grid_size[3],           successor[3],
+              start[MAXCELLS*3],      end[MAXCELLS*3];
+
+extern double 
+         us      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         vs      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         ws      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         qs      [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         rho_i   [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         square  [(IMAX+2)*(JMAX+2)*(KMAX+2)*MAXCELLS],
+         forcing [5*IMAX*JMAX*KMAX*MAXCELLS],
+         u       [5*(IMAX+4)*(JMAX+4)*(KMAX+4)*MAXCELLS],
+         rhs     [5*(IMAX+1)*(JMAX+1)*(KMAX+1)*MAXCELLS],
+         lhsc    [5*5*(IMAX+1)*(JMAX+1)*(KMAX+1)*MAXCELLS],
+         backsub_info [5*(MAX_CELL_DIM+1)*(MAX_CELL_DIM+1)*MAXCELLS],
+         in_buffer[BUF_SIZE], out_buffer[BUF_SIZE];
+
+extern double cv[MAX_CELL_DIM+4],   rhon[MAX_CELL_DIM+4],
+             rhos[MAX_CELL_DIM+4], rhoq[MAX_CELL_DIM+4],
+             cuf[MAX_CELL_DIM+4],  q[MAX_CELL_DIM+4],
+             ue[(MAX_CELL_DIM+4)*5], buf[(MAX_CELL_DIM+4)*5];
+
+extern int  west_size, east_size, bottom_size, top_size,
+               north_size, south_size, start_send_west, 
+               start_send_east, start_send_south, start_send_north,
+               start_send_bottom, start_send_top, start_recv_west,
+               start_recv_east, start_recv_south, start_recv_north,
+               start_recv_bottom, start_recv_top;
+
+//
+//     These are used by btio
+//
+extern int collbuf_nodes, collbuf_size, iosize,
+              idump, record_length,
+              idump_sub, rd_interval;
+extern double sum[NITER_DEFAULT], xce_sub[5];
+extern long int iseek;
+extern int    send_color[6], recv_color[6];
+
+#endif /*G_MAIN*/
+
+//---------------------------------------------------------------------
+// Prototypes of the routines shared between the source files of this
+// benchmark. Function declarations have external linkage by default,
+// so no storage-class specifier is needed.
+//---------------------------------------------------------------------
+
+// array-argument helper routines
+void matvec_sub(double ablock[], double avec[], double bvec[]);
+void matmul_sub(double ablock[], double bblock[], double cblock[]);
+void binvcrhs(double lhs[], double c[], double r[]);
+void binvrhs(double lhs[], double r[]);
+void exact_solution(double xi,double eta,double zeta,double dtemp[]);
+
+// set-up routines
+int  setup_mpi(int *argc, char ***argv);
+void make_set(void);
+void set_constants(void);
+void lhsinit(void);
+void lhsabinit(double lhsa[], double lhsb[], int size);
+void initialize(void);
+void exact_rhs(void);
+void compute_buffer_size(int c);
+
+// remaining argument-free phases and the norm/verification routines
+void adi(void);
+void compute_rhs(void);
+void copy_faces(void);
+void x_solve(void);
+void y_solve(void);
+void z_solve(void);
+void add(void);
+void verify(int niter, char *class, int *verified);
+void error_norm(double rms[]);
+void rhs_norm(double rms[]);
+
+// btio-related routines
+void setup_btio(void);
+void output_timestep(void);
+void btio_cleanup(void);
+void btio_verify(int *verified);
+void accumulate_norms(double xce[]);
+void clear_timestep(void);
+
+//---------------------------------------------------------------------
+// When compiled with OpenMP every global declared in this header is
+// made threadprivate. The directives are kept inside the include guard
+// (as in mpinpb.h) so that a repeated inclusion of this header does not
+// emit them a second time.
+//---------------------------------------------------------------------
+#ifdef _OPENMP
+#pragma omp threadprivate (cell_coord, cell_low, cell_high,  cell_size)
+#pragma omp threadprivate (predecessor, slice, grid_size, successor)
+#pragma omp threadprivate (start, end)
+
+#pragma omp threadprivate (ncells, grid_points, elapsed_time)
+#pragma omp threadprivate (tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, \
+                           dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, \
+                           dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, \
+                           ce, dxmax, dymax, dzmax, xxcon1, xxcon2, \
+                           xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1, \
+                           dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4, \
+                           yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1, \
+                           zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, \
+                           dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, \
+                           dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, \
+                           c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt, \
+                           dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, \
+                           c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, \
+                           c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16)
+
+#pragma omp threadprivate (us, vs, ws, qs, rho_i, square, forcing, \
+                           u, rhs, lhsc, backsub_info, in_buffer, out_buffer)
+
+#pragma omp threadprivate (cv, rhon, rhos, rhoq, cuf, q, ue, buf)
+
+#pragma omp threadprivate (west_size, east_size, bottom_size, top_size, \
+                           north_size, south_size, start_send_west, \
+                           start_send_east, start_send_south, start_send_north, \
+                           start_send_bottom, start_send_top, start_recv_west, \
+                           start_recv_east, start_recv_south, start_recv_north, \
+                           start_recv_bottom, start_recv_top, send_color, recv_color)
+//
+//     These are used by btio
+//
+#pragma omp threadprivate (collbuf_nodes, collbuf_size, iosize, idump,\
+                           record_length, idump_sub, rd_interval, \
+                           sum, xce_sub, iseek)
+#endif
+
+#endif /* __HEADER_H */

+ 321 - 0
RCCE_V2.0/apps/NPB/BT/initialize.c

@@ -0,0 +1,321 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include "header.h"
+
+void  initialize() {
+
+//---------------------------------------------------------------------
+//     Fills the field variable u for every cell of this rank:
+//       1. every entry with indices -1..IMAX / -1..JMAX / -1..KMAX is
+//          set to 1.0,
+//       2. each grid point of each cell receives the tri-linear
+//          transfinite interpolation of the boundary values,
+//       3. the six outer faces are overwritten with the exact solution.
+//---------------------------------------------------------------------
+      
+      int c, i, j, k, m, ii, jj, kk;
+      double xi, eta, zeta, Pface[5*3*2], Pxi, Peta, 
+           Pzeta, temp[5];
+#define Pface(m,n,i) Pface[(m-1)+5*((n-1)+3*(i-1))]
+#define temp(m) temp[m-1]
+
+//---------------------------------------------------------------------
+//  Step 1. compute_rhs later forms 1/u for every element, including a
+//  few corner elements that are otherwise unused. Setting the whole
+//  array to a nonzero value here keeps that simple loop safe.
+//---------------------------------------------------------------------
+      for (c = 1; c <= ncells; c++) {
+         for (kk = -1; kk <= KMAX; kk++) {
+            for (jj = -1; jj <= JMAX; jj++) {
+               for (ii = -1; ii <= IMAX; ii++) {
+                  for (m = 1; m <= 5; m++) {
+                     u(m, ii, jj, kk, c) = 1.0;
+                  }
+               }
+            }
+         }
+      }
+
+//---------------------------------------------------------------------
+//  Step 2. interpolated values everywhere on the grid. (i,j,k) are
+//  global grid indices, (ii,jj,kk) the matching cell-local indices.
+//---------------------------------------------------------------------
+      for (c = 1; c <= ncells; c++) {
+         for (k = cell_low(3,c), kk = 0; k <= cell_high(3,c); k++, kk++) {
+            zeta = (double)(k) * dnzm1;
+            for (j = cell_low(2,c), jj = 0; j <= cell_high(2,c); j++, jj++) {
+               eta = (double)(j) * dnym1;
+               for (i = cell_low(1,c), ii = 0; i <= cell_high(1,c); i++, ii++) {
+                  xi = (double)(i) * dnxm1;
+
+                  // exact solution on the two faces normal to xi ...
+                  exact_solution((double)(0), eta, zeta, &Pface(1,1,1));
+                  exact_solution((double)(1), eta, zeta, &Pface(1,1,2));
+                  // ... normal to eta ...
+                  exact_solution(xi, (double)(0), zeta, &Pface(1,2,1));
+                  exact_solution(xi, (double)(1), zeta, &Pface(1,2,2));
+                  // ... and normal to zeta
+                  exact_solution(xi, eta, (double)(0), &Pface(1,3,1));
+                  exact_solution(xi, eta, (double)(1), &Pface(1,3,2));
+
+                  for (m = 1; m <= 5; m++) {
+                     Pxi   = xi   * Pface(m,1,2) + 
+                          (1.0e0-xi)   * Pface(m,1,1);
+                     Peta  = eta  * Pface(m,2,2) + 
+                          (1.0e0-eta)  * Pface(m,2,1);
+                     Pzeta = zeta * Pface(m,3,2) + 
+                          (1.0e0-zeta) * Pface(m,3,1);
+
+                     u(m,ii,jj,kk,c) = Pxi + Peta + Pzeta - 
+                          Pxi*Peta - Pxi*Pzeta - Peta*Pzeta + 
+                          Pxi*Peta*Pzeta;
+                  }
+               }
+            }
+         }
+      }
+
+//---------------------------------------------------------------------
+//  Step 3. exact values on the six boundary faces. For each face the
+//  owning cell is looked up in slice, one local index and its
+//  coordinate are fixed, and the other two directions are swept.
+//---------------------------------------------------------------------
+
+//     west face: xi = 0
+      c  = slice(1,1);
+      ii = 0;
+      xi = 0.0e0;
+      for (k = cell_low(3,c), kk = 0; k <= cell_high(3,c); k++, kk++) {
+         zeta = (double)(k) * dnzm1;
+         for (j = cell_low(2,c), jj = 0; j <= cell_high(2,c); j++, jj++) {
+            eta = (double)(j) * dnym1;
+            exact_solution(xi, eta, zeta, temp);
+            for (m = 1; m <= 5; m++) {
+               u(m,ii,jj,kk,c) = temp(m);
+            }
+         }
+      }
+
+//     east face: xi = 1
+      c  = slice(1,ncells);
+      ii = cell_size(1,c)-1;
+      xi = 1.0e0;
+      for (k = cell_low(3,c), kk = 0; k <= cell_high(3,c); k++, kk++) {
+         zeta = (double)(k) * dnzm1;
+         for (j = cell_low(2,c), jj = 0; j <= cell_high(2,c); j++, jj++) {
+            eta = (double)(j) * dnym1;
+            exact_solution(xi, eta, zeta, temp);
+            for (m = 1; m <= 5; m++) {
+               u(m,ii,jj,kk,c) = temp(m);
+            }
+         }
+      }
+
+//     south face: eta = 0
+      c   = slice(2,1);
+      jj  = 0;
+      eta = 0.0e0;
+      for (k = cell_low(3,c), kk = 0; k <= cell_high(3,c); k++, kk++) {
+         zeta = (double)(k) * dnzm1;
+         for (i = cell_low(1,c), ii = 0; i <= cell_high(1,c); i++, ii++) {
+            xi = (double)(i) * dnxm1;
+            exact_solution(xi, eta, zeta, temp);
+            for (m = 1; m <= 5; m++) {
+               u(m,ii,jj,kk,c) = temp(m);
+            }
+         }
+      }
+
+//     north face: eta = 1
+      c   = slice(2,ncells);
+      jj  = cell_size(2,c)-1;
+      eta = 1.0e0;
+      for (k = cell_low(3,c), kk = 0; k <= cell_high(3,c); k++, kk++) {
+         zeta = (double)(k) * dnzm1;
+         for (i = cell_low(1,c), ii = 0; i <= cell_high(1,c); i++, ii++) {
+            xi = (double)(i) * dnxm1;
+            exact_solution(xi, eta, zeta, temp);
+            for (m = 1; m <= 5; m++) {
+               u(m,ii,jj,kk,c) = temp(m);
+            }
+         }
+      }
+
+//     bottom face: zeta = 0
+      c    = slice(3,1);
+      kk   = 0;
+      zeta = 0.0e0;
+      for (j = cell_low(2,c), jj = 0; j <= cell_high(2,c); j++, jj++) {
+         eta = (double)(j) * dnym1;
+         for (i = cell_low(1,c), ii = 0; i <= cell_high(1,c); i++, ii++) {
+            xi = (double)(i) *dnxm1;
+            exact_solution(xi, eta, zeta, temp);
+            for (m = 1; m <= 5; m++) {
+               u(m,ii,jj,kk,c) = temp(m);
+            }
+         }
+      }
+
+//     top face: zeta = 1
+      c    = slice(3,ncells);
+      kk   = cell_size(3,c)-1;
+      zeta = 1.0e0;
+      for (j = cell_low(2,c), jj = 0; j <= cell_high(2,c); j++, jj++) {
+         eta = (double)(j) * dnym1;
+         for (i = cell_low(1,c), ii = 0; i <= cell_high(1,c); i++, ii++) {
+            xi = (double)(i) * dnxm1;
+            exact_solution(xi, eta, zeta, temp);
+            for (m = 1; m <= 5; m++) {
+               u(m,ii,jj,kk,c) = temp(m);
+            }
+         }
+      }
+
+      return;
+}
+
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+void lhsinit() {
+
+//---------------------------------------------------------------------
+//     For every cell of this rank:
+//       - start(d,c) becomes 1 when the cell has coordinate 1 in
+//         direction d, otherwise 0;
+//       - end(d,c) becomes 1 when the cell has coordinate ncells in
+//         direction d, otherwise 0;
+//       - all 5x5 lhsc entries of the cell's grid points are zeroed.
+//---------------------------------------------------------------------
+      
+      int i, j, k, d, c, m, n;
+
+      for (c = 1; c <= ncells; c++) {
+
+//        start/end flags for the three coordinate directions
+         for (d = 1; d <= 3; d++) {
+            start(d,c) = (cell_coord(d,c) == 1)      ? 1 : 0;
+            end(d,c)   = (cell_coord(d,c) == ncells) ? 1 : 0;
+         }
+
+//        clear the left hand side over the cell's own points
+         for (k = 0; k < cell_size(3,c); k++) {
+            for (j = 0; j < cell_size(2,c); j++) {
+               for (i = 0; i < cell_size(1,c); i++) {
+                  for (m = 1; m <= 5; m++) {
+                     for (n = 1; n <= 5; n++) {
+                        lhsc(m,n,i,j,k,c) = 0.0e0;
+                     }
+                  }
+               }
+            }
+         }
+
+      }
+
+      return;
+}
+
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+void lhsabinit(double lhsa[], double lhsb[], int size) {
+
+#define lhsa(m,n,i) lhsa[(m-1)+5*((n-1)+5*(i+1))]
+#define lhsb(m,n,i) lhsb[(m-1)+5*((n-1)+5*(i+1))]
+
+      int blk, row, col;
+
+//---------------------------------------------------------------------
+//     For every block index 0..size (inclusive) the 5x5 block of lhsa
+//     is cleared and the 5x5 block of lhsb is set to the identity.
+//     Block index i lives at flat offset 25*(i+1), so both arrays must
+//     hold at least 25*(size+2) doubles; the first 25 are not touched.
+//---------------------------------------------------------------------
+      for (blk = 0; blk <= size; blk++) {
+         for (col = 1; col <= 5; col++) {
+            for (row = 1; row <= 5; row++) {
+               lhsa(row,col,blk) = 0.0e0;
+               lhsb(row,col,blk) = (row == col) ? 1.0e0 : 0.0e0;
+            }
+         }
+      }
+
+      return;
+}
+
+
+

+ 5 - 0
RCCE_V2.0/apps/NPB/BT/inputbt.data.sample

@@ -0,0 +1,5 @@
+200       number of time steps
+0.0008d0  dt for class A = 0.0008d0. class B = 0.0003d0  class C = 0.0001d0
+64 64 64
+5 0        write interval (optional read interval) for BTIO
+0 1000000  number of nodes in collective buffering and buffer size for BTIO

+ 222 - 0
RCCE_V2.0/apps/NPB/BT/make_set.c

@@ -0,0 +1,222 @@
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "header.h"
+#include "mpinpb.h"
+
+#define mod(p,q) ((p)%(q))
+#define max(x,y)      ((x)>(y)? (x) : (y))
+#define min(x,y)      ((x)<(y)? (x) : (y))
+
+void make_set() {
+
+//---------------------------------------------------------------------
+//     This function fills the set of cells owned by this rank such
+//     that communication between cells on different nodes is only
+//     nearest neighbor.
+//
+//     Reads the globals node, no_nodes and grid_points. Writes ncells,
+//     cell_coord, slice, predecessor, successor, cell_size, cell_low
+//     and cell_high. Prints an error and calls exit(1) when a cell
+//     would be fewer than 3 points wide in some direction.
+//---------------------------------------------------------------------
+
+      int p, i, j, c, dir, size, excess;
+
+//---------------------------------------------------------------------
+//     compute square root; add small number to allow for roundoff
+//     (note: this is computed in setup_mpi.f also, but prefer to do
+//     it twice because of some include file problems).
+//---------------------------------------------------------------------
+      ncells = (int)(sqrt((double)(no_nodes) + 0.00001e0));
+
+//---------------------------------------------------------------------
+//     this makes coding easier
+//---------------------------------------------------------------------
+      p = ncells;
+      
+//---------------------------------------------------------------------
+//     determine the location of the cell at the bottom of the 3D 
+//     array of cells
+//---------------------------------------------------------------------
+      cell_coord(1,1) = mod(node,p) ;
+      cell_coord(2,1) = node/p ;
+      cell_coord(3,1) = 0;
+
+//---------------------------------------------------------------------
+//     set the cell_coords for cells in the rest of the z-layers; 
+//     this comes down to a simple linear numbering in the z-direction,
+//     and to the doubly-cyclic numbering in the other directions
+//---------------------------------------------------------------------
+      for (c = 2; c <= p; c++) {
+         cell_coord(1,c) = mod(cell_coord(1,c-1)+1,p) ;
+         cell_coord(2,c) = mod(cell_coord(2,c-1)-1+p,p) ;
+         cell_coord(3,c) = c-1;
+      }
+
+//---------------------------------------------------------------------
+//     offset all the coordinates by 1 so that they are 1-based, as
+//     the index macros (inherited from the Fortran arrays) expect
+//---------------------------------------------------------------------
+      for (dir = 1; dir <= 3; dir++) {
+         for (c = 1; c <= p; c++) {
+            cell_coord(dir,c) = cell_coord(dir,c) + 1;
+         }
+      }
+      
+//---------------------------------------------------------------------
+//     slice(dir,n) contains the sequence number of the cell that is in
+//     coordinate plane n in the dir direction
+//---------------------------------------------------------------------
+      for (dir = 1; dir <= 3; dir++) {
+         for (c = 1; c <= p; c++) {
+            slice(dir,cell_coord(dir,c)) = c;
+         }
+      }
+
+
+//---------------------------------------------------------------------
+//     fill the predecessor and successor entries, using the indices 
+//     of the bottom cells (they are the same at each level of k 
+//     anyway) acting as if full periodicity pertains; note that p is
+//     added to those arguments of mod that could otherwise be
+//     negative, because % would then return a negative remainder
+//---------------------------------------------------------------------
+      i = cell_coord(1,1)-1;
+      j = cell_coord(2,1)-1;
+
+      predecessor(1) = mod(i-1+p,p) + p*j;
+      predecessor(2) = i + p*mod(j-1+p,p);
+      predecessor(3) = mod(i+1,p) + p*mod(j-1+p,p);
+      successor(1)   = mod(i+1,p) + p*j;
+      successor(2)   = i + p*mod(j+1,p);
+      successor(3)   = mod(i-1+p,p) + p*mod(j+1,p);
+
+//---------------------------------------------------------------------
+//     now compute the sizes of the cells: grid_points(dir) points are
+//     split over p cells; the first "excess" coordinate planes get one
+//     extra point each
+//---------------------------------------------------------------------
+      for (dir = 1; dir <= 3; dir++) {
+         size   = grid_points(dir)/p;
+         excess = mod(grid_points(dir),p);
+         for (c = 1; c <= ncells; c++) {
+            if (cell_coord(dir,c) <= excess) {
+               cell_size(dir,c) = size+1;
+               cell_low(dir,c) = (cell_coord(dir,c)-1)*(size+1);
+               cell_high(dir,c) = cell_low(dir,c)+size;
+            } else {
+               cell_size(dir,c) = size;
+               cell_low(dir,c)  = excess*(size+1)+
+                    (cell_coord(dir,c)-excess-1)*size;
+               cell_high(dir,c) = cell_low(dir,c)+size-1;
+            }
+            if (cell_size(dir, c) <= 2) {
+               printf(" Error: Cell size too small. Min size is 3\n");
+               exit(1);
+            }
+         }
+      }
+
+      return;
+}
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+
+//---------------------------------------------------------------------
+//     Returns the rank reached from rank "from" by one jump in
+//     direction dir on the periodic p x p arrangement of ranks
+//     (column = from mod p, row = from / p). An unknown direction
+//     leaves the rank unchanged.
+//---------------------------------------------------------------------
+static int color_cycle_next(int from, int dir, int p) {
+      int i = mod(from,p);
+      int j = from/p;
+
+      switch (dir) {
+        case (WESTDIR):   return mod(i-1+p,p) + p*j;
+        case (EASTDIR):   return mod(i+1,p) + p*j;
+        case (SOUTHDIR):  return i + p*mod(j-1+p,p);
+        case (NORTHDIR):  return i + p*mod(j+1,p);
+        case (BOTTOMDIR): return mod(i+1,p) + p*mod(j-1+p,p);
+        case (TOPDIR):    return mod(i-1+p,p) + p*mod(j+1,p);
+        default:          return from;
+      }
+}
+
+void make_color() {
+
+//---------------------------------------------------------------------
+//     This function determines cycles in the communication graphs in
+//     the six coordinate directions, and colors the ranks so they know
+//     how to construct deadlock-free blocking communication schedules.
+//
+//     Reads the globals node and no_nodes; writes ncells and, for each
+//     direction dir in 0..5, send_color[dir] and recv_color[dir].
+//---------------------------------------------------------------------
+
+      int p, dir, node_loc, comm_color, node_min, length, start_found;
+
+//---------------------------------------------------------------------
+//     compute square root; add small number to allow for roundoff
+//     (note: this is computed in setup_mpi.f also, but prefer to do
+//     it twice because of some include file problems).
+//---------------------------------------------------------------------
+      ncells = (int)(sqrt((double)(no_nodes) + 0.00001e0));
+
+//---------------------------------------------------------------------
+//     this makes coding easier
+//---------------------------------------------------------------------
+      p = ncells;
+
+      for (dir = 0; dir<6; dir++) {
+
+        node_loc = node_min = node; length = 1; start_found = 0;
+        // comm_color is toggled before it is first assigned whenever the
+        // first jump lands on a higher rank, so it needs a defined
+        // starting value. The final color does not depend on this value:
+        // it is reset to 0 the last time the running minimum is reached.
+        comm_color = 0;
+        while (!start_found) {
+          node_loc = color_cycle_next(node_loc, dir, p);
+
+          // the next block ensures that the node with the lowest rank
+          // in this cycle is colored WHITE (=0), and that nodes an even
+          // number of jumps removed from that lowest-ranked member
+          // are also white. The others are RED (1).
+          if (node_loc <= node_min) {
+            node_min = node_loc;
+            comm_color = 0;
+          } else {
+            comm_color = !comm_color;
+          }
+          if (node_loc == node) {
+            start_found = 1;
+          } else {
+            length++;
+          }
+        }
+        send_color[dir] = comm_color;
+        recv_color[dir] = !send_color[dir];
+        // if the number of nodes in this cycle is odd, we need to treat the 
+        // last node before the "start" of the cycle differently
+        if (length%2) {
+          if (node == node_min) recv_color[dir] = 2;
+          node_loc = color_cycle_next(node, dir, p);
+          if (node_loc == node_min) send_color[dir] = 2;
+        }
+      }
+     return;
+}
+
+//---------------------------------------------------------------------
+//---------------------------------------------------------------------
+
+

+ 34 - 0
RCCE_V2.0/apps/NPB/BT/mpinpb.h

@@ -0,0 +1,34 @@
+
+//---------------------------------------------------------------------
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+//---------------------------------------------------------------------
+#ifndef __MPINPB_H
+#define __MPINPB_H
+/*
+ * Global integer state shared by every translation unit that includes
+ * this header.
+ *
+ * When G_MAIN is defined before inclusion, the variables are defined
+ * here; otherwise they are only declared extern. Exactly one translation
+ * unit is therefore expected to define G_MAIN - TODO confirm in the build.
+ *
+ * The header does not state what each variable holds; presumably node is
+ * the index of the calling process and no_nodes/total_nodes are process
+ * counts - verify against the callers.
+ *
+ * NOTE(review): the guard name __MPINPB_H starts with a double underscore,
+ * a spelling reserved for the C implementation.
+ */
+#ifdef G_MAIN
+       int           node, no_nodes, total_nodes, root;   // defined here when G_MAIN is set
+       int           active;
+#else
+extern int           node, no_nodes, total_nodes, root;   // declaration only; storage lives in the G_MAIN unit
+extern int           active;
+
+#endif
+#ifdef _OPENMP
+#pragma omp threadprivate (node, no_nodes, total_nodes, root, active)
+#endif
+#endif
+

+ 104 - 0
RCCE_V2.0/apps/NPB/BT/print_results.c

@@ -0,0 +1,104 @@
+/*****************************************************************/
+/******     C  _  P  R  I  N  T  _  R  E  S  U  L  T  S     ******/
+/*****************************************************************/
+// 
+// Copyright 2010 Intel Corporation
+// 
+//    Licensed under the Apache License, Version 2.0 (the "License");
+//    you may not use this file except in compliance with the License.
+//    You may obtain a copy of the License at
+// 
+//        http://www.apache.org/licenses/LICENSE-2.0
+// 
+//    Unless required by applicable law or agreed to in writing, software
+//    distributed under the License is distributed on an "AS IS" BASIS,
+//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//    See the License for the specific language governing permissions and
+//    limitations under the License.
+// 
+#include <stdlib.h>
+#include <stdio.h>
+#define class _class_
+/*
+ * print_results - write the end-of-run benchmark report to stdout.
+ *
+ * Every value is emitted with printf; nothing is returned and no argument
+ * is modified. Output order: completion banner, class, size, iterations,
+ * time, total processes, compiled processes (only when non-zero), Mop/s
+ * total, Mop/s per process, operation type, verification status, version,
+ * compile date, the compile options, and finally the contact details of
+ * the NPB development team. When SMP is defined, the MP_SET_NUMTHREADS
+ * environment variable is printed right after the compile options.
+ *
+ * Parameters:
+ *   name                - benchmark name for the "Benchmark Completed" line
+ *   class               - problem class, printed as a single character
+ *   n1, n2, n3          - problem dimensions, printed on the "Size" line
+ *   niter               - iteration count
+ *   nprocs_compiled     - printed as "Compiled procs"; the line is omitted
+ *                         when this is 0
+ *   nprocs_total        - printed as "Total processes" and used as the
+ *                         divisor for the per-process Mop/s figure
+ *   t                   - elapsed time in seconds
+ *   mops                - total Mop/s
+ *   optype              - operation type label
+ *   passed_verification - non-zero prints SUCCESSFUL, zero UNSUCCESSFUL
+ *   npbversion          - version string
+ *   compiletime         - compile date string
+ *   mpicc, clink, cmpi_lib, cmpi_inc, cflags, clinkflags
+ *                       - build settings echoed under "Compile options"
+ *
+ * All char * arguments are handed straight to %s conversions, so callers
+ * must pass valid, NUL-terminated strings.
+ */
+void print_results( char   *name,
+                      char   class,
+                      int    n1, 
+                      int    n2,
+                      int    n3,
+                      int    niter,
+                      int    nprocs_compiled,
+                      int    nprocs_total,
+                      double t,
+                      double mops,
+		      char   *optype,
+                      int    passed_verification,
+                      char   *npbversion,
+                      char   *compiletime,
+                      char   *mpicc,
+                      char   *clink,
+                      char   *cmpi_lib,
+                      char   *cmpi_inc,
+                      char   *cflags,
+                      char   *clinkflags )
+{
+    char *evalue="1000";   // only read under SMP, where getenv overwrites it first, so "1000" is never printed
+
+    printf( "\n\n %s Benchmark Completed\n", name ); 
+
+    printf( " Class           =                        %c\n", class );
+
+    printf( " Size            =            %3dx %3dx %3d\n", n1,n2,n3 );
+
+    printf( " Iterations      =             %12d\n", niter );
+ 
+    printf( " Time in seconds =             %12.2f\n", t );
+
+    printf( " Total processes =             %12d\n", nprocs_total );
+
+    if ( nprocs_compiled != 0 )   // a compiled count of 0 suppresses this line entirely
+        printf( " Compiled procs  =             %12d\n", nprocs_compiled );
+
+    printf( " Mop/s total     =             %12.2f\n", mops );
+    // NOTE(review): nprocs_total == 0 makes the next line divide by zero - confirm callers never pass 0
+    printf( " Mop/s/process   =             %12.2f\n", mops/((float) nprocs_total) );
+
+    printf( " Operation type  = %24s\n", optype);
+
+    if( passed_verification )
+        printf( " Verification    =               SUCCESSFUL\n" );
+    else
+        printf( " Verification    =             UNSUCCESSFUL\n" );
+
+    printf( " Version         =             %12s\n", npbversion );
+
+    printf( " Compile date    =             %12s\n", compiletime );
+
+    printf( "\n Compile options:\n" );
+
+    printf( "    MPICC        = %s\n", mpicc );
+
+    printf( "    CLINK        = %s\n", clink );
+
+    printf( "    CMPI_LIB     = %s\n", cmpi_lib );
+
+    printf( "    CMPI_INC     = %s\n", cmpi_inc );
+
+    printf( "    CFLAGS       = %s\n", cflags );
+
+    printf( "    CLINKFLAGS   = %s\n", clinkflags );
+#ifdef SMP
+    evalue = getenv("MP_SET_NUMTHREADS");   // NOTE(review): getenv returns NULL when the variable is unset
+    printf( "   MULTICPUS = %s\n", evalue );   // NOTE(review): a NULL evalue here is undefined behavior for %s
+#endif
+
+    printf( "\n\n" );
+    printf( " Please send the results of this run to:\n\n" );
+    printf( " NPB Development Team\n" );
+    printf( " Internet: npb@nas.nasa.gov\n \n" );
+    printf( " If email is not available, send this to:\n\n" );
+    printf( " MS T27A-1\n" );
+    printf( " NASA Ames Research Center\n" );
+    printf( " Moffett Field, CA  94035-1000\n\n" );
+    printf( " Fax: 650-604-3957\n\n" );
+}
+ 

+ 0 - 0
RCCE_V2.0/apps/NPB/BT/rhs.c


Certains fichiers n'ont pas été affichés car il y a eu trop de fichiers modifiés dans ce diff