Browse Source

Prefixing of objects defined in starpu_config.h.in

find . -type f -not -name "*svn*"|xargs sed -i s/"\bUSE_CUDA\b"/STARPU_USE_CUDA/g
find . -type f -not -name "*svn*"|xargs sed -i s/"\bUSE_GORDON\b"/STARPU_USE_GORDON/g
find . -type f -not -name "*svn*"|xargs sed -i s/"\bATLAS\b"/STARPU_ATLAS/g
find . -type f -not -name "*svn*"|xargs sed -i s/"\bGOTO\b"/STARPU_GOTO/g
find . -type f -not -name "*svn*"|xargs sed -i s/"\bSYSTEM_BLAS\b"/STARPU_SYSTEM_BLAS/g
Nathalie Furmento 15 years ago
parent
commit
3739cfa184
100 changed files with 451 additions and 451 deletions
  1. 18 18
      configure.ac
  2. 3 3
      examples/Makefile.am
  3. 4 4
      examples/audio/starpu-audio-processing.c
  4. 3 3
      examples/axpy/axpy.c
  5. 3 3
      examples/cholesky/dw_cholesky.c
  6. 2 2
      examples/cholesky/dw_cholesky.h
  7. 3 3
      examples/cholesky/dw_cholesky_grain.c
  8. 10 10
      examples/cholesky/dw_cholesky_kernels.c
  9. 6 6
      examples/cholesky/dw_cholesky_no_stride.c
  10. 3 3
      examples/common/blas.c
  11. 2 2
      examples/common/blas.h
  12. 2 2
      examples/common/blas_model.h
  13. 1 1
      examples/gordon/null.h
  14. 4 4
      examples/heat/dw_factolu.c
  15. 3 3
      examples/heat/dw_factolu.h
  16. 4 4
      examples/heat/dw_factolu_grain.c
  17. 14 14
      examples/heat/dw_factolu_kernels.c
  18. 4 4
      examples/heat/dw_factolu_tag.c
  19. 6 6
      examples/heat/dw_sparse_cg.c
  20. 1 1
      examples/heat/dw_sparse_cg.h
  21. 6 6
      examples/heat/dw_sparse_cg_kernels.c
  22. 8 8
      examples/heat/lu_kernels_model.c
  23. 2 2
      examples/incrementer/incrementer.c
  24. 12 12
      examples/lu/xlu.c
  25. 2 2
      examples/lu/xlu.h
  26. 20 20
      examples/lu/xlu_kernels.c
  27. 1 1
      examples/lu/xlu_kernels.h
  28. 15 15
      examples/lu/xlu_pivot.c
  29. 3 3
      examples/mult/dw_mult.c
  30. 2 2
      examples/mult/dw_mult.h
  31. 8 8
      examples/mult/dw_mult_no_stride.c
  32. 7 7
      examples/mult/dw_mult_no_stride_no_tag.c
  33. 1 1
      examples/mult/sgemm_kernels.c
  34. 4 4
      examples/mult/xgemm.c
  35. 1 1
      examples/mult/xgemm_kernels.c
  36. 1 1
      examples/pastix-wrappers/starpu-blas-wrapper.c
  37. 1 1
      examples/spmv/dw_block_spmv.c
  38. 3 3
      examples/spmv/dw_block_spmv.h
  39. 3 3
      examples/spmv/dw_block_spmv_kernels.c
  40. 3 3
      examples/spmv/dw_spmv.c
  41. 1 1
      examples/starpufft/Makefile.am
  42. 2 2
      examples/starpufft/double.h
  43. 2 2
      examples/starpufft/float.h
  44. 7 7
      examples/starpufft/starpufftx.c
  45. 9 9
      examples/starpufft/starpufftx1d.c
  46. 10 10
      examples/starpufft/starpufftx2d.c
  47. 9 9
      examples/starpufft/testx.c
  48. 5 5
      examples/strassen/strassen.c
  49. 2 2
      examples/strassen/strassen.h
  50. 6 6
      examples/strassen/strassen_kernels.c
  51. 9 9
      examples/strassen2/strassen2.c
  52. 10 10
      examples/strassen2/strassen2_kernels.c
  53. 3 3
      examples/tag_example/tag_example.c
  54. 3 3
      examples/tag_example/tag_example2.c
  55. 3 3
      examples/tag_example/tag_example3.c
  56. 3 3
      examples/tag_example/tag_restartable.c
  57. 1 1
      include/starpu-task.h
  58. 4 4
      include/starpu-util.h
  59. 6 6
      include/starpu_config.h.in
  60. 2 2
      mpi/Makefile.am
  61. 1 1
      mpi/examples/mpi_lu/pxlu.h
  62. 26 26
      mpi/examples/mpi_lu/pxlu_kernels.c
  63. 2 2
      mpi/tests/ring.c
  64. 2 2
      mpi/tests/ring_async.c
  65. 3 3
      src/Makefile.am
  66. 1 1
      src/core/jobs.h
  67. 1 1
      src/core/perfmodel/perfmodel.h
  68. 6 6
      src/core/perfmodel/perfmodel_bus.c
  69. 6 6
      src/core/topology.c
  70. 4 4
      src/core/workers.c
  71. 2 2
      src/core/workers.h
  72. 7 7
      src/datawizard/copy-driver.c
  73. 3 3
      src/datawizard/copy-driver.h
  74. 8 8
      src/datawizard/interfaces/bcsr_interface.c
  75. 12 12
      src/datawizard/interfaces/blas_interface.c
  76. 11 11
      src/datawizard/interfaces/block_interface.c
  77. 8 8
      src/datawizard/interfaces/csr_interface.c
  78. 2 2
      src/datawizard/interfaces/data_interface.h
  79. 11 11
      src/datawizard/interfaces/vector_interface.c
  80. 1 1
      src/datawizard/memory_nodes.h
  81. 2 2
      src/drivers/cuda/driver_cuda.h
  82. 2 2
      src/task-models/blas_model.c
  83. 5 5
      src/util/malloc.c
  84. 3 3
      src/util/starpu_cublas.c
  85. 4 4
      tests/Makefile.am
  86. 2 2
      tests/core/starpu_wait_all_tasks.c
  87. 7 7
      tests/datawizard/sync_and_notify_data.c
  88. 1 1
      tests/datawizard/sync_with_data_with_mem.c
  89. 1 1
      tests/datawizard/sync_with_data_with_mem_non_blocking.c
  90. 1 1
      tests/datawizard/unpartition.c
  91. 2 2
      tests/datawizard/write_only_tmp_buffer.c
  92. 1 1
      tests/heat/deps.sh
  93. 1 1
      tests/heat/gflops-sched.sh
  94. 1 1
      tests/heat/gflops.sh
  95. 1 1
      tests/heat/heat.sh
  96. 1 1
      tests/heat/model-perturbation.sh
  97. 1 1
      tests/heat/speedup.sh
  98. 1 1
      tests/memory/memstress.sh
  99. 1 1
      tests/memory/memstress2.sh
  100. 0 0
      tests/microbenchs/async-tasks-overhead.c

+ 18 - 18
configure.ac

@@ -127,8 +127,8 @@ AC_ARG_ENABLE(cpu, [AS_HELP_STRING([--disable-cpu],
 			[do not use the CPU(s)])],
 			enable_cpu=$enableval, enable_cpu=yes)
 AC_MSG_RESULT($enable_cpu)
-AC_SUBST(USE_CPU, $enable_cpu)
-AM_CONDITIONAL(USE_CPU, test x$enable_cpu = xyes)
+AC_SUBST(STARPU_USE_CPU, $enable_cpu)
+AM_CONDITIONAL(STARPU_USE_CPU, test x$enable_cpu = xyes)
 
 if test x$enable_cpu = xyes; then
 	AC_DEFINE(USE_CPUS, [1], [CPU driver is activated])
@@ -206,10 +206,10 @@ fi
 
 AC_MSG_CHECKING(whether CUDA should be used)
 AC_MSG_RESULT($enable_cuda)
-AC_SUBST(USE_CUDA, $enable_cuda)
-AM_CONDITIONAL(USE_CUDA, test x$enable_cuda = xyes)
+AC_SUBST(STARPU_USE_CUDA, $enable_cuda)
+AM_CONDITIONAL(STARPU_USE_CUDA, test x$enable_cuda = xyes)
 if test x$enable_cuda = xyes; then
-	AC_DEFINE(USE_CUDA, [1], [CUDA support is activated])
+	AC_DEFINE(STARPU_USE_CUDA, [1], [CUDA support is activated])
 
 	#in case this is a 64bit setup, we tell nvcc to use a -m64 flag
 	AC_CHECK_SIZEOF([void *])
@@ -271,11 +271,11 @@ fi
 
 AC_MSG_CHECKING(whether GORDON should be used)
 AC_MSG_RESULT($enable_gordon)
-AC_SUBST(USE_GORDON, $enable_gordon)
-AM_CONDITIONAL(USE_GORDON, test x$enable_gordon = xyes)
+AC_SUBST(STARPU_USE_GORDON, $enable_gordon)
+AM_CONDITIONAL(STARPU_USE_GORDON, test x$enable_gordon = xyes)
 
 if test x$enable_gordon = xyes; then
-	AC_DEFINE(USE_GORDON, [1], [Cell support is enabled])
+	AC_DEFINE(STARPU_USE_GORDON, [1], [Cell support is enabled])
 	AC_DEFINE(NON_BLOCKING_DRIVERS, [1], [drivers must progress])
 fi
 
@@ -575,8 +575,8 @@ blas_lib=maybe
 AC_ARG_ENABLE(blas-lib,
  [  --enable-blas-lib[=blaslibname]:
                       none [default]: no BLAS lib is used
-                      atlas: use ATLAS library
-                      goto: use GOTO library],
+                      atlas: use STARPU_ATLAS library
+                      goto: use STARPU_GOTO library],
  [ 
      if   test "x$enableval" = "xatlas" ; then
         blas_lib=atlas
@@ -595,7 +595,7 @@ AC_ARG_ENABLE(blas-lib,
  ])
 
 if test x$blas_lib = xmaybe -o x$blas_lib = xgoto; then
-AC_ARG_WITH(goto-dir, [AS_HELP_STRING([--with-goto-dir=<dir>], [specify GOTO lib location])],
+AC_ARG_WITH(goto-dir, [AS_HELP_STRING([--with-goto-dir=<dir>], [specify STARPU_GOTO lib location])],
 	[
 		blas_lib=goto
 		gotodir=$withval
@@ -607,17 +607,17 @@ AC_ARG_WITH(goto-dir, [AS_HELP_STRING([--with-goto-dir=<dir>], [specify GOTO lib
 	)
 
 if test x$blas_lib = xgoto; then
-# test whether ATLAS is actually available
+# test whether STARPU_ATLAS is actually available
 AC_CHECK_LIB(goto, sgemm_,,AC_MSG_ERROR([cannot find goto lib]))
-AC_DEFINE(GOTO, [1], [use GOTO library])
+AC_DEFINE(STARPU_GOTO, [1], [use STARPU_GOTO library])
 fi
 
 fi
      
 if test x$blas_lib = xmaybe -o x$blas_lib = xatlas; then
-AC_ARG_WITH(atlas-dir, [AS_HELP_STRING([--with-atlas-dir=<dir>], [specify ATLAS lib location])],
+AC_ARG_WITH(atlas-dir, [AS_HELP_STRING([--with-atlas-dir=<dir>], [specify STARPU_ATLAS lib location])],
 	[
-		AC_MSG_CHECKING(ATLAS location)
+		AC_MSG_CHECKING(STARPU_ATLAS location)
 		blas_lib=atlas
 		atlasdir=$withval
 		AC_MSG_RESULT($atlasdir)
@@ -629,11 +629,11 @@ AC_ARG_WITH(atlas-dir, [AS_HELP_STRING([--with-atlas-dir=<dir>], [specify ATLAS
 	)
 
 if test x$blas_lib = xatlas; then
-# test whether ATLAS is actually available
+# test whether STARPU_ATLAS is actually available
 AC_CHECK_HEADER([cblas.h],,AC_MSG_ERROR([cannot find atlas headers]))
 AC_CHECK_LIB(atlas, ATL_sgemm,,AC_MSG_ERROR([cannot find atlas lib]),)
 AC_CHECK_LIB(cblas, cblas_sgemm,,AC_MSG_ERROR([cannot find atlas lib]),[-latlas])
-AC_DEFINE(ATLAS, [1], [use ATLAS library])
+AC_DEFINE(STARPU_ATLAS, [1], [use STARPU_ATLAS library])
 fi
 
 fi
@@ -643,7 +643,7 @@ if test x$blas_lib = xmaybe; then
      use_system_blas=no
      AC_SEARCH_LIBS([sgemm_],[blas],use_system_blas=yes,,)
      if test x$use_system_blas = xyes; then
-        AC_DEFINE(SYSTEM_BLAS, [1], [use refblas library])
+        AC_DEFINE(STARPU_SYSTEM_BLAS, [1], [use refblas library])
 	blas_lib=system
      else
 	blas_lib=none

+ 3 - 3
examples/Makefile.am

@@ -48,7 +48,7 @@ CLEANFILES = 					\
 
 CLEANFILES += *.gcno *.gcda *.linkinfo
 
-if USE_CUDA
+if STARPU_USE_CUDA
 
 # TODO define NVCCFLAGS
 NVCC ?= nvcc
@@ -63,7 +63,7 @@ NVCC ?= nvcc
 
 endif
 
-if USE_GORDON
+if STARPU_USE_GORDON
 
 SPU_CC ?= spu-gcc
 SPU_LD ?= spu-ld
@@ -384,7 +384,7 @@ check_PROGRAMS +=				\
 examplebin_PROGRAMS +=				\
 	incrementer/incrementer
 
-if USE_CUDA
+if STARPU_USE_CUDA
 incrementer_incrementer_SOURCES =	\
 	incrementer/incrementer.c	\
 	incrementer/incrementer_kernels.cu

+ 4 - 4
examples/audio/starpu-audio-processing.c

@@ -24,7 +24,7 @@
 
 #include <starpu.h>
 #include <fftw3.h>
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cufft.h>
 #endif
 
@@ -146,7 +146,7 @@ void write_16bit_wav(FILE *outfile, unsigned size, float *arrayin, FILE *save_fi
 /* we don't reinitialize the CUFFT plan for every kernel, so we "cache" it */
 typedef struct {
 	unsigned is_initialized;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cufftHandle plan;
 	cufftHandle inv_plan;
 	cufftComplex *localout;
@@ -159,7 +159,7 @@ typedef struct {
 
 static fft_plan_cache plans[STARPU_NMAXWORKERS];
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static void band_filter_kernel_gpu(void *descr[], __attribute__((unused)) void *arg)
 {
 	cufftResult cures;
@@ -273,7 +273,7 @@ struct starpu_perfmodel_t band_filter_model = {
 
 static starpu_codelet band_filter_cl = {
 	.where = STARPU_CPU|STARPU_CUDA,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = band_filter_kernel_gpu,
 #endif
 	.cpu_func = band_filter_kernel_cpu,

+ 3 - 3
examples/axpy/axpy.c

@@ -48,7 +48,7 @@ void axpy_cpu(void *descr[], __attribute__((unused)) void *arg)
 	AXPY((int)n, alpha, block_x, 1, block_y, 1);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void axpy_gpu(void *descr[], __attribute__((unused)) void *arg)
 {
 	TYPE alpha = *((TYPE *)arg);
@@ -65,13 +65,13 @@ void axpy_gpu(void *descr[], __attribute__((unused)) void *arg)
 
 static starpu_codelet axpy_cl = {
         .where =
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
                 STARPU_CUDA|
 #endif
                 STARPU_CPU,
 
 	.cpu_func = axpy_cpu,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = axpy_gpu,
 #endif
 	.nbuffers = 2

+ 3 - 3
examples/cholesky/dw_cholesky.c

@@ -39,7 +39,7 @@ static starpu_codelet cl11 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = chol_cpu_codelet_update_u11,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = chol_cublas_codelet_update_u11,
 #endif
 	.nbuffers = 1,
@@ -74,7 +74,7 @@ static starpu_codelet cl21 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = chol_cpu_codelet_update_u21,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = chol_cublas_codelet_update_u21,
 #endif
 	.nbuffers = 2,
@@ -112,7 +112,7 @@ static starpu_codelet cl22 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = chol_cpu_codelet_update_u22,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = chol_cublas_codelet_update_u22,
 #endif
 	.nbuffers = 3,

+ 2 - 2
examples/cholesky/dw_cholesky.h

@@ -21,7 +21,7 @@
 #include <string.h>
 #include <math.h>
 #include <sys/time.h>
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cublas.h>
@@ -76,7 +76,7 @@ void chol_cpu_codelet_update_u11(void **, void *);
 void chol_cpu_codelet_update_u21(void **, void *);
 void chol_cpu_codelet_update_u22(void **, void *);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void chol_cublas_codelet_update_u11(void *descr[], void *_args);
 void chol_cublas_codelet_update_u21(void *descr[], void *_args);
 void chol_cublas_codelet_update_u22(void *descr[], void *_args);

+ 3 - 3
examples/cholesky/dw_cholesky_grain.c

@@ -39,7 +39,7 @@ static starpu_codelet cl11 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = chol_cpu_codelet_update_u11,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = chol_cublas_codelet_update_u11,
 #endif
 	.nbuffers = 1,
@@ -73,7 +73,7 @@ static starpu_codelet cl21 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = chol_cpu_codelet_update_u21,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = chol_cublas_codelet_update_u21,
 #endif
 	.nbuffers = 2,
@@ -111,7 +111,7 @@ static starpu_codelet cl22 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = chol_cpu_codelet_update_u22,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = chol_cublas_codelet_update_u22,
 #endif
 	.nbuffers = 3,

+ 10 - 10
examples/cholesky/dw_cholesky_kernels.c

@@ -17,7 +17,7 @@
 #include <starpu_config.h>
 #include "dw_cholesky.h"
 #include "../common/blas.h"
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cublas.h>
@@ -42,7 +42,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __at
 	unsigned ld12 = STARPU_GET_BLAS_LD(descr[1]);
 	unsigned ld22 = STARPU_GET_BLAS_LD(descr[2]);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cublasStatus st;
 #endif
 
@@ -51,7 +51,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __at
 			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
 				right, ld12, 1.0f, center, ld22);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			cublasSgemm('n', 't', dy, dx, dz, 
 					-1.0f, left, ld21, right, ld12, 
@@ -74,12 +74,12 @@ void chol_cpu_codelet_update_u22(void *descr[], void *_args)
 	chol_common_cpu_codelet_update_u22(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void chol_cublas_codelet_update_u22(void *descr[], void *_args)
 {
 	chol_common_cpu_codelet_update_u22(descr, 1, _args);
 }
-#endif// USE_CUDA
+#endif// STARPU_USE_CUDA
 
 /* 
  * U21
@@ -104,7 +104,7 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, __attrib
 		case 0:
 			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			cudaThreadSynchronize();
@@ -121,7 +121,7 @@ void chol_cpu_codelet_update_u21(void *descr[], void *_args)
 	 chol_common_codelet_update_u21(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void chol_cublas_codelet_update_u21(void *descr[], void *_args)
 {
 	chol_common_codelet_update_u21(descr, 1, _args);
@@ -168,7 +168,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			for (z = 0; z < nx; z++)
 			{
@@ -205,9 +205,9 @@ void chol_cpu_codelet_update_u11(void *descr[], void *_args)
 	chol_common_codelet_update_u11(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void chol_cublas_codelet_update_u11(void *descr[], void *_args)
 {
 	chol_common_codelet_update_u11(descr, 1, _args);
 }
-#endif// USE_CUDA
+#endif// STARPU_USE_CUDA

+ 6 - 6
examples/cholesky/dw_cholesky_no_stride.c

@@ -49,10 +49,10 @@ static starpu_codelet cl11 =
 {
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
 	.cpu_func = chol_cpu_codelet_update_u11,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = chol_cublas_codelet_update_u11,
 #endif
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 #ifdef SPU_FUNC_POTRF
 	.gordon_func = SPU_FUNC_POTRF,
 #else
@@ -96,10 +96,10 @@ static starpu_codelet cl21 =
 {
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
 	.cpu_func = chol_cpu_codelet_update_u21,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = chol_cublas_codelet_update_u21,
 #endif
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 #ifdef SPU_FUNC_STRSM
 	.gordon_func = SPU_FUNC_STRSM,
 #else
@@ -141,10 +141,10 @@ static starpu_codelet cl22 =
 {
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
 	.cpu_func = chol_cpu_codelet_update_u22,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = chol_cublas_codelet_update_u22,
 #endif
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 #ifdef SPU_FUNC_SGEMM
 	.gordon_func = SPU_FUNC_SGEMM,
 #else

+ 3 - 3
examples/common/blas.c

@@ -22,11 +22,11 @@
 
 /*
     This files contains BLAS wrappers for the different BLAS implementations
-  (eg. REFBLAS, ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
+  (eg. REFBLAS, STARPU_ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
   libraries do not supply C-based ordering.
  */
 
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 
 inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
 			float alpha, float *A, int lda, float *B, int ldb, 
@@ -232,7 +232,7 @@ void DSWAP(const int n, double *x, const int incx, double *y, const int incy)
 	cblas_dswap(n, x, incx, y, incy);
 }
 
-#elif defined(GOTO) || defined(SYSTEM_BLAS)
+#elif defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS)
 
 inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
 			float alpha, float *A, int lda, float *B, int ldb, 

+ 2 - 2
examples/common/blas.h

@@ -19,7 +19,7 @@
 
 #include <starpu.h>
 
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 #include <cblas.h>
 #endif
 
@@ -76,7 +76,7 @@ float SDOT(const int n, const float *x, const int incx, const float *y, const in
 void SSWAP(const int n, float *x, const int incx, float *y, const int incy);
 void DSWAP(const int n, double *x, const int incx, double *y, const int incy);
 
-#if defined(GOTO) || defined(SYSTEM_BLAS)
+#if defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS)
 
 extern void sgemm_ (const char *transa, const char *transb, const int *m,
                    const int *n, const int *k, const float *alpha, 

+ 2 - 2
examples/common/blas_model.h

@@ -24,9 +24,9 @@ double gemm_cost(starpu_buffer_descr *descr);
 static struct starpu_perfmodel_t sgemm_model = {
 	.cost_model = gemm_cost,
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = "sgemm_atlas"
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = "sgemm_goto"
 #else
 	.symbol = "sgemm"

+ 1 - 1
examples/gordon/null.h

@@ -16,7 +16,7 @@
 
 #include <starpu.h>
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 static inline unsigned load_gordon_null_kernel(void)
 {
 	unsigned elf_id =

+ 4 - 4
examples/heat/dw_factolu.c

@@ -34,7 +34,7 @@ static starpu_codelet cl11 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = dw_cpu_codelet_update_u11,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = dw_cublas_codelet_update_u11,
 #endif
 	.nbuffers = 1,
@@ -45,7 +45,7 @@ static starpu_codelet cl12 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = dw_cpu_codelet_update_u12,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = dw_cublas_codelet_update_u12,
 #endif
 	.nbuffers = 2,
@@ -56,7 +56,7 @@ static starpu_codelet cl21 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = dw_cpu_codelet_update_u21,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = dw_cublas_codelet_update_u21,
 #endif
 	.nbuffers = 2,
@@ -67,7 +67,7 @@ static starpu_codelet cl22 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = dw_cpu_codelet_update_u22,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = dw_cublas_codelet_update_u22,
 #endif
 	.nbuffers = 3,

+ 3 - 3
examples/heat/dw_factolu.h

@@ -21,9 +21,9 @@
 #include <string.h>
 #include <math.h>
 #include <sys/time.h>
-/* for USE_CUDA */
+/* for STARPU_USE_CUDA */
 #include <starpu_config.h>
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cublas.h>
@@ -189,7 +189,7 @@ void dw_cpu_codelet_update_u12(void **, void *);
 void dw_cpu_codelet_update_u21(void **, void *);
 void dw_cpu_codelet_update_u22(void **, void *);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void dw_cublas_codelet_update_u11(void *descr[], void *_args);
 void dw_cublas_codelet_update_u12(void *descr[], void *_args);
 void dw_cublas_codelet_update_u21(void *descr[], void *_args);

+ 4 - 4
examples/heat/dw_factolu_grain.c

@@ -43,7 +43,7 @@ static struct starpu_task *create_task(starpu_tag_t id)
 static starpu_codelet cl11 = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = dw_cpu_codelet_update_u11,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = dw_cublas_codelet_update_u11,
 #endif
 	.nbuffers = 1,
@@ -76,7 +76,7 @@ static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k,
 static starpu_codelet cl12 = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = dw_cpu_codelet_update_u12,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = dw_cublas_codelet_update_u12,
 #endif
 	.nbuffers = 2,
@@ -115,7 +115,7 @@ static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i, uns
 static starpu_codelet cl21 = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = dw_cpu_codelet_update_u21,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = dw_cublas_codelet_update_u21,
 #endif
 	.nbuffers = 2,
@@ -152,7 +152,7 @@ static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j, uns
 static starpu_codelet cl22 = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = dw_cpu_codelet_update_u22,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = dw_cublas_codelet_update_u22,
 #endif
 	.nbuffers = 3,

+ 14 - 14
examples/heat/dw_factolu_kernels.c

@@ -116,7 +116,7 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, __attr
 	unsigned ld21 = STARPU_GET_BLAS_LD(descr[1]);
 	unsigned ld22 = STARPU_GET_BLAS_LD(descr[2]);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cublasStatus status;
 #endif
 
@@ -127,7 +127,7 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, __attr
 					     1.0f, center, ld22);
 			break;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			cublasSgemm('n', 'n', dx, dy, dz, -1.0f, left, ld21,
 					right, ld12, 1.0f, center, ld22);
@@ -153,7 +153,7 @@ void dw_cpu_codelet_update_u22(void *descr[], void *_args)
 	count_22_per_worker[id]++;
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void dw_cublas_codelet_update_u22(void *descr[], void *_args)
 {
 	dw_common_cpu_codelet_update_u22(descr, 1, _args);
@@ -161,7 +161,7 @@ void dw_cublas_codelet_update_u22(void *descr[], void *_args)
 	int id = starpu_get_worker_id();
 	count_22_per_worker[id]++;
 }
-#endif// USE_CUDA
+#endif// STARPU_USE_CUDA
 
 /*
  * U12
@@ -180,7 +180,7 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, __attribut
 	unsigned nx12 = STARPU_GET_BLAS_NX(descr[1]);
 	unsigned ny12 = STARPU_GET_BLAS_NY(descr[1]);
 	
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cublasStatus status;
 #endif
 
@@ -190,7 +190,7 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, __attribut
 			STRSM("L", "L", "N", "N",
 					 nx12, ny12, 1.0f, sub11, ld11, sub12, ld12);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			cublasStrsm('L', 'L', 'N', 'N', ny12, nx12,
 					1.0f, sub11, ld11, sub12, ld12);
@@ -216,7 +216,7 @@ void dw_cpu_codelet_update_u12(void *descr[], void *_args)
 	count_12_per_worker[id]++;
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void dw_cublas_codelet_update_u12(void *descr[], void *_args)
 {
 	 dw_common_codelet_update_u12(descr, 1, _args);
@@ -224,7 +224,7 @@ void dw_cublas_codelet_update_u12(void *descr[], void *_args)
 	int id = starpu_get_worker_id();
 	count_12_per_worker[id]++;
 }
-#endif // USE_CUDA
+#endif // STARPU_USE_CUDA
 
 /* 
  * U21
@@ -243,7 +243,7 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, __attribut
 	unsigned nx21 = STARPU_GET_BLAS_NX(descr[1]);
 	unsigned ny21 = STARPU_GET_BLAS_NY(descr[1]);
 	
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cublasStatus status;
 #endif
 
@@ -251,7 +251,7 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, __attribut
 		case 0:
 			STRSM("R", "U", "N", "U", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			cublasStrsm('R', 'U', 'N', 'U', ny21, nx21, 1.0f, sub11, ld11, sub21, ld21);
 			status = cublasGetError();
@@ -276,7 +276,7 @@ void dw_cpu_codelet_update_u21(void *descr[], void *_args)
 	count_21_per_worker[id]++;
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void dw_cublas_codelet_update_u21(void *descr[], void *_args)
 {
 	dw_common_codelet_update_u21(descr, 1, _args);
@@ -334,7 +334,7 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, __attribut
 						&sub11[(z+1) + (z+1)*ld],ld);
 			}
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			for (z = 0; z < nx; z++)
 			{
@@ -371,7 +371,7 @@ void dw_cpu_codelet_update_u11(void *descr[], void *_args)
 	count_11_per_worker[id]++;
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void dw_cublas_codelet_update_u11(void *descr[], void *_args)
 {
 	dw_common_codelet_update_u11(descr, 1, _args);
@@ -379,4 +379,4 @@ void dw_cublas_codelet_update_u11(void *descr[], void *_args)
 	int id = starpu_get_worker_id();
 	count_11_per_worker[id]++;
 }
-#endif// USE_CUDA
+#endif// STARPU_USE_CUDA

+ 4 - 4
examples/heat/dw_factolu_tag.c

@@ -45,7 +45,7 @@ static struct starpu_task *create_task(starpu_tag_t id)
 static starpu_codelet cl11 = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = dw_cpu_codelet_update_u11,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = dw_cublas_codelet_update_u11,
 #endif
 	.nbuffers = 1,
@@ -79,7 +79,7 @@ static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k)
 static starpu_codelet cl12 = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = dw_cpu_codelet_update_u12,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = dw_cublas_codelet_update_u12,
 #endif
 	.nbuffers = 2,
@@ -118,7 +118,7 @@ static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i)
 static starpu_codelet cl21 = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = dw_cpu_codelet_update_u21,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = dw_cublas_codelet_update_u21,
 #endif
 	.nbuffers = 2,
@@ -155,7 +155,7 @@ static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
 static starpu_codelet cl22 = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = dw_cpu_codelet_update_u22,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = dw_cublas_codelet_update_u22,
 #endif
 	.nbuffers = 3,

+ 6 - 6
examples/heat/dw_sparse_cg.c

@@ -160,7 +160,7 @@ void init_cg(struct cg_problem *problem)
 	/* delta_new = trans(r) r */
 	struct starpu_task *task3 = create_task(3UL);
 	task3->cl->where = STARPU_CUDA|STARPU_CPU;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	task3->cl->cuda_func = cublas_codelet_func_3;
 #endif
 	task3->cl->cpu_func = cpu_codelet_func_3;
@@ -207,7 +207,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	/* alpha = delta_new / ( trans(d) q )*/
 	struct starpu_task *task5 = create_task(maskiter | 5UL);
 	task5->cl->where = STARPU_CUDA|STARPU_CPU;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	task5->cl->cuda_func = cublas_codelet_func_5;
 #endif
 	task5->cl->cpu_func = cpu_codelet_func_5;
@@ -223,7 +223,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	/* x = x + alpha d */
 	struct starpu_task *task6 = create_task(maskiter | 6UL);
 	task6->cl->where = STARPU_CUDA|STARPU_CPU;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	task6->cl->cuda_func = cublas_codelet_func_6;
 #endif
 	task6->cl->cpu_func = cpu_codelet_func_6;
@@ -239,7 +239,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	/* r = r - alpha q */
 	struct starpu_task *task7 = create_task(maskiter | 7UL);
 	task7->cl->where = STARPU_CUDA|STARPU_CPU;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	task7->cl->cuda_func = cublas_codelet_func_7;
 #endif
 	task7->cl->cpu_func = cpu_codelet_func_7;
@@ -255,7 +255,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	/* update delta_* and compute beta */
 	struct starpu_task *task8 = create_task(maskiter | 8UL);
 	task8->cl->where = STARPU_CUDA|STARPU_CPU;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	task8->cl->cuda_func = cublas_codelet_func_8;
 #endif
 	task8->cl->cpu_func = cpu_codelet_func_8;
@@ -269,7 +269,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	/* d = r + beta d */
 	struct starpu_task *task9 = create_task(maskiter | 9UL);
 	task9->cl->where = STARPU_CUDA|STARPU_CPU;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	task9->cl->cuda_func = cublas_codelet_func_9;
 #endif
 	task9->cl->cpu_func = cpu_codelet_func_9;

+ 1 - 1
examples/heat/dw_sparse_cg.h

@@ -30,7 +30,7 @@
 #include <starpu_config.h>
 #include <starpu.h>
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cublas.h>
 #endif
 

+ 6 - 6
examples/heat/dw_sparse_cg_kernels.c

@@ -135,7 +135,7 @@ void cpu_codelet_func_3(void *descr[], void *arg)
 	pb->delta_0 = dot;
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void cublas_codelet_func_3(void *descr[], void *arg)
 {
 	struct cg_problem *pb = arg;
@@ -226,7 +226,7 @@ void cpu_codelet_func_5(void *descr[], void *arg)
 	pb->alpha = pb->delta_new / dot;
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void cublas_codelet_func_5(void *descr[], void *arg)
 {
 	float dot;
@@ -271,7 +271,7 @@ void cpu_codelet_func_6(void *descr[], void *arg)
 	SAXPY(size, pb->alpha, vecd, 1, vecx, 1);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void cublas_codelet_func_6(void *descr[], void *arg)
 {
 	struct cg_problem *pb = arg;
@@ -310,7 +310,7 @@ void cpu_codelet_func_7(void *descr[], void *arg)
 	SAXPY(size, -pb->alpha, vecq, 1, vecr, 1);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void cublas_codelet_func_7(void *descr[], void *arg)
 {
 	struct cg_problem *pb = arg;
@@ -354,7 +354,7 @@ void cpu_codelet_func_8(void *descr[], void *arg)
 	pb->beta = pb->delta_new/pb->delta_old;
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void cublas_codelet_func_8(void *descr[], void *arg)
 {
 	float dot;
@@ -401,7 +401,7 @@ void cpu_codelet_func_9(void *descr[], void *arg)
 	SAXPY (size, 1.0f, vecr, 1, vecd, 1);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void cublas_codelet_func_9(void *descr[], void *arg)
 {
 	struct cg_problem *pb = arg;

+ 8 - 8
examples/heat/lu_kernels_model.c

@@ -219,9 +219,9 @@ struct starpu_perfmodel_t model_11 = {
 		[STARPU_CUDA_DEFAULT] = { .cost_model = task_11_cost_cuda }
 	},
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = "lu_model_11_atlas"
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = "lu_model_11_goto"
 #else
 	.symbol = "lu_model_11"
@@ -235,9 +235,9 @@ struct starpu_perfmodel_t model_12 = {
 		[STARPU_CUDA_DEFAULT] = { .cost_model = task_12_cost_cuda }
 	},
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = "lu_model_12_atlas"
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = "lu_model_12_goto"
 #else
 	.symbol = "lu_model_12"
@@ -251,9 +251,9 @@ struct starpu_perfmodel_t model_21 = {
 		[STARPU_CUDA_DEFAULT] = { .cost_model = task_21_cost_cuda }
 	},
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = "lu_model_21_atlas"
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = "lu_model_21_goto"
 #else
 	.symbol = "lu_model_21"
@@ -267,9 +267,9 @@ struct starpu_perfmodel_t model_22 = {
 		[STARPU_CUDA_DEFAULT] = { .cost_model = task_22_cost_cuda }
 	},
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = "lu_model_22_atlas"
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = "lu_model_22_goto"
 #else
 	.symbol = "lu_model_22"

+ 2 - 2
examples/incrementer/incrementer.c

@@ -19,7 +19,7 @@
 
 static unsigned niter = 50000;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
 #endif
 
@@ -50,7 +50,7 @@ int main(int argc, char **argv)
 		/* CUBLAS stands for CUDA kernels controlled from the host */
 		.where = STARPU_CPU|STARPU_CUDA,
 		.cpu_func = cpu_codelet,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		.cuda_func = cuda_codelet,
 #endif
 		.nbuffers = 1

+ 12 - 12
examples/lu/xlu.c

@@ -48,9 +48,9 @@ static struct starpu_task *create_task(starpu_tag_t id)
 
 static struct starpu_perfmodel_t STARPU_LU(model_11) = {
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = STARPU_LU_STR(lu_model_11_atlas)
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = STARPU_LU_STR(lu_model_11_goto)
 #else
 	.symbol = STARPU_LU_STR(lu_model_11)
@@ -60,7 +60,7 @@ static struct starpu_perfmodel_t STARPU_LU(model_11) = {
 static starpu_codelet cl11 = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = STARPU_LU(cpu_u11),
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPU_LU(cublas_u11),
 #endif
 	.nbuffers = 1,
@@ -93,9 +93,9 @@ static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k)
 
 static struct starpu_perfmodel_t STARPU_LU(model_12) = {
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = STARPU_LU_STR(lu_model_12_atlas)
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = STARPU_LU_STR(lu_model_12_goto)
 #else
 	.symbol = STARPU_LU_STR(lu_model_12)
@@ -105,7 +105,7 @@ static struct starpu_perfmodel_t STARPU_LU(model_12) = {
 static starpu_codelet cl12 = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = STARPU_LU(cpu_u12),
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPU_LU(cublas_u12),
 #endif
 	.nbuffers = 2,
@@ -143,9 +143,9 @@ static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned j)
 
 static struct starpu_perfmodel_t STARPU_LU(model_21) = {
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = STARPU_LU_STR(lu_model_21_atlas)
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = STARPU_LU_STR(lu_model_21_goto)
 #else
 	.symbol = STARPU_LU_STR(lu_model_21)
@@ -155,7 +155,7 @@ static struct starpu_perfmodel_t STARPU_LU(model_21) = {
 static starpu_codelet cl21 = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = STARPU_LU(cpu_u21),
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPU_LU(cublas_u21),
 #endif
 	.nbuffers = 2,
@@ -191,9 +191,9 @@ static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned i)
 
 static struct starpu_perfmodel_t STARPU_LU(model_22) = {
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = STARPU_LU_STR(lu_model_22_atlas)
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = STARPU_LU_STR(lu_model_22_goto)
 #else
 	.symbol = STARPU_LU_STR(lu_model_22)
@@ -203,7 +203,7 @@ static struct starpu_perfmodel_t STARPU_LU(model_22) = {
 static starpu_codelet cl22 = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = STARPU_LU(cpu_u22),
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPU_LU(cublas_u22),
 #endif
 	.nbuffers = 3,

+ 2 - 2
examples/lu/xlu.h

@@ -17,7 +17,7 @@
 #ifndef __XLU_H__
 #define __XLU_H__
 
-/* for USE_CUDA */
+/* for STARPU_USE_CUDA */
 #include <starpu_config.h>
 #include <starpu.h>
 
@@ -79,7 +79,7 @@ void dw_cpu_codelet_update_u12(void **, void *);
 void dw_cpu_codelet_update_u21(void **, void *);
 void dw_cpu_codelet_update_u22(void **, void *);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void dw_cublas_codelet_update_u11(void *descr[], void *_args);
 void dw_cublas_codelet_update_u12(void *descr[], void *_args);
 void dw_cublas_codelet_update_u21(void *descr[], void *_args);

+ 20 - 20
examples/lu/xlu_kernels.c

@@ -36,7 +36,7 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 	unsigned ld21 = STARPU_GET_BLAS_LD(descr[1]);
 	unsigned ld22 = STARPU_GET_BLAS_LD(descr[2]);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cublasStatus status;
 	cudaError_t cures;
 #endif
@@ -48,7 +48,7 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 				(TYPE)1.0, center, ld22);
 			break;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			CUBLAS_GEMM('n', 'n', dx, dy, dz,
 				(TYPE)-1.0, right, ld21, left, ld12,
@@ -74,12 +74,12 @@ void STARPU_LU(cpu_u22)(void *descr[], void *_args)
 	STARPU_LU(common_u22)(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void STARPU_LU(cublas_u22)(void *descr[], void *_args)
 {
 	STARPU_LU(common_u22)(descr, 1, _args);
 }
-#endif// USE_CUDA
+#endif// STARPU_USE_CUDA
 
 /*
  * U12
@@ -100,7 +100,7 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 	unsigned nx12 = STARPU_GET_BLAS_NX(descr[1]);
 	unsigned ny12 = STARPU_GET_BLAS_NY(descr[1]);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cublasStatus status;
 	cudaError_t cures;
 #endif
@@ -111,7 +111,7 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 			CPU_TRSM("L", "L", "N", "N", nx12, ny12,
 					(TYPE)1.0, sub11, ld11, sub12, ld12);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
 					(TYPE)1.0, sub11, ld11, sub12, ld12);
@@ -136,12 +136,12 @@ void STARPU_LU(cpu_u12)(void *descr[], void *_args)
 	STARPU_LU(common_u12)(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void STARPU_LU(cublas_u12)(void *descr[], void *_args)
 {
 	STARPU_LU(common_u12)(descr, 1, _args);
 }
-#endif // USE_CUDA
+#endif // STARPU_USE_CUDA
 
 /* 
  * U21
@@ -162,7 +162,7 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 	unsigned nx21 = STARPU_GET_BLAS_NX(descr[1]);
 	unsigned ny21 = STARPU_GET_BLAS_NY(descr[1]);
 	
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cublasStatus status;
 	cudaError_t cures;
 #endif
@@ -172,7 +172,7 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 			CPU_TRSM("R", "U", "N", "U", nx21, ny21,
 					(TYPE)1.0, sub11, ld11, sub21, ld21);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
 					(TYPE)1.0, sub11, ld11, sub21, ld21);
@@ -196,7 +196,7 @@ void STARPU_LU(cpu_u21)(void *descr[], void *_args)
 	STARPU_LU(common_u21)(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void STARPU_LU(cublas_u21)(void *descr[], void *_args)
 {
 	STARPU_LU(common_u21)(descr, 1, _args);
@@ -235,7 +235,7 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 						&sub11[(z+1) + (z+1)*ld],ld);
 			}
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			for (z = 0; z < nx; z++)
 			{
@@ -268,12 +268,12 @@ void STARPU_LU(cpu_u11)(void *descr[], void *_args)
 	STARPU_LU(common_u11)(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void STARPU_LU(cublas_u11)(void *descr[], void *_args)
 {
 	STARPU_LU(common_u11)(descr, 1, _args);
 }
-#endif// USE_CUDA
+#endif// STARPU_USE_CUDA
 
 /*
  *	U11 with pivoting
@@ -332,7 +332,7 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 			}
 
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			for (z = 0; z < nx; z++)
 			{
@@ -383,12 +383,12 @@ void STARPU_LU(cpu_u11_pivot)(void *descr[], void *_args)
 	STARPU_LU(common_u11_pivot)(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void STARPU_LU(cublas_u11_pivot)(void *descr[], void *_args)
 {
 	STARPU_LU(common_u11_pivot)(descr, 1, _args);
 }
-#endif// USE_CUDA
+#endif// STARPU_USE_CUDA
 
 /*
  *	Pivoting
@@ -421,7 +421,7 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 				}
 			}
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			for (row = 0; row < nx; row++)
 			{
@@ -447,11 +447,11 @@ void STARPU_LU(cpu_pivot)(void *descr[], void *_args)
 	STARPU_LU(common_pivot)(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void STARPU_LU(cublas_pivot)(void *descr[], void *_args)
 {
 	STARPU_LU(common_pivot)(descr, 1, _args);
 }
-#endif// USE_CUDA
+#endif// STARPU_USE_CUDA
 
 

+ 1 - 1
examples/lu/xlu_kernels.h

@@ -30,7 +30,7 @@ void STARPU_LU(cpu_u12)(void *descr[], void *_args);
 void STARPU_LU(cpu_u21)(void *descr[], void *_args);
 void STARPU_LU(cpu_u22)(void *descr[], void *_args);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void STARPU_LU(cublas_pivot)(void *descr[], void *_args);
 void STARPU_LU(cublas_u11_pivot)(void *descr[], void *_args);
 void STARPU_LU(cublas_u11)(void *descr[], void *_args);

+ 15 - 15
examples/lu/xlu_pivot.c

@@ -51,9 +51,9 @@ static struct starpu_task *create_task(starpu_tag_t id)
 
 static struct starpu_perfmodel_t STARPU_LU(model_pivot) = {
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = STARPU_LU_STR(lu_model_pivot_atlas)
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = STARPU_LU_STR(lu_model_pivot_goto)
 #else
 	.symbol = STARPU_LU_STR(lu_model_pivot)
@@ -63,7 +63,7 @@ static struct starpu_perfmodel_t STARPU_LU(model_pivot) = {
 static starpu_codelet cl_pivot = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = STARPU_LU(cpu_pivot),
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPU_LU(cublas_pivot),
 #endif
 	.nbuffers = 1,
@@ -118,9 +118,9 @@ static void create_task_pivot(starpu_data_handle *dataAp, unsigned nblocks,
 
 static struct starpu_perfmodel_t STARPU_LU(model_11_pivot) = {
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = STARPU_LU_STR(lu_model_11_pivot_atlas)
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = STARPU_LU_STR(lu_model_11_pivot_goto)
 #else
 	.symbol = STARPU_LU_STR(lu_model_11_pivot)
@@ -130,7 +130,7 @@ static struct starpu_perfmodel_t STARPU_LU(model_11_pivot) = {
 static starpu_codelet cl11_pivot = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = STARPU_LU(cpu_u11_pivot),
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPU_LU(cublas_u11_pivot),
 #endif
 	.nbuffers = 1,
@@ -165,9 +165,9 @@ static struct starpu_task *create_task_11_pivot(starpu_data_handle *dataAp, unsi
 
 static struct starpu_perfmodel_t STARPU_LU(model_12) = {
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = STARPU_LU_STR(lu_model_12_atlas)
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = STARPU_LU_STR(lu_model_12_goto)
 #else
 	.symbol = STARPU_LU_STR(lu_model_12)
@@ -177,7 +177,7 @@ static struct starpu_perfmodel_t STARPU_LU(model_12) = {
 static starpu_codelet cl12 = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = STARPU_LU(cpu_u12),
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPU_LU(cublas_u12),
 #endif
 	.nbuffers = 2,
@@ -221,9 +221,9 @@ static void create_task_12(starpu_data_handle *dataAp, unsigned nblocks, unsigne
 
 static struct starpu_perfmodel_t STARPU_LU(model_21) = {
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = STARPU_LU_STR(lu_model_21_atlas)
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = STARPU_LU_STR(lu_model_21_goto)
 #else
 	.symbol = STARPU_LU_STR(lu_model_21)
@@ -233,7 +233,7 @@ static struct starpu_perfmodel_t STARPU_LU(model_21) = {
 static starpu_codelet cl21 = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = STARPU_LU(cpu_u21),
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPU_LU(cublas_u21),
 #endif
 	.nbuffers = 2,
@@ -275,9 +275,9 @@ static void create_task_21(starpu_data_handle *dataAp, unsigned nblocks, unsigne
 
 static struct starpu_perfmodel_t STARPU_LU(model_22) = {
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = STARPU_LU_STR(lu_model_22_atlas)
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = STARPU_LU_STR(lu_model_22_goto)
 #else
 	.symbol = STARPU_LU_STR(lu_model_22)
@@ -287,7 +287,7 @@ static struct starpu_perfmodel_t STARPU_LU(model_22) = {
 static starpu_codelet cl22 = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = STARPU_LU(cpu_u22),
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPU_LU(cublas_u22),
 #endif
 	.nbuffers = 3,

+ 3 - 3
examples/mult/dw_mult.c

@@ -89,7 +89,7 @@ static void init_problem_data(void)
 {
 	unsigned i,j;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	if (pin) {
 		starpu_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(float));
 		starpu_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(float));
@@ -183,10 +183,10 @@ static void partition_mult_data(void)
 static starpu_codelet cl = {
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
 	.cpu_func = cpu_mult,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = cublas_mult,
 #endif
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 #ifdef SPU_FUNC_SGEMM
 	.gordon_func = SPU_FUNC_SGEMM,
 #else

+ 2 - 2
examples/mult/dw_mult.h

@@ -29,7 +29,7 @@
 
 #include <starpu.h>
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cublas.h>
 #endif
@@ -193,7 +193,7 @@ static void display_memory_consumption(void)
 		+ ydim*xdim*sizeof(float))/(1024*1024) );
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void cublas_mult(void *descr[], __attribute__((unused)) void *arg);
 #endif
 

+ 8 - 8
examples/mult/dw_mult_no_stride.c

@@ -15,7 +15,7 @@
  */
 
 #include "dw_mult.h"
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 #include "gordon/func_sgemm_ibm.h"
 #endif
 
@@ -178,7 +178,7 @@ static void init_problem_data(void)
 		}
 	}
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	conf.k = BLOCKSIZEZ;
 	conf.m = BLOCKSIZEY;
 	conf.n = BLOCKSIZEX;
@@ -228,10 +228,10 @@ struct cb2_s {
 
 static starpu_codelet cl = {
 	.cpu_func = cpu_mult,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = cublas_mult,
 #endif
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	/* .gordon_func will be set by load_elf_sgemm */
 #endif
 
@@ -240,7 +240,7 @@ static starpu_codelet cl = {
 	.nbuffers = 3
 };
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
 static unsigned spu_func_sgemm_elf_id;
 static unsigned spu_func_sgemm_ibm_id;
@@ -259,7 +259,7 @@ static void load_elf_sgemm(void)
 	cl.gordon_func = spu_func_sgemm_ibm_id;
 }
 
-#endif // USE_GORDON
+#endif // STARPU_USE_GORDON
 
 static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
 {
@@ -267,7 +267,7 @@ static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, un
 
 	task->cl = &cl;
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	task->cl_arg = &conf;
 	task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
 #endif
@@ -375,7 +375,7 @@ int main(__attribute__ ((unused)) int argc,
 
 	starpu_helper_init_cublas();
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	load_elf_sgemm();
 #endif
 

+ 7 - 7
examples/mult/dw_mult_no_stride_no_tag.c

@@ -15,7 +15,7 @@
  */
 
 #include "dw_mult.h"
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 #include "gordon/func_sgemm_ibm.h"
 #endif
 
@@ -194,7 +194,7 @@ static void init_problem_data(void)
 		}
 	}
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	conf.k = BLOCKSIZEZ;
 	conf.m = BLOCKSIZEY;
 	conf.n = BLOCKSIZEX;
@@ -282,17 +282,17 @@ struct cb2_s {
 static starpu_codelet cl = {
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
 	.cpu_func = cpu_mult,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = cublas_mult,
 #endif
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	/* .gordon_func will be set by load_elf_sgemm */
 #endif
 	.nbuffers = 3
 };
 
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
 static unsigned spu_func_sgemm_elf_id;
 static unsigned spu_func_sgemm_ibm_id;
@@ -329,7 +329,7 @@ static void construct_task(unsigned x, unsigned y, unsigned z, unsigned iter, st
 	task->callback_func = callback_func_3;
 	task->callback_arg = posp;
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	task->cl_arg = &conf;
 	task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
 #endif
@@ -403,7 +403,7 @@ int main(__attribute__ ((unused)) int argc,
 
 	starpu_helper_init_cublas();
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	load_elf_sgemm();
 #endif
 

+ 1 - 1
examples/mult/sgemm_kernels.c

@@ -39,7 +39,7 @@
 
 
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void cublas_mult(void *descr[], __attribute__((unused)) void *arg)
 {
 	COMMON_CODE

+ 4 - 4
examples/mult/xgemm.c

@@ -74,7 +74,7 @@ static void init_problem_data(void)
 {
 	unsigned i,j;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	if (pin) {
 		starpu_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
 		starpu_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
@@ -173,9 +173,9 @@ static void unpartition_mult_data(void)
 
 static struct starpu_perfmodel_t gemm_model = {
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = STARPU_GEMM_STR(gemm_atlas)
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = STARPU_GEMM_STR(gemm_goto)
 #else
 	.symbol = STARPU_GEMM_STR(gemm)
@@ -185,7 +185,7 @@ static struct starpu_perfmodel_t gemm_model = {
 static starpu_codelet cl = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = STARPU_GEMM(cpu_mult),
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPU_GEMM(cublas_mult),
 #endif
 	.model = &gemm_model,

+ 1 - 1
examples/mult/xgemm_kernels.c

@@ -39,7 +39,7 @@
 
 
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void STARPU_GEMM(cublas_mult)(void *descr[], __attribute__((unused)) void *arg)
 {
 	COMMON_CODE

+ 1 - 1
examples/pastix-wrappers/starpu-blas-wrapper.c

@@ -32,7 +32,7 @@
 
 #include <starpu.h>
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #endif
 

+ 1 - 1
examples/spmv/dw_block_spmv.c

@@ -117,7 +117,7 @@ unsigned totaltasks;
 starpu_codelet cl = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func =  cpu_block_spmv,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = cublas_block_spmv,
 #endif
 	.nbuffers = 3

+ 3 - 3
examples/spmv/dw_block_spmv.h

@@ -28,14 +28,14 @@
 
 #include <starpu.h>
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cublas.h>
 #endif
 
 void cpu_block_spmv(void *descr[], void *_args);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void cublas_block_spmv(void *descr[], void *_args);
-#endif // USE_CUDA
+#endif // STARPU_USE_CUDA
 
 #endif // __DW_BLOCK_SPMV_H__

+ 3 - 3
examples/spmv/dw_block_spmv_kernels.c

@@ -36,7 +36,7 @@ static inline void common_block_spmv(void *descr[], int s, __attribute__((unused
 		case 0:
 			cblas_sgemv(CblasRowMajor, CblasNoTrans, dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			cublasSgemv ('t', dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
 			break;
@@ -54,11 +54,11 @@ void cpu_block_spmv(void *descr[], void *_args)
 	common_block_spmv(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void cublas_block_spmv(void *descr[], void *_args)
 {
 //	printf("CUBLAS CODELET \n");
 
 	common_block_spmv(descr, 1, _args);
 }
-#endif// USE_CUDA
+#endif// STARPU_USE_CUDA

+ 3 - 3
examples/spmv/dw_spmv.c

@@ -26,7 +26,7 @@ struct timeval end;
 unsigned nblocks = 1;
 unsigned remainingtasks = -1;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 
 extern void spmv_kernel_cpu_wrapper(uint32_t nnz, uint32_t nrow, float *nzval,
 			uint32_t *colind, uint32_t *rowptr, uint32_t firstentry,
@@ -51,7 +51,7 @@ void spmv_kernel_cuda(void *descr[], void *args)
 	spmv_kernel_cpu_wrapper(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
 }
 
-#endif // USE_CUDA
+#endif // STARPU_USE_CUDA
 
 
 sem_t sem;
@@ -251,7 +251,7 @@ void call_spmv_codelet_filters(void)
 
 	cl->where = STARPU_CPU|STARPU_CUDA;
 	cl->cpu_func =  cpu_spmv;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cl->cuda_func = spmv_kernel_cuda;
 #endif
 	cl->nbuffers = 3;

+ 1 - 1
examples/starpufft/Makefile.am

@@ -37,7 +37,7 @@ libstarpufft_la_SOURCES = starpufft.c starpufftf.c starpufft-common.c
 libstarpufft_la_LIBADD = $(top_builddir)/src/libstarpu.la $(FFTW_LIBS) $(FFTWF_LIBS)
 libstarpufft_la_CFLAGS = $(FFTWF_CFLAGS)
 
-if USE_CUDA
+if STARPU_USE_CUDA
 # TODO define NVCCFLAGS
 NVCC ?= nvcc
 NVCCFLAGS += -Xcompiler -fPIC -Xlinker -fPIC

+ 2 - 2
examples/starpufft/double.h

@@ -21,7 +21,7 @@
 #include <fftw3.h>
 #endif
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cufft.h>
 #endif
 
@@ -30,7 +30,7 @@ typedef double real;
 typedef fftw_complex _fftw_complex;
 typedef fftw_plan _fftw_plan;
 #endif
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 typedef cuDoubleComplex _cuComplex;
 typedef cufftDoubleComplex _cufftComplex;
 #define _cufftExecC2C cufftExecZ2Z

+ 2 - 2
examples/starpufft/float.h

@@ -21,7 +21,7 @@
 #include <fftw3.h>
 #endif
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cufft.h>
 #endif
 
@@ -30,7 +30,7 @@ typedef float real;
 typedef fftwf_complex _fftw_complex;
 typedef fftwf_plan _fftw_plan;
 #endif
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 typedef cuComplex _cuComplex;
 typedef cufftComplex _cufftComplex;
 #define _cufftExecC2C cufftExecC2C

+ 7 - 7
examples/starpufft/starpufftx.c

@@ -23,7 +23,7 @@
 #include <config.h>
 
 #include "starpufft.h"
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #define _externC extern
 #include "cudax_kernels.h"
 #endif
@@ -79,7 +79,7 @@ struct STARPUFFT(plan) {
 	starpu_data_handle roots_handle[2];
 
 	struct {
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		cufftHandle plan1_cuda, plan2_cuda;
 		int initialized1, initialized2;
 		cudaStream_t stream;
@@ -109,7 +109,7 @@ struct STARPUFFT(args) {
 	int i, j, jj, kk, ll, *iv, *kkv;
 };
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 cudaStream_t
 STARPUFFT(get_local_stream)(STARPUFFT(plan) plan, int workerid)
 {
@@ -148,7 +148,7 @@ compute_roots(STARPUFFT(plan) plan)
 			plan->roots[dim][k] = cexp(exp*k);
 		starpu_register_vector_data(&plan->roots_handle[dim], 0, (uintptr_t) plan->roots[dim], plan->n[dim], sizeof(**plan->roots));
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		if (plan->n[dim] > 100000) {
 			/* prefetch the big root array on GPUs */
 			unsigned worker;
@@ -245,7 +245,7 @@ STARPUFFT(destroy_plan)(STARPUFFT(plan) plan)
 #endif
 			break;
 		case STARPU_CUDA_WORKER:
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 			/* FIXME: Can't deallocate */
 #endif
 			break;
@@ -319,7 +319,7 @@ STARPUFFT(destroy_plan)(STARPUFFT(plan) plan)
 void *
 STARPUFFT(malloc)(size_t n)
 {
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	void *res;
 	starpu_malloc_pinned_if_possible(&res, n);
 	return res;
@@ -335,7 +335,7 @@ STARPUFFT(malloc)(size_t n)
 void
 STARPUFFT(free)(void *p)
 {
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	// TODO: FIXME
 #else
 #  ifdef HAVE_FFTW

+ 9 - 9
examples/starpufft/starpufftx1d.c

@@ -18,7 +18,7 @@
 
 #define STEP_TAG_1D(plan, step, i) _STEP_TAG(plan, step, i)
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 /* Twist the full vector into a n2 chunk */
 static void
 STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
@@ -258,11 +258,11 @@ static struct starpu_perfmodel_t STARPUFFT(twist3_1d_model) = {
 
 static starpu_codelet STARPUFFT(twist1_1d_codelet) = {
 	.where =
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		STARPU_CUDA|
 #endif
 		STARPU_CPU,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPUFFT(twist1_1d_kernel_gpu),
 #endif
 	.cpu_func = STARPUFFT(twist1_1d_kernel_cpu),
@@ -272,14 +272,14 @@ static starpu_codelet STARPUFFT(twist1_1d_codelet) = {
 
 static starpu_codelet STARPUFFT(fft1_1d_codelet) = {
 	.where =
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		STARPU_CUDA|
 #endif
 #ifdef HAVE_FFTW
 		STARPU_CPU|
 #endif
 		0,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPUFFT(fft1_1d_kernel_gpu),
 #endif
 #ifdef HAVE_FFTW
@@ -298,14 +298,14 @@ static starpu_codelet STARPUFFT(twist2_1d_codelet) = {
 
 static starpu_codelet STARPUFFT(fft2_1d_codelet) = {
 	.where =
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		STARPU_CUDA|
 #endif
 #ifdef HAVE_FFTW
 		STARPU_CPU|
 #endif
 		0,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPUFFT(fft2_1d_kernel_gpu),
 #endif
 #ifdef HAVE_FFTW
@@ -343,7 +343,7 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 	 * - twist3: twist back into output
 	 */
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	/* cufft 1D limited to 8M elements */
 	while (n2 > 8 << 20) {
 		n1 *= 2;
@@ -417,7 +417,7 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 #endif
 			break;
 		case STARPU_CUDA_WORKER:
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 			plan->plans[workerid].initialized1 = 0;
 			plan->plans[workerid].initialized2 = 0;
 #endif

+ 10 - 10
examples/starpufft/starpufftx2d.c

@@ -22,7 +22,7 @@
 
 #define STEP_TAG_2D(plan, step, i, j) _STEP_TAG(plan, step, ((starpu_tag_t) i << I_SHIFT) | (starpu_tag_t) j)
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 /* Twist the full vector into a n2,m2 chunk */
 static void
 STARPUFFT(twist1_2d_kernel_gpu)(void *descr[], void *_args)
@@ -298,11 +298,11 @@ struct starpu_perfmodel_t STARPUFFT(twist3_2d_model) = {
 
 static starpu_codelet STARPUFFT(twist1_2d_codelet) = {
 	.where =
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		STARPU_CUDA|
 #endif
 		STARPU_CPU,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPUFFT(twist1_2d_kernel_gpu),
 #endif
 	.cpu_func = STARPUFFT(twist1_2d_kernel_cpu),
@@ -312,14 +312,14 @@ static starpu_codelet STARPUFFT(twist1_2d_codelet) = {
 
 static starpu_codelet STARPUFFT(fft1_2d_codelet) = {
 	.where =
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		STARPU_CUDA|
 #endif
 #ifdef HAVE_FFTW
 		STARPU_CPU|
 #endif
 		0,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPUFFT(fft1_2d_kernel_gpu),
 #endif
 #ifdef HAVE_FFTW
@@ -338,14 +338,14 @@ static starpu_codelet STARPUFFT(twist2_2d_codelet) = {
 
 static starpu_codelet STARPUFFT(fft2_2d_codelet) = {
 	.where =
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		STARPU_CUDA|
 #endif
 #ifdef HAVE_FFTW
 		STARPU_CPU|
 #endif
 		0,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPUFFT(fft2_2d_kernel_gpu),
 #endif
 #ifdef HAVE_FFTW
@@ -386,7 +386,7 @@ STARPUFFT(plan_dft_2d)(int n, int m, int sign, unsigned flags)
 	 * - twist3: twist back into output
 	 */
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	/* cufft 2D-3D limited to [2,16384] */
 	while (n2 > 16384) {
 		n1 *= 2;
@@ -397,7 +397,7 @@ STARPUFFT(plan_dft_2d)(int n, int m, int sign, unsigned flags)
 	STARPU_ASSERT(n1 < (1ULL << J_BITS));
 
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	/* cufft 2D-3D limited to [2,16384] */
 	while (m2 > 16384) {
 		m1 *= 2;
@@ -476,7 +476,7 @@ STARPUFFT(plan_dft_2d)(int n, int m, int sign, unsigned flags)
 #endif
 			break;
 		case STARPU_CUDA_WORKER:
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 			plan->plans[workerid].initialized1 = 0;
 			plan->plans[workerid].initialized2 = 0;
 #endif

+ 9 - 9
examples/starpufft/testx.c

@@ -26,12 +26,12 @@
 #include <starpu_config.h>
 #include "starpufft.h"
 
-#undef USE_CUDA
+#undef STARPU_USE_CUDA
 
 #ifdef HAVE_FFTW
 #include <fftw3.h>
 #endif
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cufft.h>
 #endif
 
@@ -48,7 +48,7 @@ int main(int argc, char *argv[]) {
 #ifdef HAVE_FFTW
 	_FFTW(plan) fftw_plan;
 #endif
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cufftHandle cuda_plan;
 	cudaError_t cures;
 #endif
@@ -89,7 +89,7 @@ int main(int argc, char *argv[]) {
 	STARPUFFT(complex) *out_fftw = STARPUFFT(malloc)(size * sizeof(*out_fftw));
 #endif
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	STARPUFFT(complex) *out_cuda = malloc(size * sizeof(*out_cuda));
 #endif
 
@@ -98,7 +98,7 @@ int main(int argc, char *argv[]) {
 #ifdef HAVE_FFTW
 		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);
 #endif
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS)
 			printf("erf\n");
 #endif
@@ -108,7 +108,7 @@ int main(int argc, char *argv[]) {
 #ifdef HAVE_FFTW
 		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
 #endif
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS);
 #endif
 	} else {
@@ -123,7 +123,7 @@ int main(int argc, char *argv[]) {
 	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
 	printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
 #endif
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	gettimeofday(&begin, NULL);
 	if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS)
 		printf("erf2\n");
@@ -185,7 +185,7 @@ int main(int argc, char *argv[]) {
 }
 #endif
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 {
 	double max = 0., tot = 0., norm = 0., normdiff = 0.;
 	for (i = 0; i < size; i++) {
@@ -220,7 +220,7 @@ int main(int argc, char *argv[]) {
 	STARPUFFT(free)(out_fftw);
 #endif
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	free(out_cuda);
 #endif
 

+ 5 - 5
examples/strassen/strassen.c

@@ -92,7 +92,7 @@ static starpu_codelet cl_add = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.model = &strassen_model_add_sub,
 	.cpu_func = add_cpu_codelet,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = add_cublas_codelet,
 #endif
 	.nbuffers = 3
@@ -102,7 +102,7 @@ static starpu_codelet cl_sub = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.model = &strassen_model_add_sub,
 	.cpu_func = sub_cpu_codelet,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = sub_cublas_codelet,
 #endif
 	.nbuffers = 3
@@ -112,7 +112,7 @@ static starpu_codelet cl_mult = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.model = &strassen_model_mult,
 	.cpu_func = mult_cpu_codelet,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = mult_cublas_codelet,
 #endif
 	.nbuffers = 3
@@ -122,7 +122,7 @@ static starpu_codelet cl_self_add = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.model = &strassen_model_self_add_sub,
 	.cpu_func = self_add_cpu_codelet,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = self_add_cublas_codelet,
 #endif
 	.nbuffers = 2
@@ -132,7 +132,7 @@ static starpu_codelet cl_self_sub = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.model = &strassen_model_self_add_sub,
 	.cpu_func = self_sub_cpu_codelet,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = self_sub_cublas_codelet,
 #endif
 	.nbuffers = 2

+ 2 - 2
examples/strassen/strassen.h

@@ -27,7 +27,7 @@
 #include <cblas.h>
 
 #include <starpu_config.h>
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cublas.h>
 #endif
@@ -97,7 +97,7 @@ void add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
 void self_add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
 void self_sub_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void mult_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);
 void sub_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);
 void add_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);

+ 6 - 6
examples/strassen/strassen_kernels.c

@@ -37,7 +37,7 @@ static void mult_common_codelet(void *descr[], int s, __attribute__((unused))  v
 				dy, dx, dz, -1.0f, left, ld21, right, ld12,
 					     1.0f, center, ld22);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			cublasSgemm('t', 'n', dx, dy, dz, 
 					-1.0f, right, ld12, left, ld21, 
@@ -56,7 +56,7 @@ void mult_cpu_codelet(void *descr[], void *_args)
 	mult_common_codelet(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void mult_cublas_codelet(void *descr[], void *_args)
 {
 	mult_common_codelet(descr, 1, _args);
@@ -92,7 +92,7 @@ static void add_sub_common_codelet(void *descr[], int s, __attribute__((unused))
 				cblas_saxpy(dx, alpha, &B[line*ldB], 1, &C[line*ldC], 1);
 			}
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			for (line = 0; line < dy; line++)
 			{
@@ -122,7 +122,7 @@ void add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg)
 	add_sub_common_codelet(descr, 0, arg, 1.0f);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void sub_cublas_codelet(void *descr[], __attribute__((unused))  void *arg)
 {
 	add_sub_common_codelet(descr, 1, arg, -1.0f);
@@ -160,7 +160,7 @@ static void self_add_sub_common_codelet(void *descr[], int s, __attribute__((unu
 				cblas_saxpy(dx, alpha, &A[line*ldA], 1, &C[line*ldC], 1);
 			}
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			for (line = 0; line < dy; line++)
 			{
@@ -191,7 +191,7 @@ void self_sub_cpu_codelet(void *descr[], __attribute__((unused))  void *arg)
 	self_add_sub_common_codelet(descr, 0, arg, -1.0f);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void self_add_cublas_codelet(void *descr[], __attribute__((unused))  void *arg)
 {
 	self_add_sub_common_codelet(descr, 1, arg, 1.0f);

+ 9 - 9
examples/strassen2/strassen2.c

@@ -83,7 +83,7 @@ extern void add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
 extern void self_add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
 extern void self_sub_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 extern void mult_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);
 extern void sub_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);
 extern void add_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);
@@ -210,7 +210,7 @@ static starpu_codelet cl_add = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.model = &strassen_model_add,
 	.cpu_func = add_cpu_codelet,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = add_cublas_codelet,
 #endif
 	.nbuffers = 3
@@ -220,7 +220,7 @@ static starpu_codelet cl_sub = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.model = &strassen_model_sub,
 	.cpu_func = sub_cpu_codelet,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = sub_cublas_codelet,
 #endif
 	.nbuffers = 3
@@ -230,7 +230,7 @@ static starpu_codelet cl_mult = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.model = &strassen_model_mult,
 	.cpu_func = mult_cpu_codelet,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = mult_cublas_codelet,
 #endif
 	.nbuffers = 3
@@ -276,7 +276,7 @@ static starpu_codelet cl_self_add = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.model = &strassen_model_self_add,
 	.cpu_func = self_add_cpu_codelet,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = self_add_cublas_codelet,
 #endif
 	.nbuffers = 2
@@ -286,7 +286,7 @@ static starpu_codelet cl_self_sub = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.model = &strassen_model_self_sub,
 	.cpu_func = self_sub_cpu_codelet,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = self_sub_cublas_codelet,
 #endif
 	.nbuffers = 2
@@ -346,7 +346,7 @@ static starpu_codelet cleanup_codelet = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.model = NULL,
 	.cpu_func = null_codelet,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = null_codelet,
 #endif
 	.nbuffers = 0
@@ -722,7 +722,7 @@ static starpu_codelet dummy_codelet = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.model = NULL,
 	.cpu_func = dummy_codelet_func,
-	#ifdef USE_CUDA
+	#ifdef STARPU_USE_CUDA
 	.cuda_func = dummy_codelet_func,
 	#endif
 	.nbuffers = 0
@@ -787,7 +787,7 @@ int main(int argc, char **argv)
 
 	starpu_helper_init_cublas();
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
         if (pin) {
                 starpu_malloc_pinned_if_possible((void **)&bigbuffer, used_mem_predicted);
         } else

+ 10 - 10
examples/strassen2/strassen2_kernels.c

@@ -24,7 +24,7 @@
 #include <semaphore.h>
 
 #include <starpu_config.h>
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cublas.h>
 #endif
 
@@ -65,7 +65,7 @@ static void mult_common_codelet(void *descr[], int s, __attribute__((unused))  v
 
 	double flop = 2.0*n*n*n;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cublasStatus cublasres;
 #endif
 
@@ -74,7 +74,7 @@ static void mult_common_codelet(void *descr[], int s, __attribute__((unused))  v
 			cpus_flop += flop;
 			SGEMM("N", "N", n, n, n, 1.0f, right, ld21, left, ld12, 0.0f, center, ld22);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			cublas_flop += flop;
 
@@ -95,7 +95,7 @@ void mult_cpu_codelet(void *descr[], void *_args)
 	mult_common_codelet(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void mult_cublas_codelet(void *descr[], void *_args)
 {
 	mult_common_codelet(descr, 1, _args);
@@ -121,7 +121,7 @@ static void add_sub_common_codelet(void *descr[], int s, __attribute__((unused))
 	// TODO check dim ...
 
 	unsigned line;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cublasStatus cublasres;
 #endif
 
@@ -136,7 +136,7 @@ static void add_sub_common_codelet(void *descr[], int s, __attribute__((unused))
 				SAXPY(n, alpha, &B[line*ldB], 1, &C[line*ldC], 1);
 			}
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			cublas_flop += flop;
 			for (line = 0; line < n; line++)
@@ -171,7 +171,7 @@ void add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg)
 	add_sub_common_codelet(descr, 0, arg, 1.0f);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void sub_cublas_codelet(void *descr[], __attribute__((unused))  void *arg)
 {
 	add_sub_common_codelet(descr, 1, arg, -1.0f);
@@ -202,7 +202,7 @@ static void self_add_sub_common_codelet(void *descr[], int s, __attribute__((unu
 	
 	unsigned line;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cublasStatus cublasres;
 #endif
 
@@ -215,7 +215,7 @@ static void self_add_sub_common_codelet(void *descr[], int s, __attribute__((unu
 				SAXPY(n, alpha, &A[line*ldA], 1, &C[line*ldC], 1);
 			}
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			cublas_flop += flop;
 			for (line = 0; line < n; line++)
@@ -247,7 +247,7 @@ void self_sub_cpu_codelet(void *descr[], __attribute__((unused))  void *arg)
 	self_add_sub_common_codelet(descr, 0, arg, -1.0f);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void self_add_cublas_codelet(void *descr[], __attribute__((unused))  void *arg)
 {
 	self_add_sub_common_codelet(descr, 1, arg, 1.0f);

+ 3 - 3
examples/tag_example/tag_example.c

@@ -23,7 +23,7 @@
 
 #include <starpu.h>
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 #include <gordon/null.h>
 #endif
 
@@ -190,7 +190,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 {
 	starpu_init(NULL);
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	/* load an empty kernel and get its identifier */
 	unsigned gordon_null_kernel = load_gordon_null_kernel();
 #endif
@@ -202,7 +202,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON;
 	cl.cpu_func = cpu_codelet;
 	cl.cuda_func = cpu_codelet;
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	cl.gordon_func = gordon_null_kernel;
 #endif
 	cl.nbuffers = 0;

+ 3 - 3
examples/tag_example/tag_example2.c

@@ -23,7 +23,7 @@
 
 #include <starpu.h>
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 #include <gordon/null.h>
 #endif
 
@@ -111,7 +111,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
 	starpu_init(NULL);
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	/* load an empty kernel and get its identifier */
 	unsigned gordon_null_kernel = load_gordon_null_kernel();
 #endif
@@ -120,7 +120,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
 	cl.cpu_func = cpu_codelet;
 	cl.cuda_func = cpu_codelet;
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	cl.gordon_func = gordon_null_kernel;
 #endif
 	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON;

+ 3 - 3
examples/tag_example/tag_example3.c

@@ -23,7 +23,7 @@
 
 #include <starpu.h>
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 #include <gordon/null.h>
 #endif
 
@@ -110,7 +110,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
 	starpu_init(NULL);
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	/* load an empty kernel and get its identifier */
 	unsigned gordon_null_kernel = load_gordon_null_kernel();
 #endif
@@ -119,7 +119,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
 	cl.cpu_func = cpu_codelet;
 	cl.cuda_func = cpu_codelet;
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	cl.gordon_func = gordon_null_kernel;
 #endif
 	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON;

+ 3 - 3
examples/tag_example/tag_restartable.c

@@ -24,7 +24,7 @@
 
 #include <starpu.h>
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 #include <gordon/null.h>
 #endif
 
@@ -116,7 +116,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
 	starpu_init(NULL);
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	/* load an empty kernel and get its identifier */
 	unsigned gordon_null_kernel = load_gordon_null_kernel();
 #endif
@@ -125,7 +125,7 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
 	cl.cpu_func = cpu_codelet;
 	cl.cuda_func = cpu_codelet;
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	cl.gordon_func = gordon_null_kernel;
 #endif
 	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON;

+ 1 - 1
include/starpu-task.h

@@ -21,7 +21,7 @@
 #include <starpu_config.h>
 #include <starpu.h>
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #endif
 

+ 4 - 4
include/starpu-util.h

@@ -24,7 +24,7 @@
 #include <starpu_config.h>
 #include <starpu-task.h>
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime_api.h>
 #include <cublas.h>
@@ -123,7 +123,7 @@ STARPU_ATOMIC_SOMETHING(or, old | value)
 #error __sync_synchronize is not available
 #endif
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 
 #define CUBLAS_REPORT_ERROR(status) 					\
 	do {								\
@@ -251,7 +251,7 @@ STARPU_ATOMIC_SOMETHING(or, old | value)
 		assert(0);						\
 	} while (0)  
 
-#endif // USE_CUDA
+#endif // STARPU_USE_CUDA
 
 static inline int starpu_get_env_number(const char *str)
 {
@@ -294,7 +294,7 @@ void starpu_execute_on_each_worker(void (*func)(void *), void *arg, uint32_t whe
 void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps,
 				void (*callback)(void *), void *callback_arg);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 cudaStream_t *starpu_get_local_cuda_stream(void);
 #endif
 

+ 6 - 6
include/starpu_config.h.in

@@ -1,13 +1,13 @@
 #ifndef __STARPU_CONFIG_PUBLIC_H__
 #define __STARPU_CONFIG_PUBLIC_H__
 
-#undef USE_CPU
-#undef USE_CUDA
-#undef USE_GORDON
+#undef STARPU_USE_CPU
+#undef STARPU_USE_CUDA
+#undef STARPU_USE_GORDON
 
-#undef ATLAS
-#undef GOTO
-#undef SYSTEM_BLAS
+#undef STARPU_ATLAS
+#undef STARPU_GOTO
+#undef STARPU_SYSTEM_BLAS
 
 #undef STARPUDIR
 

+ 2 - 2
mpi/Makefile.am

@@ -29,7 +29,7 @@ mpiexamplebindir=$(libdir)/starpu/mpi/
 
 examplebin_PROGRAMS =
 
-if USE_CUDA
+if STARPU_USE_CUDA
 # TODO define NVCCFLAGS
 NVCC ?= nvcc
 
@@ -171,7 +171,7 @@ tests_block_interface_pinned_LDADD =			\
 tests_block_interface_pinned_SOURCES =			\
 	tests/block_interface_pinned.c
 
-if USE_CUDA
+if STARPU_USE_CUDA
 tests_ring_SOURCES += tests/ring_kernel.cu
 tests_ring_async_SOURCES += tests/ring_kernel.cu
 endif

+ 1 - 1
mpi/examples/mpi_lu/pxlu.h

@@ -17,7 +17,7 @@
 #ifndef __PXLU_H__
 #define __PXLU_H__
 
-/* for USE_CUDA */
+/* for STARPU_USE_CUDA */
 #include <starpu_config.h>
 #include <starpu.h>
 

+ 26 - 26
mpi/examples/mpi_lu/pxlu_kernels.c

@@ -47,7 +47,7 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 	fprintf(stderr, "KERNEL 22 %d - k = %d i = %d j = %d\n", rank, info->k, info->i, info->j);
 #endif
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cublasStatus status;
 	cudaError_t cures;
 #endif
@@ -59,7 +59,7 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 				(TYPE)1.0, center, ld22);
 			break;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			CUBLAS_GEMM('n', 'n', dx, dy, dz,
 				(TYPE)-1.0, right, ld21, left, ld12,
@@ -88,18 +88,18 @@ static void STARPU_PLU(cpu_u22)(void *descr[], void *_args)
 	STARPU_PLU(common_u22)(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static void STARPU_PLU(cublas_u22)(void *descr[], void *_args)
 {
 	STARPU_PLU(common_u22)(descr, 1, _args);
 }
-#endif// USE_CUDA
+#endif// STARPU_USE_CUDA
 
 static struct starpu_perfmodel_t STARPU_PLU(model_22) = {
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_22_atlas)
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = STARPU_PLU_STR(lu_model_22_goto)
 #else
 	.symbol = STARPU_PLU_STR(lu_model_22)
@@ -109,7 +109,7 @@ static struct starpu_perfmodel_t STARPU_PLU(model_22) = {
 starpu_codelet STARPU_PLU(cl22) = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = STARPU_PLU(cpu_u22),
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPU_PLU(cublas_u22),
 #endif
 	.nbuffers = 3,
@@ -153,7 +153,7 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 	STARPU_PLU(display_data_content)(sub12, nx12);
 #endif
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cublasStatus status;
 	cudaError_t cures;
 #endif
@@ -164,7 +164,7 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 			CPU_TRSM("L", "L", "N", "N", nx12, ny12,
 					(TYPE)1.0, sub11, ld11, sub12, ld12);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
 					(TYPE)1.0, sub11, ld11, sub12, ld12);
@@ -195,18 +195,18 @@ static void STARPU_PLU(cpu_u12)(void *descr[], void *_args)
 	STARPU_PLU(common_u12)(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static void STARPU_PLU(cublas_u12)(void *descr[], void *_args)
 {
 	STARPU_PLU(common_u12)(descr, 1, _args);
 }
-#endif // USE_CUDA
+#endif // STARPU_USE_CUDA
 
 static struct starpu_perfmodel_t STARPU_PLU(model_12) = {
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_12_atlas)
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = STARPU_PLU_STR(lu_model_12_goto)
 #else
 	.symbol = STARPU_PLU_STR(lu_model_12)
@@ -216,7 +216,7 @@ static struct starpu_perfmodel_t STARPU_PLU(model_12) = {
 starpu_codelet STARPU_PLU(cl12) = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = STARPU_PLU(cpu_u12),
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPU_PLU(cublas_u12),
 #endif
 	.nbuffers = 2,
@@ -260,7 +260,7 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 	STARPU_PLU(display_data_content)(sub21, nx21);
 #endif
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cublasStatus status;
 	cudaError_t cures;
 #endif
@@ -271,7 +271,7 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 			CPU_TRSM("R", "U", "N", "U", nx21, ny21,
 					(TYPE)1.0, sub11, ld11, sub21, ld21);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
 					(TYPE)1.0, sub11, ld11, sub21, ld21);
@@ -304,7 +304,7 @@ static void STARPU_PLU(cpu_u21)(void *descr[], void *_args)
 	STARPU_PLU(common_u21)(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static void STARPU_PLU(cublas_u21)(void *descr[], void *_args)
 {
 	STARPU_PLU(common_u21)(descr, 1, _args);
@@ -313,9 +313,9 @@ static void STARPU_PLU(cublas_u21)(void *descr[], void *_args)
 
 static struct starpu_perfmodel_t STARPU_PLU(model_21) = {
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_21_atlas)
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = STARPU_PLU_STR(lu_model_21_goto)
 #else
 	.symbol = STARPU_PLU_STR(lu_model_21)
@@ -325,7 +325,7 @@ static struct starpu_perfmodel_t STARPU_PLU(model_21) = {
 starpu_codelet STARPU_PLU(cl21) = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = STARPU_PLU(cpu_u21),
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPU_PLU(cublas_u21),
 #endif
 	.nbuffers = 2,
@@ -373,7 +373,7 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 						&sub11[(z+1) + (z+1)*ld],ld);
 			}
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case 1:
 			for (z = 0; z < nx; z++)
 			{
@@ -409,18 +409,18 @@ static void STARPU_PLU(cpu_u11)(void *descr[], void *_args)
 	STARPU_PLU(common_u11)(descr, 0, _args);
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static void STARPU_PLU(cublas_u11)(void *descr[], void *_args)
 {
 	STARPU_PLU(common_u11)(descr, 1, _args);
 }
-#endif// USE_CUDA
+#endif// STARPU_USE_CUDA
 
 static struct starpu_perfmodel_t STARPU_PLU(model_11) = {
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_11_atlas)
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = STARPU_PLU_STR(lu_model_11_goto)
 #else
 	.symbol = STARPU_PLU_STR(lu_model_11)
@@ -430,7 +430,7 @@ static struct starpu_perfmodel_t STARPU_PLU(model_11) = {
 starpu_codelet STARPU_PLU(cl11) = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = STARPU_PLU(cpu_u11),
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = STARPU_PLU(cublas_u11),
 #endif
 	.nbuffers = 1,

+ 2 - 2
mpi/tests/ring.c

@@ -21,7 +21,7 @@
 unsigned token = 42;
 starpu_data_handle token_handle;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 #endif
 
@@ -33,7 +33,7 @@ void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
 static starpu_codelet increment_cl = {
 	.where = STARPU_CPU|STARPU_CUDA,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = increment_cuda,
 #endif
 	.cpu_func = increment_cpu,

+ 2 - 2
mpi/tests/ring_async.c

@@ -21,7 +21,7 @@
 unsigned token = 42;
 starpu_data_handle token_handle;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 #endif
 
@@ -33,7 +33,7 @@ void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
 static starpu_codelet increment_cl = {
 	.where = STARPU_CPU|STARPU_CUDA,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = increment_cuda,
 #endif
 	.cpu_func = increment_cpu,

+ 3 - 3
src/Makefile.am

@@ -139,14 +139,14 @@ libstarpu_la_SOURCES = 						\
 	util/starpu_cublas.c					\
 	util/file.c
 
-if USE_CPU
+if STARPU_USE_CPU
 libstarpu_la_SOURCES += drivers/cpu/driver_cpu.c
 endif
 
-if USE_CUDA
+if STARPU_USE_CUDA
 libstarpu_la_SOURCES += drivers/cuda/driver_cuda.c
 endif
 
-if USE_GORDON
+if STARPU_USE_GORDON
 libstarpu_la_SOURCES += drivers/gordon/driver_gordon.c
 endif

+ 1 - 1
src/core/jobs.h

@@ -35,7 +35,7 @@
 #include <core/perfmodel/perfmodel.h>
 #include <core/errorcheck.h>
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #endif
 

+ 1 - 1
src/core/perfmodel/perfmodel.h

@@ -103,7 +103,7 @@ void create_sampling_directory_if_needed(void);
 void load_bus_performance_files(void);
 double predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 int *get_gpu_affinity_vector(unsigned gpuid);
 #endif
  

+ 6 - 6
src/core/perfmodel/perfmodel_bus.c

@@ -14,7 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
 #endif
@@ -49,7 +49,7 @@ static int affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
 
 /* Benchmarking the performance of the bus */
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
 static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
 
@@ -195,7 +195,7 @@ static void measure_bandwith_between_host_and_dev(int dev, unsigned ncpus)
 
 static void benchmark_all_cuda_devices(void)
 {
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	int ret;
 
 #ifdef VERBOSE
@@ -270,7 +270,7 @@ static void load_bus_affinity_file_content(void)
 	f = fopen(path, "r");
 	STARPU_ASSERT(f);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	struct machine_config_s *config = _starpu_get_machine_config();
 	unsigned ncpus = _starpu_topology_get_nhwcpu(config);
 
@@ -320,7 +320,7 @@ static void write_bus_affinity_file_content(void)
 		STARPU_ABORT();
 	}
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	struct machine_config_s *config = _starpu_get_machine_config();
 	unsigned ncpus = _starpu_topology_get_nhwcpu(config);
 	unsigned cpu;
@@ -566,7 +566,7 @@ static void write_bus_bandwith_file_content(void)
 			{
 				bandwith = -1.0;
 			}
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 			else if (src != dst)
 			{
 			/* Bandwith = (SIZE)/(time i -> ram + time ram -> j)*/

+ 6 - 6
src/core/topology.c

@@ -34,7 +34,7 @@ static unsigned topology_is_initialized = 0;
 
 static void _starpu_initialize_workers_bindid(struct machine_config_s *config);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static void _starpu_initialize_workers_gpuid(struct machine_config_s *config);
 static unsigned may_bind_automatically = 0;
 #endif
@@ -43,7 +43,7 @@ static unsigned may_bind_automatically = 0;
  * Discover the topology of the machine
  */
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static void _starpu_initialize_workers_gpuid(struct machine_config_s *config)
 {
 	char *strval;
@@ -173,7 +173,7 @@ static int _starpu_init_machine_config(struct machine_config_s *config,
 
 	_starpu_initialize_workers_bindid(config);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	if (user_conf && (user_conf->ncuda == 0))
 	{
 		/* the user explicitely disabled CUDA */
@@ -222,7 +222,7 @@ static int _starpu_init_machine_config(struct machine_config_s *config,
 	config->nworkers += config->ncudagpus;
 #endif
 	
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	if (user_conf && (user_conf->ncuda != -1)) {
 		explicitval = user_conf->ncuda;
 	}
@@ -486,13 +486,13 @@ static void _starpu_init_workers_binding(struct machine_config_s *config)
 				is_a_set_of_accelerators = 0;
 				memory_node = ram_memory_node;
 				break;
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 			case STARPU_GORDON_WORKER:
 				is_a_set_of_accelerators = 1;
 				memory_node = ram_memory_node;
 				break;
 #endif
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 			case STARPU_CUDA_WORKER:
 				if (may_bind_automatically)
 				{

+ 4 - 4
src/core/workers.c

@@ -60,7 +60,7 @@ inline uint32_t _starpu_worker_may_execute_task(unsigned workerid, uint32_t wher
  * Runtime initialization methods
  */
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 static unsigned gordon_inited = 0;	
 static struct worker_set_s gordon_worker_set;
 #endif
@@ -134,7 +134,7 @@ static void _starpu_init_workers(struct machine_config_s *config)
 						NULL, _starpu_cpu_worker, workerarg);
 				break;
 #endif
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 			case STARPU_CUDA_WORKER:
 				workerarg->set = NULL;
 				workerarg->worker_is_initialized = 0;
@@ -143,7 +143,7 @@ static void _starpu_init_workers(struct machine_config_s *config)
 
 				break;
 #endif
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 			case STARPU_GORDON_WORKER:
 				/* we will only launch gordon once, but it will handle 
 				 * the different SPU workers */
@@ -189,7 +189,7 @@ static void _starpu_init_workers(struct machine_config_s *config)
 					pthread_cond_wait(&workerarg->ready_cond, &workerarg->mutex);
 				pthread_mutex_unlock(&workerarg->mutex);
 				break;
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 			case STARPU_GORDON_WORKER:
 				/* the initialization of Gordon worker is
 				 * synchronous for now */

+ 2 - 2
src/core/workers.h

@@ -38,11 +38,11 @@
 #include <hwloc.h>
 #endif
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <drivers/cuda/driver_cuda.h>
 #endif
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 #include <drivers/gordon/driver_gordon.h>
 #endif
 

+ 7 - 7
src/datawizard/copy-driver.c

@@ -90,7 +90,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, uint32_t src_node
 	STARPU_ASSERT(handle->per_node[src_node].allocated);
 	STARPU_ASSERT(handle->per_node[dst_node].allocated);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 cudaError_t cures;
 cudaStream_t *stream;
 #endif
@@ -103,7 +103,7 @@ cudaStream_t *stream;
 				STARPU_ASSERT(copy_methods->ram_to_ram);
 				copy_methods->ram_to_ram(handle, src_node, dst_node);
 				break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 			case CUDA_RAM:
 				/* CUBLAS_RAM -> RAM */
 				/* only the proper CUBLAS thread can initiate this ! */
@@ -144,7 +144,7 @@ cudaStream_t *stream;
 				break;
 		}
 		break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	case CUDA_RAM:
 		switch (src_kind) {
 			case RAM:
@@ -254,13 +254,13 @@ void driver_wait_request_completion(starpu_async_channel *async_channel __attrib
 					unsigned handling_node)
 {
 	node_kind kind = get_node_kind(handling_node);
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cudaEvent_t event;
 	cudaError_t cures;
 #endif
 
 	switch (kind) {
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			event = (*async_channel).cuda_event;
 
@@ -285,12 +285,12 @@ unsigned driver_test_request_completion(starpu_async_channel *async_channel __at
 {
 	node_kind kind = get_node_kind(handling_node);
 	unsigned success;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cudaEvent_t event;
 #endif
 
 	switch (kind) {
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			event = (*async_channel).cuda_event;
 

+ 3 - 3
src/datawizard/copy-driver.h

@@ -22,7 +22,7 @@
 #include "coherency.h"
 #include "memalloc.h"
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cublas.h>
@@ -34,7 +34,7 @@ struct data_request_s;
  * transfer has terminated or not */
 typedef union {
 	int dummy;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cudaEvent_t cuda_event;
 #endif
 } starpu_async_channel;
@@ -55,7 +55,7 @@ struct copy_data_methods_s {
 	int (*spu_to_cuda)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 	int (*spu_to_spu)(starpu_data_handle handle, uint32_t src, uint32_t dst);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	/* for asynchronous CUDA transfers */
 	int (*ram_to_cuda_async)(starpu_data_handle handle, uint32_t src,
 					uint32_t dst, cudaStream_t *stream);

+ 8 - 8
src/datawizard/interfaces/bcsr_interface.c

@@ -27,7 +27,7 @@
  */
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #endif
@@ -35,7 +35,7 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 static const struct copy_data_methods_s bcsr_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
 	.ram_to_spu = NULL,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
 #endif
@@ -258,7 +258,7 @@ static size_t allocate_bcsr_buffer_on_node(starpu_data_handle handle, uint32_t d
 				goto fail_rowptr;
 
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			cudaMalloc((void **)&addr_nzval, nnz*r*c*elemsize);
 			if (!addr_nzval)
@@ -293,7 +293,7 @@ fail_rowptr:
 	switch(kind) {
 		case RAM:
 			free((void *)addr_colind);
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			cudaFree((void*)addr_colind);
 			break;
@@ -306,7 +306,7 @@ fail_colind:
 	switch(kind) {
 		case RAM:
 			free((void *)addr_nzval);
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			cudaFree((void*)addr_nzval);
 			break;
@@ -334,7 +334,7 @@ static void liberate_bcsr_buffer_on_node(void *interface, uint32_t node)
 			free((void*)bcsr_interface->colind);
 			free((void*)bcsr_interface->rowptr);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			cudaFree((void*)bcsr_interface->nzval);
 			cudaFree((void*)bcsr_interface->colind);
@@ -346,7 +346,7 @@ static void liberate_bcsr_buffer_on_node(void *interface, uint32_t node)
 	}
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_bcsr_interface_t *src_bcsr;
@@ -418,7 +418,7 @@ static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32
 
 	return 0;
 }
-#endif // USE_CUDA
+#endif // STARPU_USE_CUDA
 
 /* as not all platform easily have a BLAS lib installed ... */
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)

+ 12 - 12
src/datawizard/interfaces/blas_interface.c

@@ -22,13 +22,13 @@
 
 #include <common/hash.h>
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
 #endif
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
@@ -38,7 +38,7 @@ static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node,
 static const struct copy_data_methods_s blas_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
 	.ram_to_spu = NULL,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
@@ -57,7 +57,7 @@ static void liberate_blas_buffer_on_node(void *interface, uint32_t node);
 static size_t blas_interface_get_size(starpu_data_handle handle);
 static uint32_t footprint_blas_interface_crc32(starpu_data_handle handle);
 static void display_blas_interface(starpu_data_handle handle, FILE *f);
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 static int convert_blas_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss); 
 #endif
 
@@ -68,7 +68,7 @@ struct starpu_data_interface_ops_t interface_blas_ops = {
 	.copy_methods = &blas_copy_data_methods_s,
 	.get_size = blas_interface_get_size,
 	.footprint = footprint_blas_interface_crc32,
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	.convert_to_gordon = convert_blas_to_gordon,
 #endif
 	.interfaceid = STARPU_BLAS_INTERFACE_ID, 
@@ -76,7 +76,7 @@ struct starpu_data_interface_ops_t interface_blas_ops = {
 	.display = display_blas_interface
 };
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 static int convert_blas_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
 {
 	size_t elemsize = GET_BLAS_ELEMSIZE(interface);
@@ -219,7 +219,7 @@ static size_t allocate_blas_buffer_on_node(starpu_data_handle handle, uint32_t d
 	unsigned fail = 0;
 	size_t allocated_memory;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cudaError_t status;
 	size_t pitch;
 #endif
@@ -241,7 +241,7 @@ static size_t allocate_blas_buffer_on_node(starpu_data_handle handle, uint32_t d
 				fail = 1;
 
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			status = cudaMallocPitch((void **)&addr, &pitch, (size_t)nx*elemsize, (size_t)ny);
 			if (!addr || status != cudaSuccess)
@@ -279,7 +279,7 @@ static void liberate_blas_buffer_on_node(void *interface, uint32_t node)
 {
 	starpu_blas_interface_t *blas_interface = interface;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cudaError_t status;
 #endif
 
@@ -288,7 +288,7 @@ static void liberate_blas_buffer_on_node(void *interface, uint32_t node)
 		case RAM:
 			free((void*)blas_interface->ptr);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			status = cudaFree((void*)blas_interface->ptr);			
 			if (STARPU_UNLIKELY(status))
@@ -301,7 +301,7 @@ static void liberate_blas_buffer_on_node(void *interface, uint32_t node)
 	}
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_blas_interface_t *src_blas;
@@ -421,7 +421,7 @@ static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node,
 	return EAGAIN;
 }
 
-#endif // USE_CUDA
+#endif // STARPU_USE_CUDA
 
 /* as not all platform easily have a BLAS lib installed ... */
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)

+ 11 - 11
src/datawizard/interfaces/block_interface.c

@@ -23,7 +23,7 @@
 #include <common/hash.h>
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
@@ -33,7 +33,7 @@ static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node,
 static const struct copy_data_methods_s block_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
 	.ram_to_spu = NULL,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
@@ -53,7 +53,7 @@ static void liberate_block_buffer_on_node(void *interface, uint32_t node);
 static size_t block_interface_get_size(starpu_data_handle handle);
 static uint32_t footprint_block_interface_crc32(starpu_data_handle handle);
 static void display_block_interface(starpu_data_handle handle, FILE *f);
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 static int convert_block_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss);
 #endif
 
@@ -64,7 +64,7 @@ struct starpu_data_interface_ops_t interface_block_ops = {
 	.copy_methods = &block_copy_data_methods_s,
 	.get_size = block_interface_get_size,
 	.footprint = footprint_block_interface_crc32,
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	.convert_to_gordon = convert_block_to_gordon,
 #endif
 	.interfaceid = STARPU_BLOCK_INTERFACE_ID, 
@@ -72,7 +72,7 @@ struct starpu_data_interface_ops_t interface_block_ops = {
 	.display = display_block_interface
 };
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 int convert_block_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
 {
 	/* TODO */
@@ -242,7 +242,7 @@ static size_t allocate_block_buffer_on_node(starpu_data_handle handle, uint32_t
 	unsigned fail = 0;
 	size_t allocated_memory;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cudaError_t status;
 #endif
 	starpu_block_interface_t *dst_block =
@@ -262,7 +262,7 @@ static size_t allocate_block_buffer_on_node(starpu_data_handle handle, uint32_t
 				fail = 1;
 
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			status = cudaMalloc((void **)&addr, nx*ny*nz*elemsize);
 
@@ -302,7 +302,7 @@ static void liberate_block_buffer_on_node(void *interface, uint32_t node)
 {
 	starpu_block_interface_t *block_interface = interface;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cudaError_t status;
 #endif
 
@@ -311,7 +311,7 @@ static void liberate_block_buffer_on_node(void *interface, uint32_t node)
 		case RAM:
 			free((void*)block_interface->ptr);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			status = cudaFree((void*)block_interface->ptr);
 			if (STARPU_UNLIKELY(status))
@@ -324,7 +324,7 @@ static void liberate_block_buffer_on_node(void *interface, uint32_t node)
 	}
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 	cudaError_t cures;
@@ -658,7 +658,7 @@ static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32
 
 	return 0;
 }
-#endif // USE_CUDA
+#endif // STARPU_USE_CUDA
 
 /* as not all platform easily have a BLAS lib installed ... */
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)

+ 8 - 8
src/datawizard/interfaces/csr_interface.c

@@ -24,7 +24,7 @@
 
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #endif
@@ -32,7 +32,7 @@ static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32
 static const struct copy_data_methods_s csr_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
 	.ram_to_spu = NULL,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
 #endif
@@ -230,7 +230,7 @@ static size_t allocate_csr_buffer_on_node(starpu_data_handle handle, uint32_t ds
 				goto fail_rowptr;
 
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			cudaMalloc((void **)&addr_nzval, nnz*elemsize);
 			if (!addr_nzval)
@@ -265,7 +265,7 @@ fail_rowptr:
 	switch(kind) {
 		case RAM:
 			free((void *)addr_colind);
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			cudaFree((void*)addr_colind);
 			break;
@@ -278,7 +278,7 @@ fail_colind:
 	switch(kind) {
 		case RAM:
 			free((void *)addr_nzval);
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			cudaFree((void*)addr_nzval);
 			break;
@@ -306,7 +306,7 @@ static void liberate_csr_buffer_on_node(void *interface, uint32_t node)
 			free((void*)csr_interface->colind);
 			free((void*)csr_interface->rowptr);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			cudaFree((void*)csr_interface->nzval);
 			cudaFree((void*)csr_interface->colind);
@@ -318,7 +318,7 @@ static void liberate_csr_buffer_on_node(void *interface, uint32_t node)
 	}
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_csr_interface_t *src_csr;
@@ -384,7 +384,7 @@ static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32
 
 	return 0;
 }
-#endif // USE_CUDA
+#endif // STARPU_USE_CUDA
 
 /* as not all platform easily have a BLAS lib installed ... */
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)

+ 2 - 2
src/datawizard/interfaces/data_interface.h

@@ -20,7 +20,7 @@
 #include <starpu.h>
 #include <common/config.h>
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 /* to get the gordon_strideSize_t data structure from gordon */
 #include <gordon.h>
 #endif
@@ -34,7 +34,7 @@ struct starpu_data_interface_ops_t {
 	size_t (*get_size)(starpu_data_handle handle);
 	uint32_t (*footprint)(starpu_data_handle handle);
 	void (*display)(starpu_data_handle handle, FILE *f);
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	int (*convert_to_gordon)(void *interface, uint64_t *ptr, gordon_strideSize_t *ss); 
 #endif
 	/* an identifier that is unique to each interface */

+ 11 - 11
src/datawizard/interfaces/vector_interface.c

@@ -22,12 +22,12 @@
 
 #include <common/hash.h>
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #endif
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
@@ -37,7 +37,7 @@ static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node,
 static const struct copy_data_methods_s vector_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
 	.ram_to_spu = NULL,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
@@ -56,7 +56,7 @@ static void liberate_vector_buffer_on_node(void *interface, uint32_t node);
 static size_t vector_interface_get_size(starpu_data_handle handle);
 static uint32_t footprint_vector_interface_crc32(starpu_data_handle handle);
 static void display_vector_interface(starpu_data_handle handle, FILE *f);
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 static int convert_vector_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss); 
 #endif
 
@@ -67,7 +67,7 @@ struct starpu_data_interface_ops_t interface_vector_ops = {
 	.copy_methods = &vector_copy_data_methods_s,
 	.get_size = vector_interface_get_size,
 	.footprint = footprint_vector_interface_crc32,
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	.convert_to_gordon = convert_vector_to_gordon,
 #endif
 	.interfaceid = STARPU_VECTOR_INTERFACE_ID,
@@ -97,7 +97,7 @@ static void register_vector_handle(starpu_data_handle handle, uint32_t home_node
 	}
 }
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 int convert_vector_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
 {
 	starpu_vector_interface_t *vector_interface = interface;
@@ -194,7 +194,7 @@ static size_t allocate_vector_buffer_on_node(starpu_data_handle handle, uint32_t
 
 	node_kind kind = get_node_kind(dst_node);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	cudaError_t status;
 #endif
 
@@ -204,7 +204,7 @@ static size_t allocate_vector_buffer_on_node(starpu_data_handle handle, uint32_t
 			if (!addr)
 				fail = 1;
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			status = cudaMalloc((void **)&addr, nx*elemsize);
 			if (!addr || (status != cudaSuccess))
@@ -241,7 +241,7 @@ static void liberate_vector_buffer_on_node(void *interface, uint32_t node)
 		case RAM:
 			free((void*)vector_interface->ptr);
 			break;
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		case CUDA_RAM:
 			cudaFree((void*)vector_interface->ptr);
 			break;
@@ -251,7 +251,7 @@ static void liberate_vector_buffer_on_node(void *interface, uint32_t node)
 	}
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_vector_interface_t *src_vector;
@@ -348,7 +348,7 @@ static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node,
 }
 
 
-#endif // USE_CUDA
+#endif // STARPU_USE_CUDA
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {

+ 1 - 1
src/datawizard/memory_nodes.h

@@ -20,7 +20,7 @@
 #include "coherency.h"
 #include "memalloc.h"
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cublas.h>
 #endif
 

+ 2 - 2
src/drivers/cuda/driver_cuda.h

@@ -21,7 +21,7 @@
 #include <math.h>
 #include <stdio.h>
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime_api.h>
 #include <cublas.h>
@@ -38,7 +38,7 @@
 
 unsigned get_cuda_device_count(void);
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void _starpu_init_cuda(void);
 void *_starpu_cuda_worker(void *);
 #endif

+ 2 - 2
src/task-models/blas_model.c

@@ -49,9 +49,9 @@ static double gemm_cost(starpu_buffer_descr *descr)
 struct starpu_perfmodel_t sgemm_model = {
 	.cost_model = gemm_cost,
 	.type = STARPU_HISTORY_BASED,
-#ifdef ATLAS
+#ifdef STARPU_ATLAS
 	.symbol = "sgemm_atlas"
-#elif defined(GOTO)
+#elif defined(STARPU_GOTO)
 	.symbol = "sgemm_goto"
 #else
 	.symbol = "sgemm"

+ 5 - 5
src/util/malloc.c

@@ -20,11 +20,11 @@
 #include <common/config.h>
 #include <starpu.h>
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #endif
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 struct malloc_pinned_codelet_struct {
 	void **ptr;
 	size_t dim;
@@ -57,7 +57,7 @@ int starpu_malloc_pinned_if_possible(void **A, size_t dim)
 
 	if (may_submit_cuda_task())
 	{
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		int push_res;
 	
 		struct malloc_pinned_codelet_struct s = {
@@ -85,7 +85,7 @@ int starpu_malloc_pinned_if_possible(void **A, size_t dim)
 	return 0;
 }
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static void free_pinned_codelet(void *buffers[] __attribute__((unused)), void *arg)
 {
 	cudaError_t cures;
@@ -109,7 +109,7 @@ int starpu_free_pinned_if_possible(void *A)
 
 	if (may_submit_cuda_task())
 	{
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 		int push_res;
 	
 		struct starpu_task *task = starpu_task_create();

+ 3 - 3
src/util/starpu_cublas.c

@@ -17,7 +17,7 @@
 #include <starpu.h>
 #include <common/config.h>
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static void init_cublas_func(void *args __attribute__((unused)))
 {
 	cublasStatus cublasst = cublasInit();
@@ -33,14 +33,14 @@ static void shutdown_cublas_func(void *args __attribute__((unused)))
 
 void starpu_helper_init_cublas(void)
 {
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	starpu_execute_on_each_worker(init_cublas_func, NULL, STARPU_CUDA);
 #endif
 }
 
 void starpu_helper_shutdown_cublas(void)
 {
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	starpu_execute_on_each_worker(shutdown_cublas_func, NULL, STARPU_CUDA);
 #endif
 }

+ 4 - 4
tests/Makefile.am

@@ -28,7 +28,7 @@ CLEANFILES = 					\
 
 BUILT_SOURCES =
 
-if USE_CUDA
+if STARPU_USE_CUDA
 
 # TODO define NVCCFLAGS
 NVCC ?= nvcc
@@ -45,7 +45,7 @@ NVCCFLAGS += -I$(top_srcdir)/include/
 
 endif
 
-if USE_GORDON
+if STARPU_USE_GORDON
 
 SPU_CC ?= spu-gcc
 SPU_LD ?= spu-ld
@@ -200,12 +200,12 @@ microbenchs_redundant_buffer_SOURCES =		\
 microbenchs_local_pingpong_SOURCES =		\
 	microbenchs/local_pingpong.c
 
-if USE_CUDA
+if STARPU_USE_CUDA
 datawizard_sync_and_notify_data_SOURCES +=	\
 	datawizard/sync_and_notify_data_kernels.cu
 endif
 
-if USE_GORDON
+if STARPU_USE_GORDON
 datawizard_sync_and_notify_data_SOURCES +=	\
 	datawizard/sync_and_notify_data_gordon_kernels.c
 

+ 2 - 2
tests/core/starpu_wait_all_tasks.c

@@ -31,7 +31,7 @@ static starpu_codelet dummy_codelet =
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
 	.cpu_func = dummy_func,
 	.cuda_func = dummy_func,
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	.gordon_func = 0, /* this will be defined later */
 #endif
 	.model = NULL,
@@ -40,7 +40,7 @@ static starpu_codelet dummy_codelet =
 
 static void init_gordon_kernel(void)
 {
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	unsigned elf_id = 
 		gordon_register_elf_plugin("./microbenchs/null_kernel_gordon.spuelf");
 	gordon_load_plugin_on_all_spu(elf_id);

+ 7 - 7
tests/datawizard/sync_and_notify_data.c

@@ -19,7 +19,7 @@
 #include <errno.h>
 #include <starpu.h>
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 #include <gordon.h>
 #endif
 
@@ -41,7 +41,7 @@
  *  v == (kn, k, kn)
  */
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 void cuda_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args);
 void cuda_codelet_incC(void *descr[], __attribute__ ((unused)) void *_args);
 #endif
@@ -67,7 +67,7 @@ int main(int argc, char **argv)
 {
 	starpu_init(NULL);
 
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 	unsigned elf_id = gordon_register_elf_plugin("./datawizard/sync_and_notify_data_gordon_kernels.spuelf");
 	gordon_load_plugin_on_all_spu(elf_id);
 
@@ -93,10 +93,10 @@ int main(int argc, char **argv)
 			starpu_codelet cl_inc_a = {
 				.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
 				.cpu_func = cpu_codelet_incA,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 				.cuda_func = cuda_codelet_incA,
 #endif
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 				.gordon_func = kernel_incA_id,
 #endif
 				.nbuffers = 1
@@ -129,10 +129,10 @@ int main(int argc, char **argv)
 			starpu_codelet cl_inc_c = {
 				.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
 				.cpu_func = cpu_codelet_incC,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 				.cuda_func = cuda_codelet_incC,
 #endif
-#ifdef USE_GORDON
+#ifdef STARPU_USE_GORDON
 				.gordon_func = kernel_incC_id,
 #endif
 				.nbuffers = 1

+ 1 - 1
tests/datawizard/sync_with_data_with_mem.c

@@ -35,7 +35,7 @@ static void dummy_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 static starpu_codelet cl = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = dummy_codelet,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = dummy_codelet,
 #endif
 	.nbuffers = 1

+ 1 - 1
tests/datawizard/sync_with_data_with_mem_non_blocking.c

@@ -35,7 +35,7 @@ static void dummy_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 static starpu_codelet cl = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = dummy_codelet,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = dummy_codelet,
 #endif
 	.nbuffers = 1

+ 1 - 1
tests/datawizard/unpartition.c

@@ -34,7 +34,7 @@ static void dummy_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 static starpu_codelet cl = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = dummy_codelet,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = dummy_codelet,
 #endif
 	.nbuffers = 1

+ 2 - 2
tests/datawizard/write_only_tmp_buffer.c

@@ -24,7 +24,7 @@
 
 starpu_data_handle v_handle;
 
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
 {
 	char *buf = (char *)STARPU_GET_VECTOR_PTR(descr[0]);
@@ -53,7 +53,7 @@ static void display_var(void *descr[], __attribute__ ((unused)) void *_args)
 static starpu_codelet cl = {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = cpu_codelet_null,
-#ifdef USE_CUDA
+#ifdef STARPU_USE_CUDA
 	.cuda_func = cuda_codelet_null,
 #endif
 	.nbuffers = 1

+ 1 - 1
tests/heat/deps.sh

@@ -35,7 +35,7 @@ trace_deps()
 		filename=$TIMINGDIR/deps.v$DEPS.$size
 		#rm -f $filename
 		make clean 1> /dev/null 2> /dev/null
-		make examples -j ATLAS=1 CPUS=$MAXCPU CUDA=1 1> /dev/null 2> /dev/null
+		make examples -j STARPU_ATLAS=1 CPUS=$MAXCPU CUDA=1 1> /dev/null 2> /dev/null
 		cd $DIR
 		
 		for iter in `seq 1 $maxiter`

+ 1 - 1
tests/heat/gflops-sched.sh

@@ -106,7 +106,7 @@ trace_size()
 cd $ROOTDIR
 
 make clean 1> /dev/null 2> /dev/null
-make examples ATLAS=1 CUDA=1 CPUS=3 1> /dev/null 2> log
+make examples STARPU_ATLAS=1 CUDA=1 CPUS=3 1> /dev/null 2> log
 
 cd $DIR
 

+ 1 - 1
tests/heat/gflops.sh

@@ -105,7 +105,7 @@ trace_size()
 cd $ROOTDIR
 
 make clean 1> /dev/null 2> /dev/null
-make ATLAS=1 CUDA=1 CPUS=3 1> /dev/null 2> /dev/null
+make STARPU_ATLAS=1 CUDA=1 CPUS=3 1> /dev/null 2> /dev/null
 
 cd $DIR
 

+ 1 - 1
tests/heat/heat.sh

@@ -25,7 +25,7 @@ mkdir -p $TIMINGDIR
 cd $ROOTDIR
 
 make clean 1> /dev/null 2> /dev/null
-make ATLAS=1 CUDA=1 CPUS=4 1> /dev/null 2> /dev/null
+make STARPU_ATLAS=1 CUDA=1 CPUS=4 1> /dev/null 2> /dev/null
 
 BLOCKS=8
 

+ 1 - 1
tests/heat/model-perturbation.sh

@@ -42,7 +42,7 @@ trace_perturbation()
 		filename=$TIMINGDIR/pertubate.$size.$AMPL
 		#rm -f $filename
 		make clean 1> /dev/null 2> /dev/null
-		make examples -j ATLAS=1 CPUS=$MAXCPU CUDA=1 PERTURB_AMPL=$AMPL 1> /dev/null 2> /dev/null
+		make examples -j STARPU_ATLAS=1 CPUS=$MAXCPU CUDA=1 PERTURB_AMPL=$AMPL 1> /dev/null 2> /dev/null
 		cd $DIR
 
 		if [ $size -le 16384 ]

+ 1 - 1
tests/heat/speedup.sh

@@ -25,7 +25,7 @@ mkdir -p $TIMINGDIR
 cd $ROOTDIR
 
 make clean 1> /dev/null 2> /dev/null
-make ATLAS=1 CPUS=16 1> /dev/null 2> /dev/null
+make STARPU_ATLAS=1 CPUS=16 1> /dev/null 2> /dev/null
 
 echo "speedup ..."
 

+ 1 - 1
tests/memory/memstress.sh

@@ -57,7 +57,7 @@ trace_stress()
 cd $ROOTDIR
 
 make clean 1> /dev/null 2> /dev/null
-make examples ATLAS=1 CUDA=1 CPUS=3 1> /dev/null #2> /dev/null
+make examples STARPU_ATLAS=1 CUDA=1 CPUS=3 1> /dev/null #2> /dev/null
 
 cd $DIR
 

+ 1 - 1
tests/memory/memstress2.sh

@@ -56,7 +56,7 @@ trace_stress()
 cd $ROOTDIR
 
 make clean 1> /dev/null 2> /dev/null
-make examples ATLAS=1 CUDA=1 CPUS=0 1> /dev/null 2> /dev/null
+make examples STARPU_ATLAS=1 CUDA=1 CPUS=0 1> /dev/null 2> /dev/null
 
 cd $DIR
 

+ 0 - 0
tests/microbenchs/async-tasks-overhead.c


Some files were not shown because too many files changed in this diff