浏览代码

Do not compile CUDA code in double mode if the double type is not available. Dynamically rule out CUDA devices which do not support compute capability >= 1.3.

Samuel Thibault 13 年之前
父节点
当前提交
8a29b06414
共有 5 个文件被更改,包括 79 次插入30 次删除
  1. 4 1
      starpufft/double.h
  2. 4 1
      starpufft/float.h
  3. 32 1
      starpufft/starpufftx.c
  4. 19 13
      starpufft/starpufftx1d.c
  5. 20 14
      starpufft/starpufftx2d.c

+ 4 - 1
starpufft/double.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -26,6 +26,9 @@
 #include <cufft.h>
 #endif
 
+#undef  FLOAT	/* make sure the single-precision selector is not left defined */
+#define DOUBLE	/* precision selector for this header, which typedefs real as double */
+
 typedef double real;
 #ifdef STARPU_HAVE_FFTW
 typedef fftw_complex _fftw_complex;

+ 4 - 1
starpufft/float.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -26,6 +26,9 @@
 #include <cufft.h>
 #endif
 
+#undef  DOUBLE	/* make sure the double-precision selector is not left defined */
+#define FLOAT	/* precision selector for this header, which typedefs real as float */
+
 typedef float real;
 #ifdef STARPU_HAVE_FFTW
 typedef fftwf_complex _fftw_complex;

+ 32 - 1
starpufft/starpufftx.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -29,6 +29,13 @@
 #ifdef STARPU_USE_CUDA
 #define _externC extern
 #include "cudax_kernels.h"
+/* Inside the STARPU_USE_CUDA section: define __STARPU_USE_CUDA when FLOAT is
+ * defined or when STARPU_HAVE_CUFFTDOUBLECOMPLEX is defined, and undefine it
+ * otherwise.  The CUDA kernels and codelet entries of the included 1D/2D
+ * sources are guarded by __STARPU_USE_CUDA.
+ * NOTE(review): identifiers starting with a double underscore are reserved
+ * for the implementation in C -- consider renaming. */
+#if defined(FLOAT) || defined(STARPU_HAVE_CUFFTDOUBLECOMPLEX)
+#  define __STARPU_USE_CUDA
+#else
+#  undef __STARPU_USE_CUDA
+#endif
+
 #endif
 
 #define _FFTW_FLAGS FFTW_ESTIMATE
@@ -166,6 +173,30 @@ compute_roots(STARPUFFT(plan) plan)
 	}
 }
 
+/* Only CUDA capability >= 1.3 supports doubles, rule old cards out.
+ *
+ * can_execute() is only compiled when DOUBLE is defined.  It returns 1 when
+ * the worker identified by workerid is accepted and 0 when it is rejected:
+ *  - a worker of type STARPU_CPU_WORKER is always accepted;
+ *  - when STARPU_USE_CUDA is defined, any other worker is looked up with
+ *    starpu_cuda_get_device_properties() and accepted only if the reported
+ *    major/minor version passes the check below;
+ *  - when STARPU_USE_CUDA is not defined, any other worker is rejected.
+ * The task and nimpl parameters are not used.
+ *
+ * CAN_EXECUTE expands to the ".can_execute = can_execute," designated
+ * initializer when DOUBLE is defined, and to nothing otherwise, so it can be
+ * dropped into a codelet initializer in either mode.
+ */
+#ifdef DOUBLE
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl) {
+	if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+		return 1;
+#ifdef STARPU_USE_CUDA
+	{
+		/* Not a CPU worker: handled as a CUDA device.
+		 * NOTE(review): this assumes only CPU and CUDA workers ever reach
+		 * this callback -- confirm against the codelets' .where masks. */
+		const struct cudaDeviceProp *props;
+		props = starpu_cuda_get_device_properties(workerid);
+		if (props->major >= 2 || props->minor >= 3)
+			/* At least compute capability 1.3, supports doubles.
+			 * The minor number is compared without looking at major;
+			 * presumably major is never below 1 here -- TODO confirm. */
+			return 1;
+		/* Old card does not support doubles */
+		return 0;
+	}
+#endif
+	return 0;
+}
+#define CAN_EXECUTE .can_execute = can_execute,
+#else
+#define CAN_EXECUTE
+#endif
+
 #include "starpufftx1d.c"
 #include "starpufftx2d.c"
 

+ 19 - 13
starpufft/starpufftx1d.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -59,7 +59,7 @@
 
 #define STEP_TAG_1D(plan, step, i) _STEP_TAG(plan, step, i)
 
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 /* twist1:
  *
  * Twist the full input vector (first parameter) into one chunk of size n2
@@ -321,14 +321,15 @@ static struct starpu_perfmodel STARPUFFT(twist3_1d_model) = {
 /* codelet pointers for the 5 kinds of tasks */
 static struct starpu_codelet STARPUFFT(twist1_1d_codelet) = {
 	.where =
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 		STARPU_CUDA|
 #endif
 		STARPU_CPU,
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 	.cuda_funcs = {STARPUFFT(twist1_1d_kernel_gpu), NULL},
 #endif
 	.cpu_funcs = {STARPUFFT(twist1_1d_kernel_cpu), NULL},
+	CAN_EXECUTE
 	.model = &STARPUFFT(twist1_1d_model),
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_W}
@@ -336,19 +337,20 @@ static struct starpu_codelet STARPUFFT(twist1_1d_codelet) = {
 
 static struct starpu_codelet STARPUFFT(fft1_1d_codelet) = {
 	.where =
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 		STARPU_CUDA|
 #endif
 #ifdef STARPU_HAVE_FFTW
 		STARPU_CPU|
 #endif
 		0,
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 	.cuda_funcs = {STARPUFFT(fft1_1d_kernel_gpu), NULL},
 #endif
 #ifdef STARPU_HAVE_FFTW
 	.cpu_funcs = {STARPUFFT(fft1_1d_kernel_cpu), NULL},
 #endif
+	CAN_EXECUTE
 	.model = &STARPUFFT(fft1_1d_model),
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_W, STARPU_R}
@@ -357,6 +359,7 @@ static struct starpu_codelet STARPUFFT(fft1_1d_codelet) = {
 static struct starpu_codelet STARPUFFT(twist2_1d_codelet) = {
 	.where = STARPU_CPU,
 	.cpu_funcs = {STARPUFFT(twist2_1d_kernel_cpu), NULL},
+	CAN_EXECUTE
 	.model = &STARPUFFT(twist2_1d_model),
 	.nbuffers = 1,
 	.modes = {STARPU_W}
@@ -364,19 +367,20 @@ static struct starpu_codelet STARPUFFT(twist2_1d_codelet) = {
 
 static struct starpu_codelet STARPUFFT(fft2_1d_codelet) = {
 	.where =
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 		STARPU_CUDA|
 #endif
 #ifdef STARPU_HAVE_FFTW
 		STARPU_CPU|
 #endif
 		0,
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 	.cuda_funcs = {STARPUFFT(fft2_1d_kernel_gpu), NULL},
 #endif
 #ifdef STARPU_HAVE_FFTW
 	.cpu_funcs = {STARPUFFT(fft2_1d_kernel_cpu), NULL},
 #endif
+	CAN_EXECUTE
 	.model = &STARPUFFT(fft2_1d_model),
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_W}
@@ -385,6 +389,7 @@ static struct starpu_codelet STARPUFFT(fft2_1d_codelet) = {
 static struct starpu_codelet STARPUFFT(twist3_1d_codelet) = {
 	.where = STARPU_CPU,
 	.cpu_funcs = {STARPUFFT(twist3_1d_kernel_cpu), NULL},
+	CAN_EXECUTE
 	.model = &STARPUFFT(twist3_1d_model),
 	.nbuffers = 1,
 	.modes = {STARPU_R}
@@ -396,7 +401,7 @@ static struct starpu_codelet STARPUFFT(twist3_1d_codelet) = {
  *
  */
 
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 /* Perform one fft of size n */
 static void
 STARPUFFT(fft_1d_plan_gpu)(void *args)
@@ -456,19 +461,20 @@ static struct starpu_perfmodel STARPUFFT(fft_1d_model) = {
 
 static struct starpu_codelet STARPUFFT(fft_1d_codelet) = {
 	.where =
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 		STARPU_CUDA|
 #endif
 #ifdef STARPU_HAVE_FFTW
 		STARPU_CPU|
 #endif
 		0,
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 	.cuda_funcs = {STARPUFFT(fft_1d_kernel_gpu), NULL},
 #endif
 #ifdef STARPU_HAVE_FFTW
 	.cpu_funcs = {STARPUFFT(fft_1d_kernel_cpu), NULL},
 #endif
+	CAN_EXECUTE
 	.model = &STARPUFFT(fft_1d_model),
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_W}
@@ -495,7 +501,7 @@ STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
 	struct starpu_task *task;
 
 if (PARALLEL) {
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 	/* cufft 1D limited to 8M elements */
 	while (n2 > 8 << 20) {
 		n1 *= 2;
@@ -589,7 +595,7 @@ if (PARALLEL) {
 			break;
 		}
 	}
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 if (PARALLEL) {
 	starpu_execute_on_each_worker(STARPUFFT(fft1_1d_plan_gpu), plan, STARPU_CUDA);
 	starpu_execute_on_each_worker(STARPUFFT(fft2_1d_plan_gpu), plan, STARPU_CUDA);

+ 20 - 14
starpufft/starpufftx2d.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,7 +23,7 @@
 
 #define STEP_TAG_2D(plan, step, i, j) _STEP_TAG(plan, step, ((starpu_tag_t) i << I_SHIFT) | (starpu_tag_t) j)
 
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 /* Twist the full vector into a n2,m2 chunk */
 static void
 STARPUFFT(twist1_2d_kernel_gpu)(void *descr[], void *_args)
@@ -306,14 +306,15 @@ struct starpu_perfmodel STARPUFFT(twist3_2d_model) = {
 
 static struct starpu_codelet STARPUFFT(twist1_2d_codelet) = {
 	.where =
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 		STARPU_CUDA|
 #endif
 		STARPU_CPU,
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 	.cuda_funcs = {STARPUFFT(twist1_2d_kernel_gpu), NULL},
 #endif
 	.cpu_funcs = {STARPUFFT(twist1_2d_kernel_cpu), NULL},
+	CAN_EXECUTE
 	.model = &STARPUFFT(twist1_2d_model),
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_W}
@@ -321,19 +322,20 @@ static struct starpu_codelet STARPUFFT(twist1_2d_codelet) = {
 
 static struct starpu_codelet STARPUFFT(fft1_2d_codelet) = {
 	.where =
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 		STARPU_CUDA|
 #endif
 #ifdef STARPU_HAVE_FFTW
 		STARPU_CPU|
 #endif
 		0,
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 	.cuda_funcs = {STARPUFFT(fft1_2d_kernel_gpu), NULL},
 #endif
 #ifdef STARPU_HAVE_FFTW
 	.cpu_funcs = {STARPUFFT(fft1_2d_kernel_cpu), NULL},
 #endif
+	CAN_EXECUTE
 	.model = &STARPUFFT(fft1_2d_model),
 	.nbuffers = 4,
 	.modes = {STARPU_R, STARPU_W, STARPU_R, STARPU_R}
@@ -342,6 +344,7 @@ static struct starpu_codelet STARPUFFT(fft1_2d_codelet) = {
 static struct starpu_codelet STARPUFFT(twist2_2d_codelet) = {
 	.where = STARPU_CPU,
 	.cpu_funcs = {STARPUFFT(twist2_2d_kernel_cpu), NULL},
+	CAN_EXECUTE
 	.model = &STARPUFFT(twist2_2d_model),
 	.nbuffers = 1,
 	.modes = {STARPU_W}
@@ -349,19 +352,20 @@ static struct starpu_codelet STARPUFFT(twist2_2d_codelet) = {
 
 static struct starpu_codelet STARPUFFT(fft2_2d_codelet) = {
 	.where =
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 		STARPU_CUDA|
 #endif
 #ifdef STARPU_HAVE_FFTW
 		STARPU_CPU|
 #endif
 		0,
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 	.cuda_funcs = {STARPUFFT(fft2_2d_kernel_gpu), NULL},
 #endif
 #ifdef STARPU_HAVE_FFTW
 	.cpu_funcs = {STARPUFFT(fft2_2d_kernel_cpu), NULL},
 #endif
+	CAN_EXECUTE
 	.model = &STARPUFFT(fft2_2d_model),
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_W}
@@ -370,6 +374,7 @@ static struct starpu_codelet STARPUFFT(fft2_2d_codelet) = {
 static struct starpu_codelet STARPUFFT(twist3_2d_codelet) = {
 	.where = STARPU_CPU,
 	.cpu_funcs = {STARPUFFT(twist3_2d_kernel_cpu), NULL},
+	CAN_EXECUTE
 	.model = &STARPUFFT(twist3_2d_model),
 	.nbuffers = 1,
 	.modes = {STARPU_R}
@@ -381,7 +386,7 @@ static struct starpu_codelet STARPUFFT(twist3_2d_codelet) = {
  *
  */
 
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 /* Perform one fft of size n,m */
 static void
 STARPUFFT(fft_2d_plan_gpu)(void *args)
@@ -442,19 +447,20 @@ static struct starpu_perfmodel STARPUFFT(fft_2d_model) = {
 
 static struct starpu_codelet STARPUFFT(fft_2d_codelet) = {
 	.where =
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 		STARPU_CUDA|
 #endif
 #ifdef STARPU_HAVE_FFTW
 		STARPU_CPU|
 #endif
 		0,
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 	.cuda_funcs = {STARPUFFT(fft_2d_kernel_gpu), NULL},
 #endif
 #ifdef STARPU_HAVE_FFTW
 	.cpu_funcs = {STARPUFFT(fft_2d_kernel_cpu), NULL},
 #endif
+	CAN_EXECUTE
 	.model = &STARPUFFT(fft_2d_model),
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_W}
@@ -485,7 +491,7 @@ if (PARALLEL) {
 	 * - twist3: twist back into output
 	 */
 
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 	/* cufft 2D-3D limited to [2,16384] */
 	while (n2 > 16384) {
 		n1 *= 2;
@@ -496,7 +502,7 @@ if (PARALLEL) {
 	STARPU_ASSERT(n1 < (1ULL << J_BITS));
 
 
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 	/* cufft 2D-3D limited to [2,16384] */
 	while (m2 > 16384) {
 		m1 *= 2;
@@ -591,7 +597,7 @@ if (PARALLEL) {
 			break;
 		}
 	}
-#ifdef STARPU_USE_CUDA
+#ifdef __STARPU_USE_CUDA
 if (PARALLEL) {
 	starpu_execute_on_each_worker(STARPUFFT(fft1_2d_plan_gpu), plan, STARPU_CUDA);
 	starpu_execute_on_each_worker(STARPUFFT(fft2_2d_plan_gpu), plan, STARPU_CUDA);