16 년 전 · 76f2fd6053
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -165,6 +165,12 @@ examplebin_PROGRAMS +=				\
 
				 pi_pi_SOURCES =					\
			
 
				 	pi/pi.c
			
 
				 
			
 
				+if STARPU_USE_CUDA
			
 
				+pi_pi_SOURCES +=				\
			
 
				+	pi/pi_kernel.cu
			
 
				+endif
			
 
				+
			
 
				+
			
 
				 ################
			
 
				 # AXPY example #
			
 
				 ################
			
--- a/examples/pi/pi.c
+++ b/examples/pi/pi.c
@@ -14,15 +14,11 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-#include <starpu.h>
			
 
				-#include <stdio.h>
			
 
				+#include "pi.h"
			
 
				 
			
 
				-#define NTASKS	(32*1024)
			
 
				-#define NSHOT_PER_TASK	1024
			
 
				-
			
 
				-#define SIZE	(NTASKS*NSHOT_PER_TASK)
			
 
				-
			
 
				-#define TYPE	float
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+void cuda_kernel(void **descr, void *cl_arg);
			
 
				+#endif
			
 
				 
			
 
				 static void cpu_kernel(void *descr[], void *cl_arg)
			
 
				 {
			
@@ -95,9 +91,9 @@ int main(int argc, char **argv)
 
				 	starpu_partition_data(cnt_array_handle, &f);
			
 
				 
			
 
				 	struct starpu_codelet_t cl = {
			
 
				-		.where = STARPU_CPU,
			
 
				+		.where = STARPU_CPU|STARPU_CUDA,
			
 
				 		.cpu_func = cpu_kernel,
			
 
				-		.cuda_func = NULL, /* TODO */
			
 
				+		.cuda_func = cuda_kernel,
			
 
				 		.nbuffers = 3,
			
 
				 		.model = NULL /* TODO */
			
 
				 	};
			
--- a/examples/pi/pi.h
+++ b/examples/pi/pi.h
@@ -0,0 +1,32 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __PI_H__
			
 
				+#define __PI_H__
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <stdio.h>
			
 
				+
			
 
				+#define NTASKS	(32*1024)
			
 
				+#define NSHOT_PER_TASK	1024
			
 
				+
			
 
				+#define SIZE	(NTASKS*NSHOT_PER_TASK)
			
 
				+
			
 
				+#define TYPE	float
			
 
				+
			
 
				+//extern "C" void cuda_kernel(void *descr[], void *cl_arg);
			
 
				+
			
 
				+#endif // __PI_H__
			
--- a/examples/pi/pi_kernel.cu
+++ b/examples/pi/pi_kernel.cu
@@ -0,0 +1,89 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "pi.h"
			
 
				+
			
 
				+#define BLOCK_SIZE	256
			
 
				+
			
 
				+static __global__ void monte_carlo(TYPE *random_numbers_x, TYPE *random_numbers_y,
			
 
				+						unsigned n, unsigned *output_cnt)
			
 
				+{
			
 
				+	__shared__ unsigned scnt[BLOCK_SIZE];
			
 
				+
			
 
				+	const int tid = threadIdx.x + blockIdx.x*blockDim.x;
			
 
				+
			
 
				+	/* Blank the shared mem buffer */
			
 
				+	if (threadIdx.x < 32)
			
 
				+		scnt[threadIdx.x] = 0;
			
 
				+
			
 
				+	__syncthreads();
			
 
				+
			
 
				+	/* Do we have a successful shot ? */
			
 
				+	TYPE x = random_numbers_x[tid];
			
 
				+	TYPE y = random_numbers_y[tid];
			
 
				+	TYPE dist = (x*x + y*y);
			
 
				+
			
 
				+	scnt[threadIdx.x] = (dist <= 1.0f);
			
 
				+
			
 
				+	__syncthreads();
			
 
				+
			
 
				+	/* XXX that's totally unoptimized : we should do a reduction ! */
			
 
				+	if (threadIdx.x == 0)
			
 
				+	{
			
 
				+		unsigned total_cnt = 0;
			
 
				+		unsigned i;
			
 
				+		for (i = 0; i < BLOCK_SIZE; i++)
			
 
				+			total_cnt += scnt[i];
			
 
				+
			
 
				+		output_cnt[blockIdx.x] = total_cnt;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static __global__ void sum_per_block_cnt(unsigned previous_nblocks,
			
 
				+						unsigned *output_cnt,
			
 
				+						unsigned *cnt)
			
 
				+{
			
 
				+	/* XXX that's totally unoptimized yet : we should do a reduction ! */
			
 
				+	if (threadIdx.x == 0)
			
 
				+	{
			
 
				+		unsigned total_cnt = 0;
			
 
				+		unsigned i;
			
 
				+		for (i = 0; i < previous_nblocks; i++)
			
 
				+			total_cnt += output_cnt[i];
			
 
				+
			
 
				+		*cnt = total_cnt;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+extern "C" void cuda_kernel(void *descr[], void *cl_arg)
			
 
				+{
			
 
				+	TYPE *random_numbers_x = (TYPE *)STARPU_GET_VECTOR_PTR(descr[0]);
			
 
				+	TYPE *random_numbers_y = (TYPE *)STARPU_GET_VECTOR_PTR(descr[1]);
			
 
				+	unsigned nx = STARPU_GET_VECTOR_NX(descr[0]);
			
 
				+
			
 
				+	unsigned *cnt = (unsigned *)STARPU_GET_VECTOR_PTR(descr[2]);
			
 
				+	
			
 
				+	unsigned *per_block_cnt;
			
 
				+	cudaMalloc((void **)&per_block_cnt, (nx/BLOCK_SIZE)*sizeof(unsigned));
			
 
				+
			
 
				+	monte_carlo<<<nx/BLOCK_SIZE, BLOCK_SIZE>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
			
 
				+	cudaThreadSynchronize();
			
 
				+
			
 
				+	sum_per_block_cnt<<<1, 32>>>(nx/BLOCK_SIZE, per_block_cnt, cnt);
			
 
				+	cudaThreadSynchronize();
			
 
				+
			
 
				+	cudaFree(per_block_cnt);
			
 
				+}