15 lat temu · bc49f72836
--- a/examples/pi/pi.h
+++ b/examples/pi/pi.h
@@ -20,8 +20,8 @@
 
				 #include <starpu.h>
			
 
				 #include <stdio.h>
			
 
				 
			
 
				-#define NTASKS	(128*1024)
			
 
				-#define NSHOT_PER_TASK	(1024)
			
 
				+#define NTASKS	(64ULL)
			
 
				+#define NSHOT_PER_TASK	(16*1024*1024ULL)
			
 
				 
			
 
				 #define SIZE	(NTASKS*NSHOT_PER_TASK)
			
 
				 
			
--- a/examples/pi/pi_kernel.cu
+++ b/examples/pi/pi_kernel.cu
@@ -16,31 +16,42 @@
 
				 
			
 
				 #include "pi.h"
			
 
				 
			
 
				-#define BLOCK_SIZE	256
			
 
				+#define MAXNBLOCKS	128
			
 
				+#define MAXTHREADSPERBLOCK	256
			
 
				 
			
 
				 static __global__ void monte_carlo(TYPE *random_numbers_x, TYPE *random_numbers_y,
			
 
				 						unsigned n, unsigned *output_cnt)
			
 
				 {
			
 
				-	__shared__ unsigned scnt[BLOCK_SIZE];
			
 
				+	__shared__ unsigned scnt[MAXTHREADSPERBLOCK];
			
 
				 
			
 
				+	/* Do we have a successful shot ? */
			
 
				 	const int tid = threadIdx.x + blockIdx.x*blockDim.x;
			
 
				 
			
 
				+	const int nthreads = gridDim.x * blockDim.x;
			
 
				+
			
 
				 	/* Blank the shared mem buffer */
			
 
				-	if (threadIdx.x < 32)
			
 
				+	if (threadIdx.x < MAXTHREADSPERBLOCK)
			
 
				 		scnt[threadIdx.x] = 0;
			
 
				 
			
 
				 	__syncthreads();
			
 
				+	int ind;
			
 
				+	for (ind = tid; ind < n; ind += nthreads)
			
 
				+	{ 
			
 
				+		TYPE x = random_numbers_x[ind];
			
 
				+		TYPE y = random_numbers_y[ind];
			
 
				+		TYPE dist = (x*x + y*y);
			
 
				 
			
 
				-	/* Do we have a successful shot ? */
			
 
				-	TYPE x = random_numbers_x[tid];
			
 
				-	TYPE y = random_numbers_y[tid];
			
 
				-	TYPE dist = (x*x + y*y);
			
 
				+		unsigned success = (dist <= 1.0f)?1:0;
			
 
				 
			
 
				-	scnt[threadIdx.x] = (dist <= 1.0f);
			
 
				+		scnt[threadIdx.x] += success;
			
 
				+
			
 
				+	}
			
 
				 
			
 
				 	__syncthreads();
			
 
				 
			
 
				 	/* Perform a reduction to compute the sum on each thread within that block */
			
 
				+
			
 
				+	/* NB: We assume that the number of threads per block is a power of 2 ! */
			
 
				 	unsigned s;
			
 
				 	for (s = blockDim.x/2; s!=0; s>>=1)
			
 
				 	{
			
@@ -53,46 +64,73 @@ static __global__ void monte_carlo(TYPE *random_numbers_x, TYPE *random_numbers_
 
				 	/* report the number of successful shots in the block */
			
 
				 	if (threadIdx.x == 0)
			
 
				 		output_cnt[blockIdx.x] = scnt[0];
			
 
				+
			
 
				+	__syncthreads();
			
 
				 }
			
 
				 
			
 
				-static __global__ void sum_per_block_cnt(unsigned previous_nblocks,
			
 
				-						unsigned *output_cnt,
			
 
				-						unsigned *cnt)
			
 
				+static __global__ void sum_per_block_cnt(unsigned *output_cnt, unsigned *cnt)
			
 
				 {
			
 
				-	/* XXX that's totally unoptimized yet : we should do a reduction ! */
			
 
				-	if (threadIdx.x == 0)
			
 
				+	__shared__ unsigned accumulator[MAXNBLOCKS];
			
 
				+
			
 
				+	unsigned i;
			
 
				+
			
 
				+	/* Load the values from global mem */
			
 
				+	for (i = 0; i < blockDim.x; i++)
			
 
				+		accumulator[i] = output_cnt[i];
			
 
				+
			
 
				+	__syncthreads();
			
 
				+
			
 
				+	/* Perform a reduction in shared memory */
			
 
				+	unsigned s;
			
 
				+	for (s = blockDim.x/2; s!=0; s>>=1)
			
 
				 	{
			
 
				-		unsigned total_cnt = 0;
			
 
				-		unsigned i;
			
 
				-		for (i = 0; i < previous_nblocks; i++)
			
 
				-			total_cnt += output_cnt[i];
			
 
				+		if (threadIdx.x < s)
			
 
				+			accumulator[threadIdx.x] += accumulator[threadIdx.x + s];
			
 
				 
			
 
				-		*cnt = total_cnt;
			
 
				+		__syncthreads();
			
 
				 	}
			
 
				+
			
 
				+	/* Save the result in global memory */
			
 
				+	if (threadIdx.x == 0)
			
 
				+		*cnt = accumulator[0];
			
 
				 }
			
 
				 
			
 
				 extern "C" void cuda_kernel(void *descr[], void *cl_arg)
			
 
				 {
			
 
				+	cudaError_t cures;
			
 
				+
			
 
				 	TYPE *random_numbers_x = (TYPE *)STARPU_GET_VECTOR_PTR(descr[0]);
			
 
				 	TYPE *random_numbers_y = (TYPE *)STARPU_GET_VECTOR_PTR(descr[1]);
			
 
				 	unsigned nx = STARPU_GET_VECTOR_NX(descr[0]);
			
 
				 
			
 
				 	unsigned *cnt = (unsigned *)STARPU_GET_VECTOR_PTR(descr[2]);
			
 
				+
			
 
				+	/* How many blocks do we use ? */ 
			
 
				+	unsigned nblocks = 128; // TODO
			
 
				+
			
 
				+	STARPU_ASSERT(nblocks <= MAXNBLOCKS);
			
 
				 	
			
 
				 	unsigned *per_block_cnt;
			
 
				-	cudaMalloc((void **)&per_block_cnt, (nx/BLOCK_SIZE)*sizeof(unsigned));
			
 
				+	cudaMalloc((void **)&per_block_cnt, nblocks*sizeof(unsigned));
			
 
				+
			
 
				+	assert((nx % nblocks) == 0);
			
 
				 
			
 
				-	assert((nx % BLOCK_SIZE) == 0);
			
 
				+	/* How many threads per block ? At most 256, but no more threads than
			
 
				+	 * there are entries to process per block. */
			
 
				+	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
			
 
				 
			
 
				 	/* each entry of per_block_cnt contains the number of successful shots
			
 
				 	 * in the corresponding block. */
			
 
				-	monte_carlo<<<nx/BLOCK_SIZE, BLOCK_SIZE>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
			
 
				-	cudaThreadSynchronize();
			
 
				+	monte_carlo<<<nblocks, nthread_per_block>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
			
 
				+
			
 
				+	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
			
 
				 
			
 
				 	/* compute the total number of successful shots by adding the elements
			
 
				 	 * of the per_block_cnt array */
			
 
				-	sum_per_block_cnt<<<1, 32>>>(nx/BLOCK_SIZE, per_block_cnt, cnt);
			
 
				-	cudaThreadSynchronize();
			
 
				+	sum_per_block_cnt<<<1, nblocks>>>(per_block_cnt, cnt);
			
 
				+	cures = cudaThreadSynchronize();
			
 
				+	if (cures)
			
 
				+		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 	cudaFree(per_block_cnt);
			
 
				 }