Browse Source

make nshot_per_task a variable, add a performance model to pi_redux too. Reference the pi example from the documentation

Samuel Thibault 12 years ago
parent
commit
0c092d595b
5 changed files with 85 additions and 41 deletions
  1. 3 1
      doc/chapters/advanced-examples.texi
  2. 41 24
      examples/pi/pi.c
  3. 1 3
      examples/pi/pi.h
  4. 3 2
      examples/pi/pi_kernel.cu
  5. 37 11
      examples/pi/pi_redux.c

+ 3 - 1
doc/chapters/advanced-examples.texi

@@ -421,7 +421,9 @@ default. The @code{size_base} field of @code{struct starpu_perfmodel} however
 permits the application to override that, when for instance some of the data
 do not matter for task cost (e.g. mere reference table), or when using sparse
 structures (in which case it is the number of non-zeros which matter), or when
-there is some hidden parameter such as the number of iterations, etc.
+there is some hidden parameter such as the number of iterations, etc. The
+@code{examples/pi} examples use this to include the number of iterations in the
+base.
 
 How to use schedulers which can benefit from such performance model is explained
 in @ref{Task scheduling policy}.

+ 41 - 24
examples/pi/pi.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -30,10 +30,12 @@ void cuda_kernel(void **descr, void *cl_arg);
 /* default value */
 static unsigned ntasks = 1024;
 
+static unsigned long long nshot_per_task = 16*1024*1024ULL;
+
 static void cpu_kernel(void *descr[], void *cl_arg)
 {
 	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
-	unsigned nx = NSHOT_PER_TASK;
+	unsigned nx = nshot_per_task;
 
 	TYPE *random_numbers = malloc(2*nx*sizeof(TYPE));
 	sobolCPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
@@ -64,7 +66,7 @@ static void cpu_kernel(void *descr[], void *cl_arg)
 /* The amount of work does not depend on the data size at all :) */
 static size_t size_base(struct starpu_task *task, unsigned nimpl)
 {
-	return NSHOT_PER_TASK;
+	return nshot_per_task;
 }
 
 static void parse_args(int argc, char **argv)
@@ -77,9 +79,42 @@ static void parse_args(int argc, char **argv)
 			char *argptr;
 			ntasks = strtol(argv[++i], &argptr, 10);
 		}
+
+		if (strcmp(argv[i], "-nshot") == 0)
+		{
+			char *argptr;
+			nshot_per_task = strtol(argv[++i], &argptr, 10);
+		}
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
+		{
+			fprintf(stderr,"Usage: %s [options...]\n", argv[0]);
+			fprintf(stderr,"\n");
+			fprintf(stderr,"Options:\n");
+			fprintf(stderr,"-ntasks <n>		select the number of tasks\n");
+			fprintf(stderr,"-nshot <n>		select the number of shot per task\n");
+			exit(0);
+		}
 	}
 }
 
+static struct starpu_perfmodel model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.size_base = size_base,
+	.symbol = "monte_carlo_pi"
+};
+
+static struct starpu_codelet pi_cl =
+{
+	.cpu_funcs = {cpu_kernel, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {cuda_kernel, NULL},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W},
+	.model = &model
+};
+
 int main(int argc, char **argv)
 {
 	unsigned i;
@@ -120,24 +155,6 @@ int main(int argc, char **argv)
 	
 	starpu_data_partition(cnt_array_handle, &f);
 
-	static struct starpu_perfmodel model =
-	{
-		.type = STARPU_HISTORY_BASED,
-		.size_base = size_base,
-		.symbol = "monte_carlo_pi"
-	};
-
-	struct starpu_codelet cl =
-	{
-		.cpu_funcs = {cpu_kernel, NULL},
-#ifdef STARPU_USE_CUDA
-		.cuda_funcs = {cuda_kernel, NULL},
-#endif
-		.nbuffers = 2,
-		.modes = {STARPU_R, STARPU_W},
-		.model = &model
-	};
-
 	struct timeval start;
 	struct timeval end;
 
@@ -147,7 +164,7 @@ int main(int argc, char **argv)
 	{
 		struct starpu_task *task = starpu_task_create();
 
-		task->cl = &cl;
+		task->cl = &pi_cl;
 
 		STARPU_ASSERT(starpu_data_get_sub_data(cnt_array_handle, 1, i));
 
@@ -174,14 +191,14 @@ int main(int argc, char **argv)
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
-	unsigned long total_shot_cnt = ntasks * NSHOT_PER_TASK;
+	unsigned long total_shot_cnt = ntasks * nshot_per_task;
 
 	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4, probability to impact the disk: pi/4 */
 	FPRINTF(stderr, "Pi approximation : %f (%ld / %ld)\n", ((TYPE)total_cnt*4)/(total_shot_cnt), total_cnt, total_shot_cnt);
 	FPRINTF(stderr, "Total time : %f ms\n", timing/1000.0);
 	FPRINTF(stderr, "Speed : %f GShot/s\n", total_shot_cnt/(1e3*timing));
 
-	if (!getenv("STARPU_SSILENT")) starpu_display_codelet_stats(&cl);
+	if (!getenv("STARPU_SSILENT")) starpu_display_codelet_stats(&pi_cl);
 
 	starpu_shutdown();
 

+ 1 - 3
examples/pi/pi.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -21,8 +21,6 @@
 #include <starpu.h>
 #include <stdio.h>
 
-#define NSHOT_PER_TASK	(16*1024*1024ULL)
-
 #define TYPE	float
 
 /* extern "C" void cuda_kernel(void *descr[], void *cl_arg); */

+ 3 - 2
examples/pi/pi_kernel.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -102,7 +102,8 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 	cudaError_t cures;
 
 	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
-	unsigned nx = NSHOT_PER_TASK;
+	unsigned long long *nshot_per_task = (unsigned long long *) cl_arg;
+	unsigned nx = *nshot_per_task;
 
 	/* Generate Random numbers */
 	float *random_numbers;

+ 37 - 11
examples/pi/pi_redux.c

@@ -30,7 +30,7 @@
 #include <curand.h>
 #endif
 
-#define NSHOT_PER_TASK	(1024*1024)
+static unsigned long long nshot_per_task = 16*1024*1024ULL;
 
 /* default value */
 static unsigned long ntasks = 1024;
@@ -92,6 +92,12 @@ static void init_rng(void *arg __attribute__((unused)))
 	}
 }
 
+/* The amount of work does not depend on the data size at all :) */
+static size_t size_base(struct starpu_task *task, unsigned nimpl)
+{
+	return nshot_per_task;
+}
+
 static void parse_args(int argc, char **argv)
 {
 	int i;
@@ -103,6 +109,12 @@ static void parse_args(int argc, char **argv)
 			ntasks = strtol(argv[++i], &argptr, 10);
 		}
 
+		if (strcmp(argv[i], "-nshot") == 0)
+		{
+			char *argptr;
+			nshot_per_task = strtol(argv[++i], &argptr, 10);
+		}
+
 		if (strcmp(argv[i], "-noredux") == 0)
 		{
 			use_redux = 0;
@@ -114,7 +126,7 @@ static void parse_args(int argc, char **argv)
 			ntasks_warmup = 8; /* arbitrary number of warmup tasks */
 		}
 
-		if (strcmp(argv[i], "-h") == 0)
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
 		{
 			fprintf(stderr, "Usage: %s [-ntasks n] [-noredux] [-warmup] [-h]\n", argv[0]);
 			exit(-1);
@@ -139,8 +151,8 @@ static void pi_func_cpu(void *descr[], void *cl_arg __attribute__ ((unused)))
 	unsigned long local_cnt = 0;
 
 	/* Fill the scratchpad with random numbers */
-	int i;
-	for (i = 0; i < NSHOT_PER_TASK; i++)
+	unsigned i;
+	for (i = 0; i < nshot_per_task; i++)
 	{
 		double randx, randy;
 
@@ -176,17 +188,24 @@ static void pi_func_cuda(void *descr[], void *cl_arg __attribute__ ((unused)))
 	/* Fill the scratchpad with random numbers. Note that both x and y
 	 * arrays are stored in the same vector. */
 	float *scratchpad_xy = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
-	res = curandGenerateUniform(curandgens[workerid], scratchpad_xy, 2*NSHOT_PER_TASK);
+	res = curandGenerateUniform(curandgens[workerid], scratchpad_xy, 2*nshot_per_task);
 	STARPU_ASSERT(res == CURAND_STATUS_SUCCESS);
 
 	float *x = &scratchpad_xy[0];
-	float *y = &scratchpad_xy[NSHOT_PER_TASK];
+	float *y = &scratchpad_xy[nshot_per_task];
 
 	unsigned long *shot_cnt = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
-	pi_redux_cuda_kernel(x, y, NSHOT_PER_TASK, shot_cnt);
+	pi_redux_cuda_kernel(x, y, nshot_per_task, shot_cnt);
 }
 #endif
 
+static struct starpu_perfmodel pi_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.size_base = size_base,
+	.symbol = "monte_carlo_pi_scratch"
+};
+
 static struct starpu_codelet pi_cl =
 {
 	.cpu_funcs = {pi_func_cpu, NULL},
@@ -195,7 +214,14 @@ static struct starpu_codelet pi_cl =
 #endif
 	.nbuffers = 2,
 	.modes    = {STARPU_SCRATCH, STARPU_RW},
-	.model = NULL
+	.model = &pi_model
+};
+
+static struct starpu_perfmodel pi_model_redux =
+{
+	.type = STARPU_HISTORY_BASED,
+	.size_base = size_base,
+	.symbol = "monte_carlo_pi_scratch_redux"
 };
 
 static struct starpu_codelet pi_cl_redux =
@@ -206,7 +232,7 @@ static struct starpu_codelet pi_cl_redux =
 #endif
 	.nbuffers = 2,
 	.modes    = {STARPU_SCRATCH, STARPU_REDUX},
-	.model = NULL
+	.model = &pi_model_redux
 };
 
 /*
@@ -297,7 +323,7 @@ int main(int argc, char **argv)
 	/* Create a scratchpad data */
 	starpu_data_handle_t xy_scratchpad_handle;
 	starpu_vector_data_register(&xy_scratchpad_handle, -1, (uintptr_t)NULL,
-		2*NSHOT_PER_TASK, sizeof(float));
+		2*nshot_per_task, sizeof(float));
 
 	/* Create a variable that will be used to count the number of shots
 	 * that actually hit the unit circle when shooting randomly in
@@ -349,7 +375,7 @@ int main(int argc, char **argv)
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4,
 	 * probability to impact the disk: pi/4 */
-	unsigned long total = (ntasks + ntasks_warmup)*NSHOT_PER_TASK;
+	unsigned long total = (ntasks + ntasks_warmup)*nshot_per_task;
 	double pi_approx = ((double)shot_cnt*4.0)/total;
 
 	FPRINTF(stderr, "Reductions? %s\n", use_redux?"yes":"no");