浏览代码

bandwidth: add comparison with nop-based and sync-based sleeping

Samuel Thibault 5 年之前
父节点
当前提交
9fabb0f264
共有 2 个文件被更改，包括 146 次插入和 33 次删除
  1. 143 32
      tests/microbenchs/bandwidth.c
  2. 3 1
      tests/microbenchs/bandwidth_scheds.sh

+ 143 - 32
tests/microbenchs/bandwidth.c

@@ -37,22 +37,38 @@ static unsigned cpustep = 1;
 static unsigned noalone = 0;
 static unsigned iter = 30;
 static unsigned total_ncpus;
-static starpu_pthread_barrier_t barrier;
+static starpu_pthread_barrier_t barrier_begin, barrier_end;
 static float *result;
-static void **buffers;
+static void **buffers;	/* Indexed by logical core number */
+static char padding1[STARPU_CACHELINE_SIZE];
+static volatile char finished;
+static char padding2[STARPU_CACHELINE_SIZE];
 
+static unsigned interleave(unsigned i);
+
+/* Initialize the buffer locally */
+void initialize_buffer(void *foo)
+{
+	unsigned id = starpu_worker_get_id();
+#ifdef STARPU_HAVE_POSIX_MEMALIGN
+	int ret = posix_memalign(&buffers[id], getpagesize(), 2*size);
+	STARPU_ASSERT(ret == 0);
+#else
+	buffers[id] = malloc(2*size);
+#endif
+	memset(buffers[id], 0, 2*size);
+}
+
+/* Actual transfer codelet */
 void bw_func(void *descr[], void *arg)
 {
-	void *src = buffers[starpu_worker_get_id()];
+	int id = (uintptr_t) arg;
+	void *src = buffers[id];
 	void *dst = (void*) ((uintptr_t)src + size);
 	unsigned i;
 	double start, stop;
-	int ret;
 
-	memset(src, 0, size);
-	memset(dst, 0, size);
-
-	STARPU_PTHREAD_BARRIER_WAIT(&barrier);
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
 	start = starpu_timing_now();
 	for (i = 0; i < iter; i++)
 	{
@@ -60,9 +76,10 @@ void bw_func(void *descr[], void *arg)
 		STARPU_SYNCHRONIZE();
 	}
 	stop = starpu_timing_now();
-	STARPU_PTHREAD_BARRIER_WAIT(&barrier);
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_end);
+	finished = 1;
 
-	result[starpu_worker_get_id()] = (size*iter) / (stop - start);
+	result[id] = (size*iter) / (stop - start);
 }
 
 static struct starpu_codelet bw_codelet =
@@ -72,6 +89,44 @@ static struct starpu_codelet bw_codelet =
 	.nbuffers = 0,
 };
 
+/* Codelet that waits for completion while doing lots of cpu yields (nop). */
+void nop_func(void *descr[], void *arg)
+{
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
+	while (!finished)
+	{
+		unsigned i;
+		for (i = 0; i < 1000000; i++)
+			STARPU_UYIELD();
+		STARPU_SYNCHRONIZE();
+	}
+}
+
+static struct starpu_codelet nop_codelet =
+{
+	.cpu_funcs = {nop_func},
+	.model = NULL,
+	.nbuffers = 0,
+};
+
+/* Codelet that waits for completion while aggressively reading the finished variable. */
+void sync_func(void *descr[], void *arg)
+{
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
+	while (!finished)
+	{
+		STARPU_VALGRIND_YIELD();
+		STARPU_SYNCHRONIZE();
+	}
+}
+
+static struct starpu_codelet sync_codelet =
+{
+	.cpu_funcs = {sync_func},
+	.model = NULL,
+	.nbuffers = 0,
+};
+
 static void usage(char **argv)
 {
 	fprintf(stderr, "Usage: %s [-n iter] [-s size (MB)] [-i increment] [-a]\n", argv[0]);
@@ -115,7 +170,14 @@ static unsigned interleave(unsigned i)
 		return 0;
 }
 
-static float bench(int *argc, char ***argv, unsigned nbusy, unsigned ncpus, int intl)
+enum sleep_type {
+	PAUSE,
+	NOP,
+	SYNC,
+	SCHED,
+};
+
+static float bench(int *argc, char ***argv, unsigned nbusy, unsigned ncpus, int intl, enum sleep_type sleep)
 {
 	int ret;
 	unsigned i;
@@ -130,7 +192,7 @@ static float bench(int *argc, char ***argv, unsigned nbusy, unsigned ncpus, int
 	conf.nmpi_ms = 0;
 	conf.ncpus = ncpus;
 
-	if (intl && nbusy == ncpus)
+	if (intl && sleep == PAUSE)
 	{
 		conf.use_explicit_workers_bindid = 1;
 		for (i = 0; i < ncpus; i++)
@@ -141,27 +203,72 @@ static float bench(int *argc, char ***argv, unsigned nbusy, unsigned ncpus, int
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	STARPU_PTHREAD_BARRIER_INIT(&barrier, NULL, nbusy);
+	if (sleep == PAUSE || sleep == SCHED)
+		/* In these cases we don't have a task on each cpu */
+		STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, nbusy);
+	else
+		STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, ncpus);
+
+	STARPU_PTHREAD_BARRIER_INIT(&barrier_end, NULL, nbusy);
+
+	finished = 0;
+	for (i = 0; i < ncpus; i++)
+		result[i] = NAN;
 
 	for (i = 0; i < nbusy; i++)
 	{
 		struct starpu_task *task = starpu_task_create();
 		task->cl = &bw_codelet;
+
+		if (intl)
+			task->cl_arg = (void*) (uintptr_t) interleave(i);
+		else
+			task->cl_arg = (void*) (uintptr_t) i;
+
 		task->execute_on_a_specific_worker = 1;
-		if (intl && nbusy != ncpus)
+		if (intl && sleep != PAUSE) /* In the pause case we interleaved above */
 			task->workerid = interleave(i);
 		else
 			task->workerid = i;
+
 		ret = starpu_task_submit(task);
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
+	if (sleep != PAUSE && sleep != SCHED)
+	{
+		/* Add waiting tasks */
+		for ( ; i < ncpus; i++)
+		{
+			struct starpu_task *task = starpu_task_create();
+			switch (sleep)
+			{
+			case NOP:
+				task->cl = &nop_codelet;
+				break;
+			case SYNC:
+				task->cl = &sync_codelet;
+				break;
+			default:
+				STARPU_ASSERT(0);
+			}
+			task->execute_on_a_specific_worker = 1;
+			task->workerid = interleave(i);
+			ret = starpu_task_submit(task);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		}
+	}
+
+
 	starpu_task_wait_for_all();
 	starpu_shutdown();
 
 	for (bw = 0., i = 0; i < nbusy; i++)
 	{
-		bw += result[i];
+		if (intl)
+			bw += result[interleave(i)];
+		else
+			bw += result[i];
 	}
 	return bw;
 }
@@ -171,7 +278,7 @@ int main(int argc, char **argv)
 	int ret;
 	unsigned n;
 	struct starpu_conf conf;
-	float alone, alone_int, idle, idle_int;
+	float alone, alone_int, alone_int_nop, alone_int_sync, sched, sched_int;
 
 	parse_args(argc, argv);
 
@@ -186,39 +293,43 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	total_ncpus = starpu_cpu_worker_get_count();
+
+	buffers = malloc(total_ncpus * sizeof(*buffers));
+	starpu_execute_on_each_worker_ex(initialize_buffer, NULL, STARPU_CPU, "init_buffer");
 	starpu_shutdown();
 
 	if (total_ncpus == 0)
 		return STARPU_TEST_SKIPPED;
 
 	result = malloc(total_ncpus * sizeof(result[0]));
-	buffers = malloc(total_ncpus * sizeof(*buffers));
-	for (n = 0; n < total_ncpus; n++)
-	{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-		ret = posix_memalign(&buffers[n], getpagesize(), 2*size);
-		STARPU_ASSERT(ret == 0);
-#else
-		buffers[n] = malloc(2*size);
-#endif
-	}
 
-	printf("# nw\talone\t\t+idle\t\tefficiency\talone int.l\t+idle int.l\tefficiency\n");
+	printf("# nw\ta comp.\t+sched\teff%%\ta scat.\t+nop\t+sync\t+sched\teff%% vs nop\n");
 	for (n = cpustep; n <= total_ncpus; n += cpustep)
 	{
 		if (noalone)
 		{
 			alone = 0.;
 			alone_int = 0.;
+			alone_int_nop = 0.;
+			alone_int_sync = 0.;
 		}
 		else
 		{
-			alone = bench(&argc, &argv, n, n, 0);
-			alone_int = bench(&argc, &argv, n, n, 1);
+			alone = bench(&argc, &argv, n, n, 0, PAUSE);
+			alone_int = bench(&argc, &argv, n, n, 1, PAUSE);
+			alone_int_nop = bench(&argc, &argv, n, total_ncpus, 1, NOP);
+			alone_int_sync = bench(&argc, &argv, n, total_ncpus, 1, SYNC);
 		}
-		idle = bench(&argc, &argv, n, total_ncpus, 0);
-		idle_int = bench(&argc, &argv, n, total_ncpus, 1);
-		printf("%d\t%f\t%f\t%f\t%f\t%f\t%f\n", n, alone/1000, idle/1000, idle*100/alone, alone_int/1000, idle_int/1000, idle_int*100/alone_int);
+		sched = bench(&argc, &argv, n, total_ncpus, 0, SCHED);
+		sched_int = bench(&argc, &argv, n, total_ncpus, 1, SCHED);
+		printf("%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n",
+				n,
+				alone/1000,
+				sched/1000, sched*100/alone,
+				alone_int/1000,
+				alone_int_nop/1000,
+				alone_int_sync/1000,
+				sched_int/1000, sched_int*100/alone_int_nop);
 		fflush(stdout);
 	}
 

+ 3 - 1
tests/microbenchs/bandwidth_scheds.sh

@@ -46,6 +46,8 @@ set xlabel "ncores"
 
 plot \\
 	"bandwidth-$DEFAULT.dat" using 1:2 with lines title "alone", \\
+	"bandwidth-$DEFAULT.dat" using 1:6 with lines title "nop", \\
+	"bandwidth-$DEFAULT.dat" using 1:7 with lines title "sync", \\
 	"bandwidth-$DEFAULT.dat" using 1:5 with lines title "alone interleave", \\
 EOF
 
@@ -60,7 +62,7 @@ do
 
 	STARPU_BACKOFF_MIN=0 STARPU_BACKOFF_MAX=0 STARPU_SCHED=$sched $STARPU_LAUNCH $(dirname $0)/bandwidth $extra | tee bandwidth-$sched.dat
 	echo "\"bandwidth-$sched.dat\" using 1:3 with linespoints lt $type pt $type title \"$sched\", \\" >> bandwidth.gp
-	echo "\"bandwidth-$sched.dat\" using 1:6 with linespoints lt $type pt $type notitle, \\" >> bandwidth.gp
+	echo "\"bandwidth-$sched.dat\" using 1:8 with linespoints lt $type pt $type notitle, \\" >> bandwidth.gp
 	type=$((type+1))
 done