|
@@ -37,22 +37,38 @@ static unsigned cpustep = 1;
|
|
|
static unsigned noalone = 0;
|
|
|
static unsigned iter = 30;
|
|
|
static unsigned total_ncpus;
|
|
|
-static starpu_pthread_barrier_t barrier;
|
|
|
+static starpu_pthread_barrier_t barrier_begin, barrier_end; /* barrier_begin: waited on by bw_func, nop_func and sync_func before they start; barrier_end: waited on by bw_func only, after its timed loop */
|
|
|
static float *result;
|
|
|
-static void **buffers;
|
|
|
+static void **buffers; /* Indexed by logical core number; filled by initialize_buffer() under the calling worker's id, read by bw_func() under the index passed in cl_arg */
|
|
|
+static char padding1[STARPU_CACHELINE_SIZE]; /* NOTE(review): presumably meant to keep 'finished' alone on its cache line; C does not guarantee the relative layout of separate static objects */
|
|
|
+static volatile char finished; /* Reset to 0 by bench(), set to 1 by bw_func() after barrier_end, polled by nop_func() and sync_func() */
|
|
|
+static char padding2[STARPU_CACHELINE_SIZE]; /* Trailing counterpart of padding1; same caveat applies */
|
|
|
|
|
|
+static unsigned interleave(unsigned i); /* Forward declaration: bench() calls interleave() for cl_arg, workerid and result indices */
|
|
|
+
|
|
|
+/* Initialize the buffer locally: allocates 2*size bytes (page-aligned when STARPU_HAVE_POSIX_MEMALIGN is defined), stores the pointer in buffers[] at the calling worker's id and zero-fills it. foo is unused. */
|
|
|
+void initialize_buffer(void *foo)
|
|
|
+{
|
|
|
+ unsigned id = starpu_worker_get_id(); /* NOTE(review): stored as unsigned and used as an index without a range check; presumably negative when not called from a worker -- confirm */
|
|
|
+#ifdef STARPU_HAVE_POSIX_MEMALIGN
|
|
|
+ int ret = posix_memalign(&buffers[id], getpagesize(), 2*size); /* alignment is the system page size */
|
|
|
+ STARPU_ASSERT(ret == 0);
|
|
|
+#else
|
|
|
+ buffers[id] = malloc(2*size); /* NOTE(review): result is not checked before the memset below */
|
|
|
+#endif
|
|
|
+ memset(buffers[id], 0, 2*size); /* writes every byte from the calling worker; presumably so pages are first touched on that worker's memory node -- TODO confirm */
|
|
|
+}
|
|
|
+
|
|
|
+/* Actual transfer codelet: waits on barrier_begin, times the iter-iteration loop on buffers[id], waits on barrier_end, sets 'finished' and stores size*iter/(stop-start) in result[id]. arg carries the index by value; descr is unused. */
|
|
|
void bw_func(void *descr[], void *arg)
|
|
|
{
|
|
|
- void *src = buffers[starpu_worker_get_id()];
|
|
|
+ int id = (uintptr_t) arg; /* index passed by value through task->cl_arg in bench() */
|
|
|
+ void *src = buffers[id]; /* first half of the 2*size allocation; dst below is the second half */
|
|
|
void *dst = (void*) ((uintptr_t)src + size);
|
|
|
unsigned i;
|
|
|
double start, stop;
|
|
|
- int ret;
|
|
|
|
|
|
- memset(src, 0, size);
|
|
|
- memset(dst, 0, size);
|
|
|
-
|
|
|
- STARPU_PTHREAD_BARRIER_WAIT(&barrier);
|
|
|
+ STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin); /* start the clock only once every barrier_begin participant has arrived */
|
|
|
start = starpu_timing_now();
|
|
|
for (i = 0; i < iter; i++)
|
|
|
{
|
|
@@ -60,9 +76,10 @@ void bw_func(void *descr[], void *arg)
|
|
|
STARPU_SYNCHRONIZE();
|
|
|
}
|
|
|
stop = starpu_timing_now();
|
|
|
- STARPU_PTHREAD_BARRIER_WAIT(&barrier);
|
|
|
+ STARPU_PTHREAD_BARRIER_WAIT(&barrier_end); /* bench() sizes this barrier to nbusy, i.e. the bw_func tasks only */
|
|
|
+ finished = 1; /* releases the nop_func/sync_func spinners; every bw_func task stores the same value */
|
|
|
|
|
|
- result[starpu_worker_get_id()] = (size*iter) / (stop - start);
|
|
|
+ result[id] = (size*iter) / (stop - start); /* same index bench() later uses when summing result[] */
|
|
|
}
|
|
|
|
|
|
static struct starpu_codelet bw_codelet =
|
|
@@ -72,6 +89,44 @@ static struct starpu_codelet bw_codelet =
|
|
|
.nbuffers = 0,
|
|
|
};
|
|
|
|
|
|
+/* Codelet that waits for completion while doing lots of CPU yields (nop): joins barrier_begin, then issues STARPU_UYIELD() in batches of 1000000 and re-reads 'finished' once per batch until a bw_func task sets it. It does not wait on barrier_end. descr and arg are unused. */
|
|
|
+void nop_func(void *descr[], void *arg)
|
|
|
+{
|
|
|
+ STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin); /* start spinning together with the bw_func tasks */
|
|
|
+ while (!finished)
|
|
|
+ {
|
|
|
+ unsigned i;
|
|
|
+ for (i = 0; i < 1000000; i++) /* batch size bounds how rarely 'finished' is re-read */
|
|
|
+ STARPU_UYIELD();
|
|
|
+ STARPU_SYNCHRONIZE(); /* NOTE(review): presumably a memory barrier before the next read of 'finished' -- confirm against the macro definition */
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+static struct starpu_codelet nop_codelet = /* Assigned by bench() to the waiting tasks when sleep == NOP */
|
|
|
+{
|
|
|
+ .cpu_funcs = {nop_func}, /* only a CPU implementation is listed */
|
|
|
+ .model = NULL, /* no model attached */
|
|
|
+ .nbuffers = 0, /* nop_func never reads descr[] */
|
|
|
+};
|
|
|
+
|
|
|
+/* Codelet that waits for completion while aggressively reading the finished variable: joins barrier_begin, then re-reads 'finished' on every loop iteration (no batching, unlike nop_func) until a bw_func task sets it. It does not wait on barrier_end. descr and arg are unused. */
|
|
|
+void sync_func(void *descr[], void *arg)
|
|
|
+{
|
|
|
+ STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin); /* start polling together with the bw_func tasks */
|
|
|
+ while (!finished)
|
|
|
+ {
|
|
|
+ STARPU_VALGRIND_YIELD(); /* NOTE(review): presumably yields only when running under Valgrind -- confirm */
|
|
|
+ STARPU_SYNCHRONIZE(); /* NOTE(review): presumably a memory barrier issued on every poll -- confirm against the macro definition */
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+static struct starpu_codelet sync_codelet = /* Assigned by bench() to the waiting tasks when sleep == SYNC */
|
|
|
+{
|
|
|
+ .cpu_funcs = {sync_func}, /* only a CPU implementation is listed */
|
|
|
+ .model = NULL, /* no model attached */
|
|
|
+ .nbuffers = 0, /* sync_func never reads descr[] */
|
|
|
+};
|
|
|
+
|
|
|
static void usage(char **argv)
|
|
|
{
|
|
|
fprintf(stderr, "Usage: %s [-n iter] [-s size (MB)] [-i increment] [-a]\n", argv[0]);
|
|
@@ -115,7 +170,14 @@ static unsigned interleave(unsigned i)
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-static float bench(int *argc, char ***argv, unsigned nbusy, unsigned ncpus, int intl)
|
|
|
+enum sleep_type { /* What the CPUs that do not run bw_func do during one bench() call */
|
|
|
+ PAUSE, /* no waiting tasks; main() uses it with ncpus == nbusy, so StarPU is configured with the busy CPUs only */
|
|
|
+ NOP, /* each remaining CPU gets a nop_codelet task */
|
|
|
+ SYNC, /* each remaining CPU gets a sync_codelet task */
|
|
|
+ SCHED, /* no waiting tasks although main() passes total_ncpus; presumably the remaining workers idle in the StarPU scheduler -- confirm */
|
|
|
+};
|
|
|
+
|
|
|
+static float bench(int *argc, char ***argv, unsigned nbusy, unsigned ncpus, int intl, enum sleep_type sleep) /* One measurement: submits nbusy bw_func tasks on a StarPU instance configured with ncpus CPUs and returns the sum of their result[] entries, or STARPU_TEST_SKIPPED on -ENODEV. intl selects interleave() placement; sleep selects what the other CPUs do. NOTE(review): the name 'sleep' would shadow POSIX sleep() if <unistd.h> is in scope. */
|
|
|
{
|
|
|
int ret;
|
|
|
unsigned i;
|
|
@@ -130,7 +192,7 @@ static float bench(int *argc, char ***argv, unsigned nbusy, unsigned ncpus, int
|
|
|
conf.nmpi_ms = 0;
|
|
|
conf.ncpus = ncpus;
|
|
|
|
|
|
- if (intl && nbusy == ncpus)
|
|
|
+ if (intl && sleep == PAUSE) /* PAUSE interleaves through explicit worker binding instead of through task->workerid */
|
|
|
{
|
|
|
conf.use_explicit_workers_bindid = 1;
|
|
|
for (i = 0; i < ncpus; i++)
|
|
@@ -141,27 +203,72 @@ static float bench(int *argc, char ***argv, unsigned nbusy, unsigned ncpus, int
|
|
|
if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
|
|
|
STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
|
|
|
|
|
|
- STARPU_PTHREAD_BARRIER_INIT(&barrier, NULL, nbusy);
|
|
|
+ if (sleep == PAUSE || sleep == SCHED)
|
|
|
+ /* In these cases we don't have a task on each cpu, so only the nbusy bw_func tasks join barrier_begin */
|
|
|
+ STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, nbusy);
|
|
|
+ else
|
|
|
+ STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, ncpus); /* nbusy bw_func tasks plus one waiting task per remaining CPU */
|
|
|
+
|
|
|
+ STARPU_PTHREAD_BARRIER_INIT(&barrier_end, NULL, nbusy); /* only bw_func waits on barrier_end */
|
|
|
+
|
|
|
+ finished = 0; /* re-arm the flag polled by the waiting tasks */
|
|
|
+ for (i = 0; i < ncpus; i++)
|
|
|
+ result[i] = NAN; /* a slot that no bw_func task overwrites turns the returned sum into NaN */
|
|
|
|
|
|
for (i = 0; i < nbusy; i++)
|
|
|
{
|
|
|
struct starpu_task *task = starpu_task_create();
|
|
|
task->cl = &bw_codelet;
|
|
|
+
|
|
|
+ if (intl)
|
|
|
+ task->cl_arg = (void*) (uintptr_t) interleave(i); /* buffer/result index follows the interleaved numbering */
|
|
|
+ else
|
|
|
+ task->cl_arg = (void*) (uintptr_t) i; /* buffer/result index is the task number itself */
|
|
|
+
|
|
|
task->execute_on_a_specific_worker = 1;
|
|
|
- if (intl && nbusy != ncpus)
|
|
|
+ if (intl && sleep != PAUSE) /* In the pause case we interleaved above */
|
|
|
task->workerid = interleave(i);
|
|
|
else
|
|
|
task->workerid = i;
|
|
|
+
|
|
|
ret = starpu_task_submit(task);
|
|
|
STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
|
|
|
}
|
|
|
|
|
|
+ if (sleep != PAUSE && sleep != SCHED)
|
|
|
+ {
|
|
|
+ /* Add waiting tasks: i continues from nbusy, giving one task per remaining CPU */
|
|
|
+ for ( ; i < ncpus; i++)
|
|
|
+ {
|
|
|
+ struct starpu_task *task = starpu_task_create();
|
|
|
+ switch (sleep)
|
|
|
+ {
|
|
|
+ case NOP:
|
|
|
+ task->cl = &nop_codelet;
|
|
|
+ break;
|
|
|
+ case SYNC:
|
|
|
+ task->cl = &sync_codelet;
|
|
|
+ break;
|
|
|
+ default:
|
|
|
+ STARPU_ASSERT(0); /* PAUSE and SCHED are excluded by the enclosing test */
|
|
|
+ }
|
|
|
+ task->execute_on_a_specific_worker = 1;
|
|
|
+ task->workerid = interleave(i); /* NOTE(review): always interleaved, even when intl is 0; main() only passes intl == 1 with NOP/SYNC */
|
|
|
+ ret = starpu_task_submit(task);
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
starpu_task_wait_for_all();
|
|
|
starpu_shutdown();
|
|
|
|
|
|
for (bw = 0., i = 0; i < nbusy; i++)
|
|
|
{
|
|
|
- bw += result[i];
|
|
|
+ if (intl)
|
|
|
+ bw += result[interleave(i)]; /* must mirror the cl_arg chosen at submission */
|
|
|
+ else
|
|
|
+ bw += result[i]; /* must mirror the cl_arg chosen at submission */
|
|
|
}
|
|
|
return bw;
|
|
|
}
|
|
@@ -171,7 +278,7 @@ int main(int argc, char **argv)
|
|
|
int ret;
|
|
|
unsigned n;
|
|
|
struct starpu_conf conf;
|
|
|
- float alone, alone_int, idle, idle_int;
|
|
|
+ float alone, alone_int, alone_int_nop, alone_int_sync, sched, sched_int;
|
|
|
|
|
|
parse_args(argc, argv);
|
|
|
|
|
@@ -186,39 +293,43 @@ int main(int argc, char **argv)
|
|
|
if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
|
|
|
STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
|
|
|
total_ncpus = starpu_cpu_worker_get_count();
|
|
|
+
|
|
|
+ buffers = malloc(total_ncpus * sizeof(*buffers));
|
|
|
+ starpu_execute_on_each_worker_ex(initialize_buffer, NULL, STARPU_CPU, "init_buffer");
|
|
|
starpu_shutdown();
|
|
|
|
|
|
if (total_ncpus == 0)
|
|
|
return STARPU_TEST_SKIPPED;
|
|
|
|
|
|
result = malloc(total_ncpus * sizeof(result[0]));
|
|
|
- buffers = malloc(total_ncpus * sizeof(*buffers));
|
|
|
- for (n = 0; n < total_ncpus; n++)
|
|
|
- {
|
|
|
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
|
|
|
- ret = posix_memalign(&buffers[n], getpagesize(), 2*size);
|
|
|
- STARPU_ASSERT(ret == 0);
|
|
|
-#else
|
|
|
- buffers[n] = malloc(2*size);
|
|
|
-#endif
|
|
|
- }
|
|
|
|
|
|
- printf("# nw\talone\t\t+idle\t\tefficiency\talone int.l\t+idle int.l\tefficiency\n");
|
|
|
+ printf("# nw\ta comp.\t+sched\teff%%\ta scat.\t+nop\t+sync\t+sched\teff%% vs nop\n");
|
|
|
for (n = cpustep; n <= total_ncpus; n += cpustep)
|
|
|
{
|
|
|
if (noalone)
|
|
|
{
|
|
|
alone = 0.;
|
|
|
alone_int = 0.;
|
|
|
+ alone_int_nop = 0.;
|
|
|
+ alone_int_sync = 0.;
|
|
|
}
|
|
|
else
|
|
|
{
|
|
|
- alone = bench(&argc, &argv, n, n, 0);
|
|
|
- alone_int = bench(&argc, &argv, n, n, 1);
|
|
|
+ alone = bench(&argc, &argv, n, n, 0, PAUSE);
|
|
|
+ alone_int = bench(&argc, &argv, n, n, 1, PAUSE);
|
|
|
+ alone_int_nop = bench(&argc, &argv, n, total_ncpus, 1, NOP);
|
|
|
+ alone_int_sync = bench(&argc, &argv, n, total_ncpus, 1, SYNC);
|
|
|
}
|
|
|
- idle = bench(&argc, &argv, n, total_ncpus, 0);
|
|
|
- idle_int = bench(&argc, &argv, n, total_ncpus, 1);
|
|
|
- printf("%d\t%f\t%f\t%f\t%f\t%f\t%f\n", n, alone/1000, idle/1000, idle*100/alone, alone_int/1000, idle_int/1000, idle_int*100/alone_int);
|
|
|
+ sched = bench(&argc, &argv, n, total_ncpus, 0, SCHED);
|
|
|
+ sched_int = bench(&argc, &argv, n, total_ncpus, 1, SCHED);
|
|
|
+ printf("%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n",
|
|
|
+ n,
|
|
|
+ alone/1000,
|
|
|
+ sched/1000, sched*100/alone,
|
|
|
+ alone_int/1000,
|
|
|
+ alone_int_nop/1000,
|
|
|
+ alone_int_sync/1000,
|
|
|
+ sched_int/1000, sched_int*100/alone_int_nop);
|
|
|
fflush(stdout);
|
|
|
}
|
|
|
|