|
@@ -0,0 +1,342 @@
|
|
|
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
|
+ *
|
|
|
+ * Copyright (C) 2009-2020 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
|
|
|
+ *
|
|
|
+ * StarPU is free software; you can redistribute it and/or modify
|
|
|
+ * it under the terms of the GNU Lesser General Public License as published by
|
|
|
+ * the Free Software Foundation; either version 2.1 of the License, or (at
|
|
|
+ * your option) any later version.
|
|
|
+ *
|
|
|
+ * StarPU is distributed in the hope that it will be useful, but
|
|
|
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
+ *
|
|
|
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
|
|
|
+ */
|
|
|
+
|
|
|
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <starpu.h>
#include "../helper.h"
|
|
|
+
|
|
|
+/*
|
|
|
+ * Measure the memory bandwidth available to kernels depending on the number of
|
|
|
+ * kernels and number of idle workers.
|
|
|
+ */
|
|
|
+
|
|
|
+#ifdef STARPU_QUICK_CHECK
|
|
|
+static size_t size = 1024;
|
|
|
+static unsigned cpustep = 4;
|
|
|
+#else
|
|
|
+/* Must be bigger than available cache size per core, 64MiB should be enough */
|
|
|
+static size_t size = 64UL << 20;
|
|
|
+static unsigned cpustep = 1;
|
|
|
+#endif
|
|
|
+
|
|
|
+static unsigned noalone = 0;
|
|
|
+static unsigned iter = 30;
|
|
|
+static unsigned total_ncpus;
|
|
|
+static starpu_pthread_barrier_t barrier_begin, barrier_end;
|
|
|
+static float *result;
|
|
|
+static void **buffers; /* Indexed by logical core number */
|
|
|
+static char padding1[STARPU_CACHELINE_SIZE];
|
|
|
+static volatile char finished;
|
|
|
+static char padding2[STARPU_CACHELINE_SIZE];
|
|
|
+
|
|
|
+static unsigned interleave(unsigned i);
|
|
|
+
|
|
|
+/* Initialize the buffer locally */
|
|
|
+void initialize_buffer(void *foo)
|
|
|
+{
|
|
|
+ unsigned id = starpu_worker_get_id();
|
|
|
+#ifdef STARPU_HAVE_POSIX_MEMALIGN
|
|
|
+ int ret = posix_memalign(&buffers[id], getpagesize(), 2*size);
|
|
|
+ STARPU_ASSERT(ret == 0);
|
|
|
+#else
|
|
|
+ buffers[id] = malloc(2*size);
|
|
|
+#endif
|
|
|
+ memset(buffers[id], 0, 2*size);
|
|
|
+}
|
|
|
+
|
|
|
+/* Actual transfer codelet */
|
|
|
+void bw_func(void *descr[], void *arg)
|
|
|
+{
|
|
|
+ int id = (uintptr_t) arg;
|
|
|
+ void *src = buffers[id];
|
|
|
+ void *dst = (void*) ((uintptr_t)src + size);
|
|
|
+ unsigned i;
|
|
|
+ double start, stop;
|
|
|
+
|
|
|
+ STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
|
|
|
+ start = starpu_timing_now();
|
|
|
+ for (i = 0; i < iter; i++)
|
|
|
+ {
|
|
|
+ memcpy(dst, src, size);
|
|
|
+ STARPU_SYNCHRONIZE();
|
|
|
+ }
|
|
|
+ stop = starpu_timing_now();
|
|
|
+ STARPU_PTHREAD_BARRIER_WAIT(&barrier_end);
|
|
|
+ finished = 1;
|
|
|
+
|
|
|
+ result[id] = (size*iter) / (stop - start);
|
|
|
+}
|
|
|
+
|
|
|
+static struct starpu_codelet bw_codelet =
|
|
|
+{
|
|
|
+ .cpu_funcs = {bw_func},
|
|
|
+ .model = NULL,
|
|
|
+ .nbuffers = 0,
|
|
|
+};
|
|
|
+
|
|
|
+/* Codelet that waits for completion while doing lots of cpu yields (nop). */
|
|
|
+void nop_func(void *descr[], void *arg)
|
|
|
+{
|
|
|
+ STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
|
|
|
+ while (!finished)
|
|
|
+ {
|
|
|
+ unsigned i;
|
|
|
+ for (i = 0; i < 1000000; i++)
|
|
|
+ STARPU_UYIELD();
|
|
|
+ STARPU_SYNCHRONIZE();
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+static struct starpu_codelet nop_codelet =
|
|
|
+{
|
|
|
+ .cpu_funcs = {nop_func},
|
|
|
+ .model = NULL,
|
|
|
+ .nbuffers = 0,
|
|
|
+};
|
|
|
+
|
|
|
+/* Codelet that waits for completion while aggressively reading the finished variable. */
|
|
|
+void sync_func(void *descr[], void *arg)
|
|
|
+{
|
|
|
+ STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
|
|
|
+ while (!finished)
|
|
|
+ {
|
|
|
+ STARPU_VALGRIND_YIELD();
|
|
|
+ STARPU_SYNCHRONIZE();
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+static struct starpu_codelet sync_codelet =
|
|
|
+{
|
|
|
+ .cpu_funcs = {sync_func},
|
|
|
+ .model = NULL,
|
|
|
+ .nbuffers = 0,
|
|
|
+};
|
|
|
+
|
|
|
/*
 * Print the command line help on stderr and terminate the program with
 * EXIT_FAILURE. argv[0] is used as the program name.
 */
static void usage(char **argv)
{
	static const char *const option_help[] =
	{
		"\t-n niter\tNumber of iterations\n",
		"\t-s size\tBuffer size in MB\n",
		"\t-c cpustep\tCpu number increment\n",
		"\t-a Do not run the alone test\n",
	};
	size_t line;

	fprintf(stderr, "Usage: %s [-n niter] [-s size (MB)] [-c cpustep] [-a]\n", argv[0]);
	for (line = 0; line < sizeof(option_help) / sizeof(option_help[0]); line++)
		fputs(option_help[line], stderr);

	exit(EXIT_FAILURE);
}
|
|
|
+
|
|
|
+static void parse_args(int argc, char **argv)
|
|
|
+{
|
|
|
+ int c;
|
|
|
+ while ((c = getopt(argc, argv, "n:s:c:ah")) != -1)
|
|
|
+ switch(c)
|
|
|
+ {
|
|
|
+ case 'n':
|
|
|
+ iter = atoi(optarg);
|
|
|
+ break;
|
|
|
+ case 's':
|
|
|
+ size = (long)atoi(optarg) << 20;
|
|
|
+ break;
|
|
|
+ case 'c':
|
|
|
+ cpustep = atoi(optarg);
|
|
|
+ break;
|
|
|
+ case 'a':
|
|
|
+ noalone = 1;
|
|
|
+ break;
|
|
|
+ case 'h':
|
|
|
+ usage(argv);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+static unsigned interleave(unsigned i)
|
|
|
+{
|
|
|
+ /* TODO: rather distribute over hierarchy */
|
|
|
+ if (total_ncpus > 1)
|
|
|
+ return (i % (total_ncpus/2))*2 + i / (total_ncpus/2);
|
|
|
+ else
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
/* What happens on the CPUs that do not run a bandwidth kernel, see bench() */
enum sleep_type
{
	/* No waiting task is submitted, the start barrier only counts the busy kernels */
	PAUSE = 0,
	/* The remaining CPUs run nop_codelet */
	NOP = 1,
	/* The remaining CPUs run sync_codelet */
	SYNC = 2,
	/* No waiting task is submitted either; main() uses it with all the CPU workers started */
	SCHED = 3,
};
|
|
|
+
|
|
|
+static float bench(int *argc, char ***argv, unsigned nbusy, unsigned ncpus, int intl, enum sleep_type sleep)
|
|
|
+{
|
|
|
+ int ret;
|
|
|
+ unsigned i;
|
|
|
+ struct starpu_conf conf;
|
|
|
+ float bw;
|
|
|
+
|
|
|
+ starpu_conf_init(&conf);
|
|
|
+ conf.precedence_over_environment_variables = 1;
|
|
|
+ conf.ncuda = 0;
|
|
|
+ conf.nopencl = 0;
|
|
|
+ conf.nmic = 0;
|
|
|
+ conf.nmpi_ms = 0;
|
|
|
+ conf.ncpus = ncpus;
|
|
|
+
|
|
|
+ if (intl && sleep == PAUSE)
|
|
|
+ {
|
|
|
+ conf.use_explicit_workers_bindid = 1;
|
|
|
+ for (i = 0; i < ncpus; i++)
|
|
|
+ conf.workers_bindid[i] = interleave(i);
|
|
|
+ }
|
|
|
+
|
|
|
+ ret = starpu_initialize(&conf, argc, argv);
|
|
|
+ if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
|
|
|
+
|
|
|
+ if (sleep == PAUSE || sleep == SCHED)
|
|
|
+ /* In these cases we don't have a task on each cpu */
|
|
|
+ STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, nbusy);
|
|
|
+ else
|
|
|
+ STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, ncpus);
|
|
|
+
|
|
|
+ STARPU_PTHREAD_BARRIER_INIT(&barrier_end, NULL, nbusy);
|
|
|
+
|
|
|
+ finished = 0;
|
|
|
+ for (i = 0; i < ncpus; i++)
|
|
|
+ result[i] = NAN;
|
|
|
+
|
|
|
+ for (i = 0; i < nbusy; i++)
|
|
|
+ {
|
|
|
+ struct starpu_task *task = starpu_task_create();
|
|
|
+ task->cl = &bw_codelet;
|
|
|
+
|
|
|
+ if (intl)
|
|
|
+ task->cl_arg = (void*) (uintptr_t) interleave(i);
|
|
|
+ else
|
|
|
+ task->cl_arg = (void*) (uintptr_t) i;
|
|
|
+
|
|
|
+ task->execute_on_a_specific_worker = 1;
|
|
|
+ if (intl && sleep != PAUSE) /* In the pause case we interleaved above */
|
|
|
+ task->workerid = interleave(i);
|
|
|
+ else
|
|
|
+ task->workerid = i;
|
|
|
+
|
|
|
+ ret = starpu_task_submit(task);
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
|
|
|
+ }
|
|
|
+
|
|
|
+ if (sleep != PAUSE && sleep != SCHED)
|
|
|
+ {
|
|
|
+ /* Add waiting tasks */
|
|
|
+ for ( ; i < ncpus; i++)
|
|
|
+ {
|
|
|
+ struct starpu_task *task = starpu_task_create();
|
|
|
+ switch (sleep)
|
|
|
+ {
|
|
|
+ case NOP:
|
|
|
+ task->cl = &nop_codelet;
|
|
|
+ break;
|
|
|
+ case SYNC:
|
|
|
+ task->cl = &sync_codelet;
|
|
|
+ break;
|
|
|
+ default:
|
|
|
+ STARPU_ASSERT(0);
|
|
|
+ }
|
|
|
+ task->execute_on_a_specific_worker = 1;
|
|
|
+ task->workerid = interleave(i);
|
|
|
+ ret = starpu_task_submit(task);
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ starpu_task_wait_for_all();
|
|
|
+ starpu_shutdown();
|
|
|
+
|
|
|
+ for (bw = 0., i = 0; i < nbusy; i++)
|
|
|
+ {
|
|
|
+ if (intl)
|
|
|
+ bw += result[interleave(i)];
|
|
|
+ else
|
|
|
+ bw += result[i];
|
|
|
+ }
|
|
|
+ return bw;
|
|
|
+}
|
|
|
+
|
|
|
+int main(int argc, char **argv)
|
|
|
+{
|
|
|
+ int ret;
|
|
|
+ unsigned n;
|
|
|
+ struct starpu_conf conf;
|
|
|
+ float alone, alone_int, alone_int_nop, alone_int_sync, sched, sched_int;
|
|
|
+
|
|
|
+ parse_args(argc, argv);
|
|
|
+
|
|
|
+ starpu_conf_init(&conf);
|
|
|
+ conf.precedence_over_environment_variables = 1;
|
|
|
+ conf.ncuda = 0;
|
|
|
+ conf.nopencl = 0;
|
|
|
+ conf.nmic = 0;
|
|
|
+ conf.nmpi_ms = 0;
|
|
|
+
|
|
|
+ ret = starpu_initialize(&conf, &argc, &argv);
|
|
|
+ if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
|
|
|
+ total_ncpus = starpu_cpu_worker_get_count();
|
|
|
+
|
|
|
+ buffers = malloc(total_ncpus * sizeof(*buffers));
|
|
|
+ starpu_execute_on_each_worker_ex(initialize_buffer, NULL, STARPU_CPU, "init_buffer");
|
|
|
+ starpu_shutdown();
|
|
|
+
|
|
|
+ if (total_ncpus == 0)
|
|
|
+ return STARPU_TEST_SKIPPED;
|
|
|
+
|
|
|
+ result = malloc(total_ncpus * sizeof(result[0]));
|
|
|
+
|
|
|
+ printf("# nw\ta comp.\t+sched\teff%%\ta scat.\t+nop\t+sync\t+sched\teff%% vs nop\n");
|
|
|
+ for (n = cpustep; n <= total_ncpus; n += cpustep)
|
|
|
+ {
|
|
|
+ if (noalone)
|
|
|
+ {
|
|
|
+ alone = 0.;
|
|
|
+ alone_int = 0.;
|
|
|
+ alone_int_nop = 0.;
|
|
|
+ alone_int_sync = 0.;
|
|
|
+ }
|
|
|
+ else
|
|
|
+ {
|
|
|
+ alone = bench(&argc, &argv, n, n, 0, PAUSE);
|
|
|
+ alone_int = bench(&argc, &argv, n, n, 1, PAUSE);
|
|
|
+ alone_int_nop = bench(&argc, &argv, n, total_ncpus, 1, NOP);
|
|
|
+ alone_int_sync = bench(&argc, &argv, n, total_ncpus, 1, SYNC);
|
|
|
+ }
|
|
|
+ sched = bench(&argc, &argv, n, total_ncpus, 0, SCHED);
|
|
|
+ sched_int = bench(&argc, &argv, n, total_ncpus, 1, SCHED);
|
|
|
+ printf("%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n",
|
|
|
+ n,
|
|
|
+ alone/1000,
|
|
|
+ sched/1000, sched*100/alone,
|
|
|
+ alone_int/1000,
|
|
|
+ alone_int_nop/1000,
|
|
|
+ alone_int_sync/1000,
|
|
|
+ sched_int/1000, sched_int*100/alone_int_nop);
|
|
|
+ fflush(stdout);
|
|
|
+ }
|
|
|
+
|
|
|
+ free(result);
|
|
|
+
|
|
|
+ for (n = 0; n < total_ncpus; n++)
|
|
|
+ free(buffers[n]);
|
|
|
+
|
|
|
+ return EXIT_SUCCESS;
|
|
|
+}
|