|
@@ -0,0 +1,240 @@
|
|
|
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
|
+ *
|
|
|
+ * Copyright (C) 2009-2015 Université de Bordeaux
|
|
|
+ * Copyright (C) 2010 Mehdi Juhoor <mjuhoor@gmail.com>
|
|
|
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015 CNRS
|
|
|
+ *
|
|
|
+ * StarPU is free software; you can redistribute it and/or modify
|
|
|
+ * it under the terms of the GNU Lesser General Public License as published by
|
|
|
+ * the Free Software Foundation; either version 2.1 of the License, or (at
|
|
|
+ * your option) any later version.
|
|
|
+ *
|
|
|
+ * StarPU is distributed in the hope that it will be useful, but
|
|
|
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
+ *
|
|
|
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
|
|
|
+ */
|
|
|
+
|
|
|
+/*
|
|
|
+ * This creates two dumb vectors, splits them into chunks, and for each pair of
|
|
|
+ * chunk, run axpy on them.
|
|
|
+ */
|
|
|
+
|
|
|
+#include <starpu.h>
|
|
|
+#include <stdlib.h>
|
|
|
+#include <stdio.h>
|
|
|
+#include <assert.h>
|
|
|
+#include <math.h>
|
|
|
+
|
|
|
+#include <common/blas.h>
|
|
|
+
|
|
|
+#ifdef STARPU_USE_CUDA
|
|
|
+#include <cublas.h>
|
|
|
+#endif
|
|
|
+
|
|
|
+
|
|
|
+#define N 512*512
|
|
|
+#define NITER 100
|
|
|
+
|
|
|
+
|
|
|
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
|
|
|
+
|
|
|
+#define EPSILON 1e-6
|
|
|
+
|
|
|
+float *_vec_x[NITER], *_vec_y[NITER];
|
|
|
+float _alpha = 3.41;
|
|
|
+
|
|
|
+/* descriptors for StarPU */
|
|
|
+starpu_data_handle_t _handle_y[NITER], _handle_x[NITER];
|
|
|
+
|
|
|
+void axpy_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
|
|
|
+{
|
|
|
+ float alpha = *((float *)arg);
|
|
|
+
|
|
|
+ unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
|
|
|
+
|
|
|
+ float *block_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
|
|
|
+ float *block_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
|
|
|
+
|
|
|
+ unsigned i;
|
|
|
+ for( i = 0; i < n; i++)
|
|
|
+ block_y[i] = alpha * block_x[i] + block_y[i];
|
|
|
+}
|
|
|
+
|
|
|
+#ifdef STARPU_USE_CUDA
|
|
|
+extern void cuda_axpy(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args);
|
|
|
+#endif
|
|
|
+
|
|
|
+static struct starpu_perfmodel axpy_model =
|
|
|
+{
|
|
|
+ .type = STARPU_HISTORY_BASED,
|
|
|
+ .symbol = "axpy"
|
|
|
+};
|
|
|
+
|
|
|
+static struct starpu_codelet axpy_cl =
|
|
|
+{
|
|
|
+ /* .cpu_funcs = {axpy_cpu}, */
|
|
|
+ /* .cpu_funcs_name = {"axpy_cpu"}, */
|
|
|
+#ifdef STARPU_USE_CUDA
|
|
|
+ .cuda_funcs = {cuda_axpy},
|
|
|
+#elif defined(STARPU_SIMGRID)
|
|
|
+ .cuda_funcs = {(void*)1},
|
|
|
+#endif
|
|
|
+ .cuda_flags = {STARPU_CUDA_ASYNC},
|
|
|
+ .nbuffers = 2,
|
|
|
+ .modes = {STARPU_R, STARPU_RW},
|
|
|
+ .name = "axpy",
|
|
|
+ .model = &axpy_model
|
|
|
+};
|
|
|
+
|
|
|
+static int
|
|
|
+check(int niter)
|
|
|
+{
|
|
|
+ int i;
|
|
|
+ for (i = 0; i < N; i++)
|
|
|
+ {
|
|
|
+ float expected_value = _alpha * _vec_x[niter][i] + 4.0;
|
|
|
+ if (fabs(_vec_y[niter][i] - expected_value) > expected_value * EPSILON)
|
|
|
+ {
|
|
|
+ FPRINTF(stderr,"at %d, %f*%f+%f=%f, expected %f\n", i, _alpha, _vec_x[niter][i], 4.0, _vec_y[niter][i], expected_value);
|
|
|
+ return EXIT_FAILURE;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return EXIT_SUCCESS;
|
|
|
+}
|
|
|
+
|
|
|
+int main(int argc, char **argv)
|
|
|
+{
|
|
|
+ int ret, exit_value = 0;
|
|
|
+ int iter;
|
|
|
+ /* Initialize StarPU */
|
|
|
+ ret = starpu_init(NULL);
|
|
|
+ if (ret == -ENODEV)
|
|
|
+ return 77;
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
|
|
|
+
|
|
|
+
|
|
|
+ /* This is equivalent to
|
|
|
+ vec_a = malloc(N*sizeof(float));
|
|
|
+ vec_b = malloc(N*sizeof(float));
|
|
|
+ */
|
|
|
+ for(iter = 0; iter < NITER; iter++)
|
|
|
+ {
|
|
|
+ starpu_malloc((void **)&_vec_x[iter], N*sizeof(float));
|
|
|
+ assert(_vec_x[iter]);
|
|
|
+
|
|
|
+ starpu_malloc((void **)&_vec_y[iter], N*sizeof(float));
|
|
|
+ assert(_vec_y[iter]);
|
|
|
+
|
|
|
+ unsigned i;
|
|
|
+ for (i = 0; i < N; i++)
|
|
|
+ {
|
|
|
+ _vec_x[iter][i] = 1.0f; /*(float)starpu_drand48(); */
|
|
|
+ _vec_y[iter][i] = 4.0f; /*(float)starpu_drand48(); */
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Declare the data to StarPU */
|
|
|
+ starpu_vector_data_register(&_handle_x[iter], STARPU_MAIN_RAM, (uintptr_t)_vec_x[iter], N, sizeof(float));
|
|
|
+ starpu_vector_data_register(&_handle_y[iter], STARPU_MAIN_RAM, (uintptr_t)_vec_y[iter], N, sizeof(float));
|
|
|
+ }
|
|
|
+
|
|
|
+ double start;
|
|
|
+ double end;
|
|
|
+#ifdef STARPU_USE_CUDA
|
|
|
+ int gpu_devid = -1;
|
|
|
+ int nfound_gpus = starpu_worker_get_devids(STARPU_CUDA_WORKER, &gpu_devid, 1);
|
|
|
+
|
|
|
+ printf("gpu_devid found %d \n", gpu_devid);
|
|
|
+ if(nfound_gpus == 0)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ unsigned nworkers = starpu_worker_get_count();
|
|
|
+ int stream_workerids[nworkers];
|
|
|
+
|
|
|
+ int nstreams = starpu_worker_get_stream_workerids(gpu_devid, stream_workerids, STARPU_CUDA_WORKER);
|
|
|
+
|
|
|
+ int s;
|
|
|
+ for(s = 0; s < nstreams; s++)
|
|
|
+ printf("stream w %d \n", stream_workerids[s]);
|
|
|
+
|
|
|
+ int ncpus = starpu_cpu_worker_get_count();
|
|
|
+ int workers[ncpus+nstreams];
|
|
|
+ starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, ncpus);
|
|
|
+
|
|
|
+ int sched_ctxs[nstreams];
|
|
|
+ int nsms[nstreams];
|
|
|
+ nsms[0] = 6;
|
|
|
+ nsms[1] = 7;
|
|
|
+
|
|
|
+ for(s = 0; s < nstreams; s++)
|
|
|
+ {
|
|
|
+ sched_ctxs[s] = starpu_sched_ctx_create(&stream_workerids[s], 1, "subctx", STARPU_SCHED_CTX_CUDA_NSMS, nsms[s], 0);
|
|
|
+ workers[ncpus+s] = stream_workerids[s];
|
|
|
+ }
|
|
|
+ unsigned sched_ctx1 = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0);
|
|
|
+
|
|
|
+ printf("parent ctx %d\n", sched_ctx1);
|
|
|
+ starpu_sched_ctx_set_context(&sched_ctx1);
|
|
|
+
|
|
|
+#endif
|
|
|
+ start = starpu_timing_now();
|
|
|
+
|
|
|
+ for (iter = 0; iter < NITER; iter++)
|
|
|
+ {
|
|
|
+ struct starpu_task *task = starpu_task_create();
|
|
|
+
|
|
|
+ task->cl = &axpy_cl;
|
|
|
+
|
|
|
+ task->cl_arg = &_alpha;
|
|
|
+ task->cl_arg_size = sizeof(_alpha);
|
|
|
+
|
|
|
+ task->handles[0] = _handle_x[iter];
|
|
|
+ task->handles[1] = _handle_y[iter];
|
|
|
+
|
|
|
+ ret = starpu_task_submit(task);
|
|
|
+ if (ret == -ENODEV)
|
|
|
+ {
|
|
|
+ exit_value = 77;
|
|
|
+ goto enodev;
|
|
|
+ }
|
|
|
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
|
|
|
+ }
|
|
|
+
|
|
|
+ starpu_task_wait_for_all();
|
|
|
+
|
|
|
+enodev:
|
|
|
+ for(iter = 0; iter < NITER; iter++)
|
|
|
+ {
|
|
|
+ starpu_data_unregister(_handle_x[iter]);
|
|
|
+ starpu_data_unregister(_handle_y[iter]);
|
|
|
+ }
|
|
|
+ end = starpu_timing_now();
|
|
|
+ double timing = end - start;
|
|
|
+
|
|
|
+ FPRINTF(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(float)/timing);
|
|
|
+
|
|
|
+// FPRINTF(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", _vec_y[iter][0], _alpha);
|
|
|
+
|
|
|
+ if (exit_value != 77)
|
|
|
+ {
|
|
|
+ for(iter = 0; iter < NITER; iter++)
|
|
|
+ {
|
|
|
+ exit_value = check(iter);
|
|
|
+ if(exit_value != EXIT_SUCCESS)
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ for(iter = 0; iter < NITER; iter++)
|
|
|
+ {
|
|
|
+ starpu_free((void *)_vec_x[iter]);
|
|
|
+ starpu_free((void *)_vec_y[iter]);
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Stop StarPU */
|
|
|
+ starpu_shutdown();
|
|
|
+
|
|
|
+ return exit_value;
|
|
|
+}
|