/* StarPU --- Runtime system for heterogeneous multicore architectures. * * Copyright (C) 2016 Uppsala University * * StarPU is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or (at * your option) any later version. * * StarPU is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License in COPYING.LGPL for more details. */ /* * This creates two dumb vectors & run axpy on them. * You have to set STARPU_NWORKER_PER_CUDA=2 */ #include #include #include #include #include #include #ifdef STARPU_USE_CUDA #include #endif #define N 512*512 #define NITER 100 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0) #define EPSILON 1e-6 float *_vec_x[NITER], *_vec_y[NITER]; float _alpha = 3.41; /* descriptors for StarPU */ starpu_data_handle_t _handle_y[NITER], _handle_x[NITER]; void axpy_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg) { float alpha = *((float *)arg); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); float *block_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]); float *block_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]); unsigned i; for( i = 0; i < n; i++) block_y[i] = alpha * block_x[i] + block_y[i]; } #ifdef STARPU_USE_CUDA extern void cuda_axpy(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args); #endif static struct starpu_perfmodel axpy_model = { .type = STARPU_HISTORY_BASED, .symbol = "axpy" }; static struct starpu_codelet axpy_cl = { /* .cpu_funcs = {axpy_cpu}, */ /* .cpu_funcs_name = {"axpy_cpu"}, */ #ifdef STARPU_USE_CUDA .cuda_funcs = {cuda_axpy}, #elif defined(STARPU_SIMGRID) .cuda_funcs = {(void*)1}, #endif .cuda_flags = {STARPU_CUDA_ASYNC}, .nbuffers = 2, .modes = {STARPU_R, STARPU_RW}, .name = "axpy", .model = &axpy_model }; static int check(int niter) { int i; for (i = 0; i < N; i++) { float expected_value = _alpha * _vec_x[niter][i] + 4.0; if (fabs(_vec_y[niter][i] - expected_value) > expected_value * EPSILON) { FPRINTF(stderr,"[error] at %d, %f*%f+%f==%f, expected %f\n", i, _alpha, _vec_x[niter][i], 4.0, _vec_y[niter][i], expected_value); return EXIT_FAILURE; } } return EXIT_SUCCESS; } int main(int argc, char **argv) { int ret, exit_value = 0; int iter; int ncuda = 0; int gpu_devid = -1; #ifndef STARPU_HAVE_SETENV return STARPU_TEST_SKIPPED; #endif /* Request separate threads for streams */ setenv("STARPU_ONE_THREAD_PER_STREAM", "1"); /* Initialize StarPU */ ret = starpu_init(NULL); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); #ifdef STARPU_USE_CUDA ncuda = starpu_worker_get_devids(STARPU_CUDA_WORKER, &gpu_devid, 1); FPRINTF(stderr, "gpu_devid found %d \n", gpu_devid); #endif if (ncuda == 0) { starpu_shutdown(); return 77; } /* This is equivalent to vec_a = malloc(N*sizeof(float)); vec_b = malloc(N*sizeof(float)); */ for(iter = 0; iter < NITER; iter++) { starpu_malloc((void **)&_vec_x[iter], N*sizeof(float)); assert(_vec_x[iter]); starpu_malloc((void **)&_vec_y[iter], N*sizeof(float)); assert(_vec_y[iter]); unsigned i; for (i = 0; i < N; i++) { _vec_x[iter][i] = 1.0f; /*(float)starpu_drand48(); */ _vec_y[iter][i] = 4.0f; /*(float)starpu_drand48(); */ } /* Declare the data to StarPU */ starpu_vector_data_register(&_handle_x[iter], STARPU_MAIN_RAM, (uintptr_t)_vec_x[iter], N, sizeof(float)); starpu_vector_data_register(&_handle_y[iter], STARPU_MAIN_RAM, (uintptr_t)_vec_y[iter], N, sizeof(float)); } double start; double end; #ifdef STARPU_USE_CUDA unsigned nworkers = starpu_worker_get_count(); int stream_workerids[nworkers]; int nstreams = starpu_worker_get_stream_workerids(gpu_devid, stream_workerids, STARPU_CUDA_WORKER); int s; for(s = 0; s < nstreams; s++) FPRINTF(stderr, "stream w %d \n", stream_workerids[s]); int ncpus = starpu_cpu_worker_get_count(); int workers[ncpus+nstreams]; starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, ncpus); int sched_ctxs[nstreams]; int nsms[nstreams]; nsms[0] = 6; nsms[1] = 7; for(s = 0; s < nstreams; s++) { sched_ctxs[s] = starpu_sched_ctx_create(&stream_workerids[s], 1, "subctx", STARPU_SCHED_CTX_CUDA_NSMS, nsms[s], 0); workers[ncpus+s] = stream_workerids[s]; } unsigned sched_ctx1 = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0); FPRINTF(stderr, "parent ctx %d\n", sched_ctx1); starpu_sched_ctx_set_context(&sched_ctx1); #endif start = starpu_timing_now(); for (iter = 0; iter < NITER; iter++) { struct starpu_task *task = starpu_task_create(); task->cl = &axpy_cl; task->cl_arg = &_alpha; task->cl_arg_size = sizeof(_alpha); task->handles[0] = _handle_x[iter]; task->handles[1] = _handle_y[iter]; ret = starpu_task_submit(task); if (ret == -ENODEV) { exit_value = 77; goto enodev; } STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } starpu_task_wait_for_all(); enodev: for(iter = 0; iter < NITER; iter++) { starpu_data_unregister(_handle_x[iter]); starpu_data_unregister(_handle_y[iter]); } end = starpu_timing_now(); double timing = end - start; FPRINTF(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(float)/timing); // FPRINTF(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", _vec_y[iter][0], _alpha); if (exit_value != 77) { for(iter = 0; iter < NITER; iter++) { exit_value = check(iter); if(exit_value != EXIT_SUCCESS) break; } } for(iter = 0; iter < NITER; iter++) { starpu_free((void *)_vec_x[iter]); starpu_free((void *)_vec_y[iter]); } /* Stop StarPU */ starpu_shutdown(); return exit_value; }