/* StarPU --- Runtime system for heterogeneous multicore architectures. * * Copyright (C) 2010-2020 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria * Copyright (C) 2013 Thibaut Lambert * * StarPU is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or (at * your option) any later version. * * StarPU is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License in COPYING.LGPL for more details. */ #include #include #include #include #include #include #include #include static void common_data_cpy_func(void *descr[], void *cl_arg) { unsigned interface_id = *(unsigned *)cl_arg; const struct starpu_data_interface_ops *interface_ops = _starpu_data_interface_get_ops(interface_id); const struct starpu_data_copy_methods *copy_methods = interface_ops->copy_methods; int workerid = starpu_worker_get_id_check(); enum starpu_worker_archtype type = starpu_worker_get_type(workerid); unsigned memory_node = starpu_worker_get_memory_node(workerid); void *dst_interface = descr[0]; void *src_interface = descr[1]; switch (type) { case STARPU_CPU_WORKER: if (copy_methods->ram_to_ram) { copy_methods->ram_to_ram(src_interface, memory_node, dst_interface, memory_node); return; } break; #ifdef STARPU_USE_CUDA case STARPU_CUDA_WORKER: { cudaStream_t stream = starpu_cuda_get_local_stream(); if (copy_methods->cuda_to_cuda_async) { copy_methods->cuda_to_cuda_async(src_interface, memory_node, dst_interface, memory_node, stream); return; } else if (copy_methods->cuda_to_cuda) { copy_methods->cuda_to_cuda(src_interface, memory_node, dst_interface, memory_node); return; } break; } #endif case STARPU_OPENCL_WORKER: if (copy_methods->opencl_to_opencl) { copy_methods->opencl_to_opencl(src_interface, memory_node, dst_interface, memory_node); return; } break; default: /* unknown architecture */ STARPU_ABORT(); } STARPU_ASSERT(copy_methods->any_to_any); copy_methods->any_to_any(src_interface, memory_node, dst_interface, memory_node, NULL); } void mp_cpy_kernel(void *descr[], void *cl_arg) { unsigned interface_id = *(unsigned *)cl_arg; const struct starpu_data_interface_ops *interface_ops = _starpu_data_interface_get_ops(interface_id); const struct starpu_data_copy_methods *copy_methods = interface_ops->copy_methods; void *dst_interface = descr[0]; void *src_interface = descr[1]; if(copy_methods->ram_to_ram) copy_methods->ram_to_ram(src_interface, STARPU_MAIN_RAM, dst_interface, STARPU_MAIN_RAM); else if(copy_methods->any_to_any) copy_methods->any_to_any(src_interface, STARPU_MAIN_RAM, dst_interface, STARPU_MAIN_RAM, NULL); else STARPU_ABORT(); } static starpu_mic_kernel_t mic_cpy_func() { #ifdef STARPU_USE_MIC starpu_mic_func_symbol_t mic_symbol = NULL; starpu_mic_register_kernel(&mic_symbol, "mp_cpy_kernel"); return starpu_mic_get_kernel(mic_symbol); #else STARPU_ABORT(); return NULL; #endif } struct starpu_perfmodel copy_model = { .type = STARPU_HISTORY_BASED, .symbol = "starpu_data_cpy" }; static struct starpu_codelet copy_cl = { .where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_MIC, .cpu_funcs = {common_data_cpy_func}, .cuda_funcs = {common_data_cpy_func}, .opencl_funcs = {common_data_cpy_func}, .mic_funcs = {mic_cpy_func}, .nbuffers = 2, .modes = {STARPU_W, STARPU_R}, .model = ©_model }; int _starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_handle, int asynchronous, void (*callback_func)(void*), void *callback_arg, int reduction, struct starpu_task *reduction_dep_task) { struct starpu_task *task = starpu_task_create(); STARPU_ASSERT(task); task->name = "data_cpy"; struct _starpu_job *j = _starpu_get_job_associated_to_task(task); if (reduction) { j->reduction_task = reduction; if (reduction_dep_task) starpu_task_declare_deps_array(task, 1, &reduction_dep_task); } task->cl = ©_cl; unsigned *interface_id; _STARPU_MALLOC(interface_id, sizeof(*interface_id)); *interface_id = dst_handle->ops->interfaceid; task->cl_arg = interface_id; task->cl_arg_size = sizeof(*interface_id); task->cl_arg_free = 1; task->priority = STARPU_MAX_PRIO; //TODO: make it as a parameter task->callback_func = callback_func; task->callback_arg = callback_arg; STARPU_TASK_SET_HANDLE(task, dst_handle, 0); STARPU_TASK_SET_HANDLE(task, src_handle, 1); task->synchronous = !asynchronous; int ret = _starpu_task_submit_internally(task); STARPU_ASSERT(!ret); return 0; } int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_handle, int asynchronous, void (*callback_func)(void*), void *callback_arg) { return _starpu_data_cpy(dst_handle, src_handle, asynchronous, callback_func, callback_arg, 0, NULL); } /* TODO: implement copy on write, and introduce starpu_data_dup as well */ int starpu_data_dup_ro(starpu_data_handle_t *dst_handle, starpu_data_handle_t src_handle, int asynchronous) { _starpu_spin_lock(&src_handle->header_lock); if (src_handle->readonly_dup) { /* Already a ro duplicate, just return it with one more ref */ *dst_handle = src_handle->readonly_dup; _starpu_spin_unlock(&src_handle->header_lock); _starpu_spin_lock(&(*dst_handle)->header_lock); (*dst_handle)->aliases++; _starpu_spin_unlock(&(*dst_handle)->header_lock); return 0; } if (src_handle->readonly) { src_handle->aliases++; _starpu_spin_unlock(&src_handle->header_lock); *dst_handle = src_handle; return 0; } _starpu_spin_unlock(&src_handle->header_lock); starpu_data_register_same(dst_handle, src_handle); _starpu_data_cpy(*dst_handle, src_handle, asynchronous, NULL, NULL, 0, NULL); (*dst_handle)->readonly = 1; _starpu_spin_lock(&src_handle->header_lock); src_handle->readonly_dup = (*dst_handle); (*dst_handle)->readonly_dup_of = src_handle; _starpu_spin_unlock(&src_handle->header_lock); return 0; }