|
|
@@ -49,7 +49,10 @@
|
|
|
/* the number of FPGA devices */
|
|
|
static unsigned nfpgafpgas = -1;
|
|
|
static fpgaDeviceProp props[STARPU_MAXFPGADEVS];
|
|
|
-static size_t global_mem[STARPU_MAXFPGADEVS] = { 128*1024*1024*1024 };
|
|
|
+static size_t global_mem[STARPU_MAXFPGADEVS] = { 128ULL*1024*1024*1024 };
|
|
|
+
|
|
|
+static void _starpu_fpga_limit_global_mem(unsigned );
|
|
|
+static size_t _starpu_fpga_get_global_mem_size(unsigned devid);
|
|
|
|
|
|
void fpga_msg(char *msg){
|
|
|
printf(FPGA_OK "%s\n" NORMAL, msg);
|
|
|
@@ -70,7 +73,7 @@ int fpga_allocate_memory(fpga_mem *ptr, size_t size){
|
|
|
//This allocates BYTES
|
|
|
char *msg1="You asked to allocate ";
|
|
|
// printf(KCYN "%s%d*%d\n" KBLU, msg1,size,sizeof(unsigned));
|
|
|
- printf(FPGA_OK "%s%d bytes\n" NORMAL, msg1,size);
|
|
|
+ printf(FPGA_OK "%s%lu bytes\n" NORMAL, msg1,size);
|
|
|
|
|
|
*ptr =(fpga_mem) malloc(size);
|
|
|
|
|
|
@@ -160,40 +163,20 @@ int _starpu_fpga_driver_init(struct _starpu_worker *worker){
|
|
|
|
|
|
static int execute_job_on_fpga(struct _starpu_job *j, struct starpu_task *worker_task, struct _starpu_worker *fpga_args, int rank, struct starpu_perfmodel_arch* perf_arch){
|
|
|
int ret;
|
|
|
- int is_parallel_task = (j->task_size > 1);
|
|
|
int profiling = starpu_profiling_status_get();
|
|
|
- struct timespec codelet_start, codelet_end;
|
|
|
|
|
|
struct starpu_task *task = j->task;
|
|
|
struct starpu_codelet *cl = task->cl;
|
|
|
-#ifdef STARPU_OPENMP
|
|
|
- /* At this point, j->continuation as been cleared as the task is being
|
|
|
- * woken up, thus we use j->discontinuous instead for the check */
|
|
|
- const unsigned continuation_wake_up = j->discontinuous;
|
|
|
-#else
|
|
|
- const unsigned continuation_wake_up = 0;
|
|
|
-#endif
|
|
|
|
|
|
STARPU_ASSERT(cl);
|
|
|
|
|
|
- if (rank == 0 && !continuation_wake_up)
|
|
|
- {
|
|
|
- ret = _starpu_fetch_task_input(task, j, fpga_args);
|
|
|
- if (ret != 0)
|
|
|
- {
|
|
|
- /* there was not enough memory so the codelet cannot be executed right now ... */
|
|
|
- /* push the codelet back and try another one ... */
|
|
|
- return -EAGAIN;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- if (is_parallel_task)
|
|
|
+ /* TODO: use asynchronous */
|
|
|
+ ret = _starpu_fetch_task_input(task, j, 0);
|
|
|
+ if (ret != 0)
|
|
|
{
|
|
|
- STARPU_PTHREAD_BARRIER_WAIT(&j->before_work_barrier);
|
|
|
-
|
|
|
- /* In the case of a combined worker, the scheduler needs to know
|
|
|
- * when each actual worker begins the execution */
|
|
|
- _starpu_sched_pre_exec_hook(worker_task);
|
|
|
+ /* there was not enough memory so the codelet cannot be executed right now ... */
|
|
|
+ /* push the codelet back and try another one ... */
|
|
|
+ return -EAGAIN;
|
|
|
}
|
|
|
|
|
|
/* Give profiling variable */
|
|
|
@@ -204,13 +187,9 @@ static int execute_job_on_fpga(struct _starpu_job *j, struct starpu_task *worker
|
|
|
if ((rank == 0) || (cl->type != STARPU_FORKJOIN))
|
|
|
{
|
|
|
_starpu_cl_func_t func = _starpu_task_get_fpga_nth_implementation(cl, j->nimpl);
|
|
|
- char *kernel_type = _starpu_task_get_fpga_kernel_type_nth_implementation(cl, j->nimpl);
|
|
|
+ //char *kernel_type = _starpu_task_get_fpga_kernel_type_nth_implementation(cl, j->nimpl);
|
|
|
//printf("chanel reserved: %d \n",chnl);
|
|
|
|
|
|
- if (is_parallel_task && cl->type == STARPU_FORKJOIN)
|
|
|
- /* bind to parallel worker */
|
|
|
- _starpu_bind_thread_on_cpus(_starpu_get_combined_worker_struct(j->combined_workerid));
|
|
|
-
|
|
|
STARPU_ASSERT_MSG(func, "when STARPU_FPGA is defined in 'where', fpga_func or fpga_funcs has to be defined");
|
|
|
if (_starpu_get_disable_kernels() <= 0)
|
|
|
{
|
|
|
@@ -221,43 +200,13 @@ static int execute_job_on_fpga(struct _starpu_job *j, struct starpu_task *worker
|
|
|
func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
|
|
|
_STARPU_TRACE_END_EXECUTING();
|
|
|
}
|
|
|
- if (is_parallel_task && cl->type == STARPU_FORKJOIN)
|
|
|
- /* rebind to single CPU */
|
|
|
- _starpu_bind_thread_on_cpu(fpga_args->config, fpga_args->bindid, fpga_args->workerid);
|
|
|
}
|
|
|
|
|
|
_starpu_driver_end_job(fpga_args, j, perf_arch, rank, profiling);
|
|
|
|
|
|
- if (is_parallel_task)
|
|
|
- {
|
|
|
- STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
|
|
|
- ANNOTATE_HAPPENS_BEFORE(&j->after_work_busy_barrier);
|
|
|
- (void) STARPU_ATOMIC_ADD(&j->after_work_busy_barrier, -1);
|
|
|
- if (rank == 0)
|
|
|
- {
|
|
|
- /* Wait with a busy barrier for other workers to have
|
|
|
- * finished with the blocking barrier before we can
|
|
|
- * safely drop the job structure */
|
|
|
- while (j->after_work_busy_barrier > 0)
|
|
|
- {
|
|
|
- STARPU_UYIELD();
|
|
|
- STARPU_SYNCHRONIZE();
|
|
|
- }
|
|
|
- ANNOTATE_HAPPENS_AFTER(&j->after_work_busy_barrier);
|
|
|
- }
|
|
|
- }
|
|
|
+ _starpu_driver_update_job_feedback(j, fpga_args, perf_arch, profiling);
|
|
|
|
|
|
- if (rank == 0)
|
|
|
- {
|
|
|
- _starpu_driver_update_job_feedback(j, fpga_args, perf_arch, profiling);
|
|
|
-
|
|
|
-#ifdef STARPU_OPENMP
|
|
|
- if (!j->continuation)
|
|
|
-#endif
|
|
|
- {
|
|
|
- _starpu_push_task_output(j);
|
|
|
- }
|
|
|
- }
|
|
|
+ _starpu_push_task_output(j);
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
|
@@ -399,37 +348,23 @@ void *_starpu_fpga_worker(void *_arg){
|
|
|
return NULL;
|
|
|
}
|
|
|
|
|
|
-int _starpu_fpga_allocate_memory(int devid, fpga_mem *addr, size_t size)
|
|
|
+uintptr_t _starpu_fpga_allocate_memory(unsigned dst_node, size_t size, int flags)
|
|
|
{
|
|
|
+ (void) flags;
|
|
|
+ unsigned devid = starpu_memory_node_get_devid(dst_node);
|
|
|
+ STARPU_ASSERT(devid == 0); // For now
|
|
|
+
|
|
|
static fpga_mem current_address = 0;
|
|
|
+ fpga_mem addr;
|
|
|
// TODO: vérifier si current_address + size > taille de la LMEm
|
|
|
- *addr = current_address;
|
|
|
+ addr = current_address;
|
|
|
current_address += size;
|
|
|
-printf("fpga mem returned from allocation @: %p\n",*addr);
|
|
|
+printf("fpga mem returned from allocation @: %p\n",(void *) (uintptr_t) addr); /* %p expects a void * argument */
|
|
|
//success = 0
|
|
|
- return 0;
|
|
|
+ return (uintptr_t) addr;
|
|
|
}
|
|
|
|
|
|
|
|
|
-int _starpu_fpga_driver_init_from_worker(struct _starpu_worker *worker)
|
|
|
-{
|
|
|
- return _starpu_fpga_driver_init(worker->set);
|
|
|
-}
|
|
|
-
|
|
|
-int _starpu_fpga_run_from_worker(struct _starpu_worker *worker)
|
|
|
-{
|
|
|
- return _starpu_run_fpga(worker->set);
|
|
|
-}
|
|
|
-
|
|
|
-int _starpu_fpga_driver_run_once_from_worker(struct _starpu_worker *worker)
|
|
|
-{
|
|
|
- return _starpu_fpga_driver_run_once(worker->set);
|
|
|
-}
|
|
|
-
|
|
|
-int _starpu_fpga_driver_deinit_from_worker(struct _starpu_worker *worker)
|
|
|
-{
|
|
|
- return _starpu_fpga_driver_deinit(worker->set);
|
|
|
-}
|
|
|
|
|
|
int _starpu_fpga_copy_ram_to_fpga(void *src, void *dst, size_t size)
|
|
|
{
|
|
|
@@ -442,7 +377,7 @@ printf("ram to fpga, fpga @= %p\n",dst);
|
|
|
/* Transfert SIZE bytes from the address pointed by SRC in the SRC_NODE memory
|
|
|
* * node to the address pointed by DST in the DST_NODE memory node
|
|
|
* */
|
|
|
-void copy_ram_to_fpga(int32_t *src, int32_t dst, size_t size)
|
|
|
+void copy_ram_to_fpga(void *src, void *dst, size_t size)
|
|
|
{
|
|
|
printf("ram to fpga, fpga @= %p\n",dst);
|
|
|
|
|
|
@@ -450,9 +385,9 @@ printf("ram to fpga, fpga @= %p\n",dst);
|
|
|
|
|
|
}
|
|
|
|
|
|
-void copy_fpga_to_ram(int32_t *src, int32_t dst, size_t size)
|
|
|
+void copy_fpga_to_ram(void *src, void *dst, size_t size)
|
|
|
{
|
|
|
-printf("ram to fpga, fpga @= %p\n",dst);
|
|
|
+printf("fpga to ram, fpga @= %p\n",src); /* src is the FPGA-side address for this direction */
|
|
|
//LMemLoopback_readLMem(size, src, dst);
|
|
|
|
|
|
}
|
|
|
@@ -518,10 +453,10 @@ void _starpu_fpga_transfer_data(void *buffers[], struct _starpu_job *j, int chnl
|
|
|
{
|
|
|
case STARPU_VARIABLE_INTERFACE_ID:
|
|
|
{
|
|
|
- void *ptr = STARPU_VARIABLE_GET_PTR(buffers[index]);
|
|
|
+ void *ptr = (void*) STARPU_VARIABLE_GET_PTR(buffers[index]);
|
|
|
size_t size = STARPU_VARIABLE_GET_ELEMSIZE(buffers[index]);
|
|
|
//fpga_data_send(chnl,ptr,size);
|
|
|
- printf("Driver Fpga @: %p, size %d \n",ptr,size);
|
|
|
+ printf("Driver Fpga @: %p, size %lu \n",ptr,size);
|
|
|
break;
|
|
|
}
|
|
|
case STARPU_MATRIX_INTERFACE_ID:
|
|
|
@@ -540,17 +475,17 @@ void _starpu_fpga_transfer_data(void *buffers[], struct _starpu_job *j, int chnl
|
|
|
|
|
|
int _starpu_fpga_copy_data_from_cpu_to_fpga(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t ssize, struct _starpu_async_channel *async_channel)
|
|
|
{
|
|
|
- return _starpu_fpga_copy_ram_to_fpga(src + src_offset, dst + dst_offset, size);
|
|
|
+ return _starpu_fpga_copy_ram_to_fpga((void*) (src + src_offset), (void*) (dst + dst_offset), ssize); /* add offsets on the uintptr_t, then cast: void* arithmetic is a GNU extension */
|
|
|
}
|
|
|
|
|
|
int _starpu_fpga_copy_data_from_fpga_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t ssize, struct _starpu_async_channel *async_channel)
|
|
|
{
|
|
|
- return _starpu_fpga_copy_fpga_to_ram(src + src_offset, dst + dst_offset, size);
|
|
|
+ return _starpu_fpga_copy_fpga_to_ram((void*) (src + src_offset), (void*) (dst + dst_offset), ssize); /* add offsets on the uintptr_t, then cast: void* arithmetic is a GNU extension */
|
|
|
}
|
|
|
|
|
|
int _starpu_fpga_copy_data_from_fpga_to_fpga(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t ssize, struct _starpu_async_channel *async_channel)
|
|
|
{
|
|
|
- return _starpu_fpga_copy_fpga_to_fpga(src + src_offset, dst + dst_offset, size);
|
|
|
+ return _starpu_fpga_copy_fpga_to_fpga((void*) (src + src_offset), (void*) (dst + dst_offset), ssize); /* add offsets on the uintptr_t, then cast: void* arithmetic is a GNU extension */
|
|
|
}
|
|
|
|
|
|
int _starpu_fpga_copy_interface_from_fpga_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
|
|
|
@@ -560,6 +495,8 @@ int _starpu_fpga_copy_interface_from_fpga_to_cpu(starpu_data_handle_t handle, vo
|
|
|
|
|
|
STARPU_ASSERT(src_kind == STARPU_FPGA_RAM && dst_kind == STARPU_CPU_RAM);
|
|
|
|
|
|
+ int ret = 1;
|
|
|
+
|
|
|
const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
|
|
|
if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_fpga_copy_disabled() ||
|
|
|
!(copy_methods->fpga_to_ram_async || copy_methods->any_to_any))
|
|
|
@@ -575,14 +512,15 @@ int _starpu_fpga_copy_interface_from_fpga_to_cpu(starpu_data_handle_t handle, vo
|
|
|
{
|
|
|
//req->async_channel.type = STARPU_FPGA_RAM;
|
|
|
if (copy_methods->fpga_to_ram_async)
|
|
|
- copy_methods->fpga_to_ram_async(src_interface, src_node, dst_interface, dst_node);
|
|
|
+ ret = copy_methods->fpga_to_ram_async(src_interface, src_node, dst_interface, dst_node);
|
|
|
else
|
|
|
{
|
|
|
STARPU_ASSERT(copy_methods->any_to_any);
|
|
|
- copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
|
|
|
+ ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
|
|
|
}
|
|
|
//_starpu_fpga_init_event(&(req->async_channel.event.fpga_event), src_node);
|
|
|
}
|
|
|
+ return ret;
|
|
|
}
|
|
|
|
|
|
int _starpu_fpga_copy_interface_from_cpu_to_fpga(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
|
|
|
@@ -621,10 +559,10 @@ int _starpu_fpga_copy_interface_from_cpu_to_fpga(starpu_data_handle_t handle, vo
|
|
|
|
|
|
struct _starpu_driver_ops _starpu_driver_fpga_ops =
|
|
|
{
|
|
|
- .init = _starpu_fpga_driver_init_from_worker,
|
|
|
- .run = _starpu_fpga_run_from_worker,
|
|
|
- .run_once = _starpu_fpga_driver_run_once_from_worker,
|
|
|
- .deinit = _starpu_fpga_driver_deinit_from_worker
|
|
|
+ .init = _starpu_fpga_driver_init,
|
|
|
+ .run = _starpu_run_fpga,
|
|
|
+ .run_once = _starpu_fpga_driver_run_once,
|
|
|
+ .deinit = _starpu_fpga_driver_deinit
|
|
|
};
|
|
|
|
|
|
// TODO: structure node_ops, comme dans driver_cuda.c, avec starpu_fpga_allocate_memory, etc.
|