Browse Source

simgrid: Add GPU synchronization point on cudaFree, like CUDA does

Samuel Thibault 8 years ago
parent
commit
0645ff44a6
3 changed files with 70 additions and 4 deletions
  1. 67 4
      src/core/simgrid.c
  2. 1 0
      src/core/simgrid.h
  3. 2 0
      src/datawizard/malloc.c

+ 67 - 4
src/core/simgrid.c

@@ -687,10 +687,13 @@ static int transfer_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARP
 		if (t->last_transfer == transfer)
 			t->last_transfer = NULL;
 
-		_STARPU_DEBUG("transfer %p started\n", transfer);
-		MSG_task_execute(transfer->task);
-		MSG_task_destroy(transfer->task);
-		_STARPU_DEBUG("transfer %p finished\n", transfer);
+		if (transfer->task)
+		{
+			_STARPU_DEBUG("transfer %p started\n", transfer);
+			MSG_task_execute(transfer->task);
+			MSG_task_destroy(transfer->task);
+			_STARPU_DEBUG("transfer %p finished\n", transfer);
+		}
 
 		*transfer->finished = 1;
 		transfer_list_erase(&pending, transfer);
@@ -776,6 +779,60 @@ int _starpu_simgrid_test_transfer_event(union _starpu_async_channel_event *event
 	return event->finished;
 }
 
+/* Wait for completion of all transfers.
+ *
+ * A pseudo-transfer with no task attached is registered in the wake array of
+ * every transfer currently in the pending list, then the caller blocks until
+ * that pseudo-transfer has been flagged as finished.  When the pending list
+ * is empty, this returns immediately. */
+static void _starpu_simgrid_wait_transfers(void)
+{
+	/* Completion flag of the pseudo-transfer.  It lives on this stack frame,
+	 * which is fine since we do not return before it is set (or before sync
+	 * is released without having been queued). */
+	unsigned finished = 0;
+	struct transfer *sync = transfer_new();
+	struct transfer *cur;
+
+	/* No task: this pseudo-transfer is only a synchronization point */
+	sync->task = NULL;
+	sync->finished = &finished;
+
+	sync->src_node = STARPU_MAIN_RAM;
+	sync->dst_node = STARPU_MAIN_RAM;
+	sync->run_node = STARPU_MAIN_RAM;
+
+	sync->wake = NULL;
+	sync->nwake = 0;
+	sync->nwait = 0;
+	sync->next = NULL;
+
+	/* Make the pseudo-transfer wait for every pending transfer */
+	for (cur  = transfer_list_begin(&pending);
+	     cur != transfer_list_end(&pending);
+	     cur  = transfer_list_next(cur))
+	{
+		sync->nwait++;
+		/* Grow by one element: size it from the element type, not from
+		 * the type of the array pointer itself */
+		_STARPU_REALLOC(cur->wake, (cur->nwake + 1) * sizeof(*cur->wake));
+		cur->wake[cur->nwake] = sync;
+		cur->nwake++;
+	}
+
+	if (sync->nwait == 0)
+	{
+		/* No transfer to wait for */
+		free(sync);
+		return;
+	}
+
+	/* Push synchronization pseudo-transfer */
+	transfer_list_push_front(&pending, sync);
+
+	/* And wait for it */
+	/* NOTE(review): this waits on the STARPU_MAIN_RAM queue, matching
+	 * sync->run_node above; presumably the completion code signals the queue
+	 * of the run node -- confirm against transfer_execute. */
+	starpu_pthread_wait_t wait;
+	starpu_pthread_wait_init(&wait);
+	starpu_pthread_queue_register(&wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
+	while(1)
+	{
+		/* Reset before testing the flag; presumably so that a wake-up
+		 * happening between the test and the wait is not lost -- confirm
+		 * against the starpu_pthread_wait_* semantics. */
+		starpu_pthread_wait_reset(&wait);
+		if (finished)
+			break;
+		starpu_pthread_wait_wait(&wait);
+	}
+	starpu_pthread_queue_unregister(&wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
+	starpu_pthread_wait_destroy(&wait);
+}
+
 /* Data transfer issued by StarPU */
 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req)
 {
@@ -845,6 +902,12 @@ int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node,
 	}
 }
 
+/* Sync all GPUs (typically used when freeing CUDA memory).
+ *
+ * This blocks the caller until _starpu_simgrid_wait_transfers() returns,
+ * i.e. until the transfers it waits for have completed. */
+void _starpu_simgrid_sync_gpus(void)
+{
+	_starpu_simgrid_wait_transfers();
+}
+
 int
 _starpu_simgrid_thread_start(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[])
 {

+ 1 - 0
src/core/simgrid.h

@@ -47,6 +47,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *job, struct st
 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
 int _starpu_simgrid_wait_transfer_event(union _starpu_async_channel_event *event);
 int _starpu_simgrid_test_transfer_event(union _starpu_async_channel_event *event);
+void _starpu_simgrid_sync_gpus(void);
 /* Return the number of hosts prefixed by PREFIX */
 int _starpu_simgrid_get_nbhosts(const char *prefix);
 unsigned long long _starpu_simgrid_get_memsize(const char *prefix, unsigned devid);

+ 2 - 0
src/datawizard/malloc.c

@@ -645,6 +645,8 @@ _starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int f
 			if (_starpu_simgrid_cuda_malloc_cost())
 				MSG_process_sleep(0.000750);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
+			/* CUDA also synchronizes roughly everything on cudaFree */
+			_starpu_simgrid_sync_gpus();
 #else
 			cudaError_t err;
 			unsigned devid = _starpu_memory_node_get_devid(dst_node);