
Separation between event enqueueing and task submission

In preparation for postponing task execution, events can now be enqueued in command queues without submitting their associated tasks. The dependencies of a task are returned when its event is enqueued, so that submission can happen later.
Sylvain Henry 13 years ago
parent
commit
d5a3dd84c9
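
Every enqueue entry point below switches from a single command_queue_enqueue() call, which both recorded the event in the queue and submitted the StarPU task, to a two-step pattern. A minimal sketch of that pattern, assembled from the hunks below (error handling omitted):

   cl_int ndeps;
   cl_event *deps;

   /* Create a StarPU task and get its associated event */
   starpu_task *task = task_create(CL_COMMAND_READ_BUFFER);
   cl_event ev = task_event(task);

   /* Step 1: record the event in the command queue; implicit
    * (in-order/barrier) and explicit (events[]) dependencies are
    * merged and returned through ndeps/deps */
   command_queue_enqueue(cq, ev, 0, num_events, events, &ndeps, &deps);

   /* Step 2: submit the task with those dependencies; this step is now
    * separate so that it can later be postponed */
   task_submit(task, ndeps, deps);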

+ 1 - 1
socl/src/cl_enqueuebarrier.c

@@ -19,7 +19,7 @@
 CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueBarrier(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0
 {
-   cl_event ev = enqueueBarrier(cq);   
+   cl_event ev = command_queue_barrier(cq);   
    gc_entity_release(ev);
 
    return CL_SUCCESS;

+ 8 - 3
socl/src/cl_enqueuecopybuffer.c

@@ -81,6 +81,9 @@ soclEnqueueCopyBuffer(cl_command_queue  cq,
    struct arg_copybuffer *arg;
    cl_event ev;
 
+   cl_int ndeps;
+   cl_event *deps;
+
    task = task_create(CL_COMMAND_COPY_BUFFER);
    ev = task_event(task);
 
@@ -103,9 +106,11 @@ soclEnqueueCopyBuffer(cl_command_queue  cq,
 
    DEBUG_MSG("Submitting CopyBuffer task (event %d)\n", ev->id);
 
-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events, events);
+   command_queue_enqueue(cq, task_event(task), 0, num_events, events, &ndeps, &deps);
+
+   task_submit(task, ndeps, deps);
 
-   RETURN_EVENT(ev, event);
+   RETURN_OR_RELEASE_EVENT(ev, event);
 
-   return ret;
+   return CL_SUCCESS;
 }

+ 7 - 2
socl/src/cl_enqueuemapbuffer.c

@@ -57,6 +57,8 @@ soclEnqueueMapBuffer(cl_command_queue cq,
    struct mb_data *arg;
    cl_event ev;
    cl_int err;
+   cl_int ndeps;
+   cl_event *deps;
 
    /* Create custom event that will be triggered when map is complete */
    ev = event_create();
@@ -73,7 +75,10 @@ soclEnqueueMapBuffer(cl_command_queue cq,
 
    /* Enqueue task */
    DEBUG_MSG("Submitting MapBuffer task (event %d)\n", ev->id);
-   err = command_queue_enqueue_fakeevent(cq, task, 0, num_events, events, ev);
+   command_queue_enqueue(cq, ev, 0, num_events, events, &ndeps, &deps);
+
+   task_submit(task, ndeps, deps);
+
    gc_entity_release(map_event);
 
    if (errcode_ret != NULL)
@@ -85,7 +90,7 @@ soclEnqueueMapBuffer(cl_command_queue cq,
    if (blocking_map == CL_TRUE)
       soclWaitForEvents(1, &ev);
 
-   RETURN_EVENT(ev, event);
+   RETURN_OR_RELEASE_EVENT(ev, event);
 
    return (void*)(starpu_variable_get_local_ptr(buffer->handle) + offset);
 }

+ 12 - 5
socl/src/cl_enqueuemarker.c

@@ -20,11 +20,18 @@ CL_API_ENTRY cl_int CL_API_CALL
 soclEnqueueMarker(cl_command_queue  cq,
                 cl_event *          event) CL_API_SUFFIX__VERSION_1_0
 {
-   if (event == NULL)
-      return CL_INVALID_VALUE;
+	if (event == NULL)
+		return CL_INVALID_VALUE;
 
-   starpu_task * task = task_create(CL_COMMAND_MARKER);
-   *event = task_event(task);
+	cl_int ndeps;
+	cl_event *deps;
 
-   return command_queue_enqueue(cq, task, 0, 0, NULL);
+	starpu_task * task = task_create(CL_COMMAND_MARKER);
+	*event = task_event(task);
+
+	command_queue_enqueue(cq, task_event(task), 0, 0, NULL, &ndeps, &deps);
+
+	task_submit(task, ndeps, deps);
+
+	return CL_SUCCESS;
 }

+ 36 - 53
socl/src/cl_enqueuendrangekernel.c

@@ -132,7 +132,7 @@ static struct starpu_perfmodel_t perf_model = {
 /**
  * Real kernel enqueuing command
  */
-cl_int node_play_enqueue_kernel(node_enqueue_kernel n) {
+cl_int graph_play_enqueue_kernel(node_enqueue_kernel n) {
 
    struct starpu_task *task;
    running_kernel arg;
@@ -143,13 +143,15 @@ cl_int node_play_enqueue_kernel(node_enqueue_kernel n) {
    cl_command_queue cq = n->cq;
    cl_kernel        kernel = n->kernel;
    cl_uint          work_dim = n->work_dim;
-   const size_t *   global_work_offset = n->global_work_offset;
-   const size_t *   global_work_size = n->global_work_size;
-   const size_t *   local_work_size = n->local_work_size;
-   cl_uint          num_events = n->num_events;
-   const cl_event * events = n->events;
-   cl_event *       event = n->event;
+   size_t *	    global_work_offset = (size_t*)n->global_work_offset;
+   size_t *   	    global_work_size = (size_t*)n->global_work_size;
+   size_t *   	    local_work_size = (size_t*)n->local_work_size;
+   cl_uint          num_events = n->node.num_events;
+   const cl_event * events = n->node.events;
+   cl_event         event = n->node.event;
    char 	    is_task = n->is_task;
+   cl_int ndeps;
+   cl_event *deps;
 
 
    /* Allocate structures */
@@ -166,9 +168,15 @@ cl_int node_play_enqueue_kernel(node_enqueue_kernel n) {
       return CL_OUT_OF_HOST_MEMORY;
    }
 
-   /* StarPU task */
-   task = task_create(is_task ? CL_COMMAND_TASK : CL_COMMAND_NDRANGE_KERNEL);
-   ev = task_event(task);
+   /* StarPU task */
+   if (event != NULL) {
+      task = task_create_with_event(is_task ? CL_COMMAND_TASK : CL_COMMAND_NDRANGE_KERNEL, event);
+   }
+   else {
+      task = task_create(is_task ? CL_COMMAND_TASK : CL_COMMAND_NDRANGE_KERNEL);
+   }
+   ev = task_event(task);
 
    /*******************
     * Initializations *
@@ -190,23 +198,9 @@ cl_int node_play_enqueue_kernel(node_enqueue_kernel n) {
    arg->work_dim = work_dim;
    arg->codelet = codelet;
 
-   /* Global work offset */
-   if (global_work_offset != NULL) {
-      arg->global_work_offset = (size_t*)malloc(sizeof(size_t)*work_dim);
-      memcpy(arg->global_work_offset, global_work_offset, work_dim*sizeof(size_t));
-   }
-   else arg->global_work_offset = NULL;
-
-   /* Global work size */
-   arg->global_work_size = (size_t*)malloc(sizeof(size_t)*work_dim);
-   memcpy(arg->global_work_size, global_work_size, work_dim*sizeof(size_t));
-
-   /* Local work size */
-   if (local_work_size != NULL) {
-      arg->local_work_size = (size_t*)malloc(sizeof(size_t)*work_dim);
-      memcpy(arg->local_work_size, local_work_size, work_dim*sizeof(size_t));
-   }
-   else arg->local_work_size = NULL;
+   arg->global_work_offset = memdup_safe(global_work_offset, sizeof(size_t)*work_dim);
+   arg->global_work_size = memdup_safe(global_work_size, sizeof(size_t)*work_dim);
+   arg->local_work_size = memdup_safe(local_work_size, sizeof(size_t)*work_dim);
 
    /* ----------- *
     * StarPU task *
@@ -256,37 +250,22 @@ cl_int node_play_enqueue_kernel(node_enqueue_kernel n) {
    }
 
    /* Copy arguments as kernel args can be modified by the time we launch the kernel */
-   {
-      arg->arg_count = kernel->arg_count;
-      arg->arg_size = malloc(sizeof(size_t) * kernel->arg_count);
-      memcpy(arg->arg_size, kernel->arg_size, sizeof(size_t) * kernel->arg_count);
-      arg->arg_type = malloc(sizeof(enum kernel_arg_type) * kernel->arg_count);
-      memcpy(arg->arg_type, kernel->arg_type, sizeof(enum kernel_arg_type) * kernel->arg_count);
-      arg->arg_value = malloc(sizeof(void*) * kernel->arg_count);
-      unsigned int i;
-      for (i=0; i<kernel->arg_count; i++) {
-         if (kernel->arg_value[i] != NULL) {
-           arg->arg_value[i] = malloc(arg->arg_size[i]);
-           memcpy(arg->arg_value[i], kernel->arg_value[i], arg->arg_size[i]);
-         }
-         else arg->arg_value[i] = NULL;
-      }
-   }
+   arg->arg_count = kernel->arg_count;
+   arg->arg_size = memdup(kernel->arg_size, sizeof(size_t) * kernel->arg_count);
+   arg->arg_type = memdup(kernel->arg_type, sizeof(enum kernel_arg_type) * kernel->arg_count);
+   arg->arg_value = memdup_deep_varsize_safe(kernel->arg_value, kernel->arg_count, kernel->arg_size);
 
    DEBUG_MSG("Submitting NDRange task (event %d)\n", ev->id);
 
-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events, events);
+   command_queue_enqueue(cq, task_event(task), 0, num_events, events, &ndeps, &deps);
+
+   task_submit(task, ndeps, deps);
 
    /* Enqueue a cleaning task */
    starpu_task * cleaning_task = task_create_cpu(0, cleaning_task_callback, arg,1);
-   cl_event cleaning_event = task_event(cleaning_task);
-   command_queue_enqueue(cq, cleaning_task, 0, 1, &ev);
-
-   gc_entity_release(cleaning_event);
+   task_submit(cleaning_task, 1, &ev);
   
-   RETURN_EVENT(ev, event);
-
-   return ret;
+   return CL_SUCCESS;
 }
 
 /**
@@ -306,12 +285,16 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
 	node_enqueue_kernel n;
 
 	n = graph_create_enqueue_kernel(0, cq, kernel, work_dim, global_work_offset, global_work_size,
-		local_work_size, num_events, events, event, kernel->arg_count, kernel->arg_size,
+		local_work_size, num_events, events, kernel->arg_count, kernel->arg_size,
 		kernel->arg_type, kernel->arg_value);
 	
 	//FIXME: temporarily, we execute the node directly. In the future, we will postpone this.
-	node_play_enqueue_kernel(n);
+	graph_play_enqueue_kernel(n);
 
 	//graph_store(n);
+
+	/* Return the event before freeing the node: n must not be used after graph_free */
+	RETURN_OR_RELEASE_EVENT(n->node.event, event);
+	graph_free(n);
+
 	return CL_SUCCESS;
 }
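
The commented-out graph_store() call above marks the future postponed path: instead of playing the node immediately, it would be recorded in the graph and replayed later. A hypothetical sketch of that flow, assuming graph_store() keeps the node alive until it is played:

	size_t gws[1] = { 1024 };
	node_enqueue_kernel n = graph_create_enqueue_kernel(0, cq, kernel, 1,
		NULL, gws, NULL, num_events, events,
		kernel->arg_count, kernel->arg_size, kernel->arg_type, kernel->arg_value);
	graph_store(n);                  /* record the node instead of playing it */

	/* ... later, when the stored graph is executed ... */
	graph_play_enqueue_kernel(n);    /* creates and submits the StarPU task */
	graph_free(n);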

+ 8 - 3
socl/src/cl_enqueuereadbuffer.c

@@ -81,6 +81,9 @@ soclEnqueueReadBuffer(cl_command_queue  cq,
    struct arg_readbuffer *arg;
    cl_event ev;
 
+   cl_int ndeps;
+   cl_event *deps;
+
    task = task_create(CL_COMMAND_READ_BUFFER);
    ev = task_event(task);
 
@@ -101,9 +104,11 @@ soclEnqueueReadBuffer(cl_command_queue  cq,
 
    DEBUG_MSG("Submitting EnqueueRWBuffer task (event %d)\n", ev->id);
 
-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events, events);
+   command_queue_enqueue(cq, task_event(task), 0, num_events, events, &ndeps, &deps);
+
+   task_submit(task, ndeps, deps);
 
-   RETURN_EVENT(ev, event);
+   RETURN_OR_RELEASE_EVENT(ev, event);
 
-   return ret;
+   return CL_SUCCESS;
 }

+ 6 - 2
socl/src/cl_enqueuetask.c

@@ -34,12 +34,16 @@ soclEnqueueTask(cl_command_queue cq,
 	node_enqueue_kernel n;
 
 	n = graph_create_enqueue_kernel(1, cq, kernel, work_dim, global_work_offset, global_work_size,
-		local_work_size, num_events, events, event, kernel->arg_count, kernel->arg_size,
+		local_work_size, num_events, events, kernel->arg_count, kernel->arg_size,
 		kernel->arg_type, kernel->arg_value);
 	
 	//FIXME: temporarily, we execute the node directly. In the future, we will postpone this.
-	node_play_enqueue_kernel(n);
+	graph_play_enqueue_kernel(n);
 
 	//graph_store(n);
+
+	/* Return the event before freeing the node: n must not be used after graph_free */
+	RETURN_OR_RELEASE_EVENT(n->node.event, event);
+	graph_free(n);
+
 	return CL_SUCCESS;
 }

+ 8 - 3
socl/src/cl_enqueueunmapmemobject.c

@@ -27,6 +27,8 @@ soclEnqueueUnmapMemObject(cl_command_queue cq,
    struct starpu_task *task;
    cl_int err;
    cl_event ev;
+   cl_int ndeps;
+   cl_event *deps;
 
    /* Create StarPU task */
    task = task_create_cpu(CL_COMMAND_UNMAP_MEM_OBJECT, (void(*)(void*))starpu_data_release, memobj->handle, 0);
@@ -34,9 +36,12 @@ soclEnqueueUnmapMemObject(cl_command_queue cq,
 
    DEBUG_MSG("Submitting UnmapBuffer task (event %d)\n", task->tag_id);
 
-   err = command_queue_enqueue(cq, task, 0, num_events, events);
+   command_queue_enqueue(cq, task_event(task), 0, num_events, events, &ndeps, &deps);
 
-   RETURN_EVENT(ev, event);
+   task_submit(task, ndeps, deps);
 
-   return err;
+   RETURN_OR_RELEASE_EVENT(ev, event);
+
+   return CL_SUCCESS;
 }

+ 10 - 4
socl/src/cl_enqueuewaitforevents.c

@@ -22,10 +22,16 @@ soclEnqueueWaitForEvents(cl_command_queue cq,
                        const cl_event * events) CL_API_SUFFIX__VERSION_1_0
 {
 
-   //CL_COMMAND_MARKER has been chosen as CL_COMMAND_WAIT_FOR_EVENTS doesn't exist
-   starpu_task * task = task_create(CL_COMMAND_MARKER);
+	cl_int ndeps;
+	cl_event *deps;
 
-   command_queue_enqueue(cq, task, 0, num_events, events);
+	//CL_COMMAND_MARKER has been chosen as CL_COMMAND_WAIT_FOR_EVENTS doesn't exist
+	starpu_task * task = task_create(CL_COMMAND_MARKER);
 
-   return CL_SUCCESS;
+	DEBUG_MSG("Submitting WAIT_FOR_EVENTS task (event %d)\n", task->tag_id);
+	command_queue_enqueue(cq, task_event(task), 1, num_events, events, &ndeps, &deps);
+
+	task_submit(task, ndeps, deps);
+
+	return CL_SUCCESS;
 }

+ 8 - 3
socl/src/cl_enqueuewritebuffer.c

@@ -83,6 +83,9 @@ soclEnqueueWriteBuffer(cl_command_queue cq,
    struct arg_writebuffer *arg;
    cl_event ev;
 
+   cl_int ndeps;
+   cl_event *deps;
+
    task = task_create(CL_COMMAND_WRITE_BUFFER);
    ev = task_event(task);
 
@@ -110,10 +113,12 @@ soclEnqueueWriteBuffer(cl_command_queue cq,
 
    DEBUG_MSG("Submitting EnqueueRWBuffer task (event %d)\n", ev->id);
 
-   cl_int ret = command_queue_enqueue(cq, task, 0, num_events,events);
+   command_queue_enqueue(cq, task_event(task), 0, num_events, events, &ndeps, &deps);
+
+   task_submit(task, ndeps, deps);
 
    /* Return retained event if required by user */
-   RETURN_EVENT(ev,event);
+   RETURN_OR_RELEASE_EVENT(ev,event);
 
-   return ret;
+   return CL_SUCCESS;
 }

+ 1 - 1
socl/src/cl_finish.c

@@ -19,7 +19,7 @@
 CL_API_ENTRY cl_int CL_API_CALL
 soclFinish(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0
 {
-   cl_event ev = enqueueBarrier(cq);
+   cl_event ev = command_queue_barrier(cq);
    soclWaitForEvents(1, &ev);
    gc_entity_release(ev);
 

+ 142 - 55
socl/src/command_queue.c

@@ -24,74 +24,161 @@
  * its command queue.
  */
 
+
 /**
- * Enqueue the given task but put fake_event into the command queue.
- * This is used when a tag notified by application is used (cf clEnqueueMapBuffer, etc.)
+ * Return the implicit dependencies of a task
+ * Command queue must be locked!
  */
-cl_int command_queue_enqueue_fakeevent(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events, cl_event fake_event) {
-
-  int in_order = !(cq->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
-
-  /* Set explicit task dependencies */
-  task_dependency_add(task, num_events, events);
-
-  /* Lock command queue */
-  pthread_spin_lock(&cq->spin);
-
-  /* Add dependency to last barrier if applicable */
-  if (cq->barrier != NULL)
-    task_dependency_add(task, 1, &cq->barrier);
-
-  /* Add dependencies to out-of-order events (if any) */
-  if (barrier) {
-    while (cq->events != NULL) {
-      task_dependency_add(task, 1, &cq->events);
-      cq->events = cq->events->next;
-    }
-  }
-
-  cl_event ev = (fake_event == NULL ? task_event(task) : fake_event);
+void command_queue_dependencies_implicit(
+	cl_command_queue cq, 	/* Command queue */
+	char is_barrier,	/* Is the task a barrier */
+	cl_int * ret_num_events,	/* Returned number of dependencies */
+	cl_event ** ret_events	/* Returned dependencies */
+) {
+
+	/*********************
+	 * Count dependencies
+	 *********************/
+	int ndeps = 0;
+
+	/* Add dependency to last barrier if applicable */
+	if (cq->barrier != NULL)
+		ndeps++;
+
+	/* Add dependencies to out-of-order events (if any) */
+	if (is_barrier) {
+		cl_event ev = cq->events;
+		while (ev != NULL) {
+			ndeps++;
+			ev = ev->next;
+		}
+	}
+
+	/*********************
+	 * Return dependencies
+	 *********************/
+
+	cl_event * evs = malloc(ndeps * sizeof(cl_event));
+	int n = 0;
+
+	/* Add dependency to last barrier if applicable */
+	if (cq->barrier != NULL)
+		evs[n++] = cq->barrier;
+
+	/* Add dependencies to out-of-order events (if any) */
+	if (is_barrier) {
+		cl_event ev = cq->events;
+		while (ev != NULL) {
+			evs[n++] = ev;
+			ev = ev->next;
+		}
+	}
+
+	*ret_num_events = ndeps;
+	*ret_events = evs;
+}
+	
+/**
+ * Insert a task in the command queue
+ * The command queue must be locked!
+ */
+void command_queue_insert(
+	cl_command_queue cq, 	/* Command queue */
+	cl_event task_event,	/* Event for the task */
+	char is_barrier		/* Is the task a barrier */
+) {
+
+	int in_order = !(cq->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
+
+	/*********************
+	 * Insert event
+	 *********************/
+
+	if (is_barrier)
+		cq->events = NULL;
+
+	/* Add event to the list of out-of-order events */
+	if (!in_order) {
+		task_event->next = cq->events;
+		task_event->prev = NULL;
+		if (cq->events != NULL)
+			cq->events->prev = task_event;
+		cq->events = task_event;
+	}
+
+	/* Register this event as last barrier */
+	if (is_barrier || in_order)
+		cq->barrier = task_event;
+
+	/* Add reference to the command queue */
+	gc_entity_store(&task_event->cq, cq);
+}
 
-  /* Add event to the list of out-of-order events */
-  if (!in_order) {
-    ev->next = cq->events;
-    ev->prev = NULL;
-    if (cq->events != NULL)
-      cq->events->prev = ev;
-    cq->events = ev;
-  }
+/**
+ * Return implicit and explicit dependencies for a task
+ * The command queue must be locked!
+ */
+void command_queue_dependencies(
+	cl_command_queue cq,	/* Command queue */
+	char is_barrier,	/* Is the task a barrier */
+	cl_int num_events,	/* Number of explicit dependencies */
+	const cl_event * events,	/* Explicit dependencies */
+	cl_int * ret_num_events,	/* Returned number of dependencies */
+	cl_event ** ret_events	/* Returned dependencies */
+) {
+	cl_int implicit_num_events;
+	cl_event * implicit_events;
+
+	/* Implicit dependencies */
+	command_queue_dependencies_implicit(cq, is_barrier, &implicit_num_events, &implicit_events);
+
+	/* Explicit dependencies */
+	cl_int ndeps = implicit_num_events + num_events;
+	cl_event * evs = malloc(sizeof(cl_event) * ndeps);
+	memcpy(evs, implicit_events, sizeof(cl_event) * implicit_num_events);
+	memcpy(&evs[implicit_num_events], events, sizeof(cl_event) * num_events);
+	free(implicit_events);
+
+	*ret_num_events = ndeps;
+	*ret_events = evs;
+}
 
-  /* Register this event as last barrier */
-  if (barrier || in_order)
-    cq->barrier = ev;
+/**
+ * Enqueue the given task and put ev into the command queue.
+ */
+void command_queue_enqueue(
+	cl_command_queue cq, 		/* Command queue */
+	cl_event ev,			/* Event triggered on task completion */
+	cl_int is_barrier,			/* True if the task acts as a barrier */
+	cl_int num_events,		/* Number of dependencies */
+	const cl_event * events,	/* Dependencies */
+	cl_int * ret_num_events,	/* Returned number of events */
+	cl_event ** ret_events		/* Returned events */
+	) {
 
-   /* Unlock command queue */
-   pthread_spin_unlock(&cq->spin);
+	/* Lock command queue */
+	pthread_spin_lock(&cq->spin);
 
-   /* Add reference to the command queue */
-   gc_entity_store(&ev->cq, cq);
+	command_queue_dependencies(cq, is_barrier, num_events, events, ret_num_events, ret_events);
 
-   /* Submit task */
-   gc_entity_retain(task_event(task));
-   int ret = starpu_task_submit(task);
-   if (ret != 0)
-      DEBUG_ERROR("Unable to submit a task. Error %d\n", ret);
+	command_queue_insert(cq, ev, is_barrier);
 
-   return CL_SUCCESS;
+	/* Unlock command queue */
+	pthread_spin_unlock(&cq->spin);
 }
 
-cl_int command_queue_enqueue(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events) {
-  return command_queue_enqueue_fakeevent(cq, task, barrier, num_events, events, NULL);
-}
 
+cl_event command_queue_barrier(cl_command_queue cq) {
+
+	cl_int ndeps;
+	cl_event *deps;
 
-cl_event enqueueBarrier(cl_command_queue cq) {
+	//CL_COMMAND_MARKER has been chosen as CL_COMMAND_BARRIER doesn't exist
+	starpu_task * task = task_create(CL_COMMAND_MARKER);
 
-   //CL_COMMAND_MARKER has been chosen as CL_COMMAND_BARRIER doesn't exist
-   starpu_task * task = task_create(CL_COMMAND_MARKER);
+	DEBUG_MSG("Submitting barrier task (event %d)\n", task->tag_id);
+	command_queue_enqueue(cq, task_event(task), 1, 0, NULL, &ndeps, &deps);
 
-   DEBUG_MSG("Submitting barrier task (event %d)\n", task->tag_id);
-   command_queue_enqueue(cq, task, 1, 0, NULL);
+	task_submit(task, ndeps, deps);
 
-   return task_event(task);
+	return task_event(task);
 }
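
A usage sketch for the refactored helpers (hypothetical caller, not part of this commit): on an in-order queue, each enqueue registers its event as cq->barrier, so the next call gets it back as an implicit dependency:

	cl_int ndeps;
	cl_event *deps;

	starpu_task *t1 = task_create(CL_COMMAND_MARKER);
	command_queue_enqueue(cq, task_event(t1), 0, 0, NULL, &ndeps, &deps);
	task_submit(t1, ndeps, deps);   /* empty queue: ndeps == 0 */

	starpu_task *t2 = task_create(CL_COMMAND_MARKER);
	command_queue_enqueue(cq, task_event(t2), 0, 0, NULL, &ndeps, &deps);
	task_submit(t2, ndeps, deps);   /* deps == { task_event(t1) } via cq->barrier */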

+ 10 - 4
socl/src/command_queue.h

@@ -17,10 +17,16 @@
 #ifndef SOCL_COMMAND_QUEUE_H
 #define SOCl_COMMAND_QUEUE_H
 
-cl_int command_queue_enqueue(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events);
+void command_queue_enqueue(
+	cl_command_queue cq, 		/* Command queue */
+	cl_event ev,			/* Event triggered on task completion */
+	cl_int is_barrier,			/* True if the task acts as a barrier */
+	cl_int num_events,		/* Number of dependencies */
+	const cl_event * events,	/* Dependencies */
+	cl_int * ret_num_events,	/* Returned number of events */
+	cl_event ** ret_events		/* Returned events */
+	);
 
-cl_int command_queue_enqueue_fakeevent(cl_command_queue cq, starpu_task *task, cl_int barrier, cl_int num_events, const cl_event * events, cl_event fake_event);
-
-cl_event enqueueBarrier(cl_command_queue cq);
+cl_event command_queue_barrier(cl_command_queue cq);
 
 #endif /* SOCl_COMMAND_QUEUE_H */

+ 14 - 22
socl/src/graph.c

@@ -42,6 +42,7 @@ void graph_destroy(void) {
 void graph_node_init(graph_node node) {
 	node->id = -1;
 	node->next = NULL;
+	node->event = event_create();
 }
 
 /**
@@ -57,20 +58,18 @@ void graph_store(void * node) {
 	pthread_spin_unlock(&graph_lock);
 }
 
-
-
 /**
- * Duplicate a memory area into a fresh allocated buffer
+ * Free a node
  */
-static void * memdupa(const void *p, size_t size) {
-	void * s = malloc(size);
-	memcpy(s,p,size);
-	return s;
+void graph_free(void * node) {
+	free(node);
 }
 
-#define memdup(p, size) ((typeof(p))memdupa(p,size))
-#define nullOrDup(name,size) s->name = (name == NULL ? NULL : memdup(name,size))
+
+#define nullOrDup(name,size) s->name = memdup_safe(name,size)
+#define nodeNullOrDup(name,size) s->node.name = memdup_safe(name,size)
 #define dup(name) s->name = name
+#define nodeDup(name) s->node.name = name
 
 
 node_enqueue_kernel graph_create_enqueue_kernel(char is_task,
@@ -82,7 +81,6 @@ node_enqueue_kernel graph_create_enqueue_kernel(char is_task,
 		const size_t *   local_work_size,
 		cl_uint          num_events,
 		const cl_event * events,
-		cl_event *       event,
 		cl_uint 		num_args,
 		size_t *		arg_sizes,
 		enum kernel_arg_type * arg_types,
@@ -92,6 +90,9 @@ node_enqueue_kernel graph_create_enqueue_kernel(char is_task,
 	graph_node_init(&s->node);
 	s->node.id = NODE_ENQUEUE_KERNEL;
 
+	nodeDup(num_events);
+	nodeNullOrDup(events, num_events * sizeof(cl_event));
+
 	dup(is_task);
 	dup(cq);
 	dup(kernel);
@@ -99,25 +100,16 @@ node_enqueue_kernel graph_create_enqueue_kernel(char is_task,
 	nullOrDup(global_work_offset, work_dim*sizeof(size_t));
 	nullOrDup(global_work_size, work_dim*sizeof(size_t));
 	nullOrDup(local_work_size, work_dim*sizeof(size_t));
-	dup(num_events);
-	nullOrDup(events, num_events * sizeof(cl_event));
 	dup(num_args);
 	nullOrDup(arg_sizes, num_args * sizeof(size_t));
 	nullOrDup(arg_types, num_args * sizeof(enum kernel_arg_type));
 	nullOrDup(args, num_args * sizeof(void*));
 
-	
-	if (event != NULL) {
-		*event = event_create();
-		s->event = event;
-	}
-	else {
-		s->event = NULL;
-	}
-
 	return s;
 }
 
 #undef nullOrDup
-#undef memdup
+#undef nodeNullOrDup
 #undef dup
+#undef nodeDup

+ 9 - 9
socl/src/graph.h

@@ -22,14 +22,18 @@
 typedef struct graph_node_t * graph_node;
 
 struct graph_node_t {
-	int id; /* Kind of node */
-	graph_node next; /* Linked-list of nodes... */
+	int 		id; 		/* Kind of node */
+	graph_node 	next; 		/* Linked-list of nodes... */
+	cl_uint 	num_events;	/* Number of dependencies */
+	cl_event * 	events;		/* Dependencies */
+	cl_event  	event;		/* Event for this node */
 };
 
 void graph_init(void);
 void graph_destroy(void);
 void graph_node_init(graph_node node);
 void graph_store(void * node);
+void graph_free(void * node);
 
 #define NODE_ENQUEUE_KERNEL 1
 
@@ -44,9 +48,6 @@ typedef struct node_enqueue_kernel_t {
 	const size_t *   global_work_offset;
 	const size_t *   global_work_size;
 	const size_t *   local_work_size;
-	cl_uint          num_events;
-	const cl_event * events;
-	cl_event * 	 event;
 	cl_uint 	 num_args;
 	size_t *	 arg_sizes;
 	enum kernel_arg_type * arg_types;
@@ -62,12 +63,11 @@ node_enqueue_kernel graph_create_enqueue_kernel(char is_task,
 		const size_t *   local_work_size,
 		cl_uint          num_events,
 		const cl_event * events,
-		cl_event *       event,
-		cl_uint 		num_args,
-		size_t *		arg_sizes,
+		cl_uint		 num_args,
+		size_t *	 arg_sizes,
 		enum kernel_arg_type * arg_types,
 		void **		args);
 
-cl_int node_play_enqueue_kernel(node_enqueue_kernel n);
+cl_int graph_play_enqueue_kernel(node_enqueue_kernel n);
 
 #endif /* SOCL_GRAPH_H */

+ 1 - 1
socl/src/socl.h

@@ -79,7 +79,7 @@ struct entity {
 
 struct _cl_platform_id {};
 
-#define RETURN_EVENT(ev, event) \
+#define RETURN_OR_RELEASE_EVENT(ev, event) \
    if (event != NULL) \
       *event = ev; \
    else\

+ 21 - 1
socl/src/task.c

@@ -45,10 +45,17 @@ static void task_release_callback(void *arg) {
  */
 starpu_task * task_create(cl_command_type type) {
    cl_event event;
-   struct starpu_task * task;
 
    /* Create event */
    event = event_create();
+
+   return task_create_with_event(type, event);
+}
+
+
+starpu_task * task_create_with_event(cl_command_type type, cl_event event) {
+   struct starpu_task * task;
+
    event->type = type;
 
    /* Create StarPU task */
@@ -78,6 +85,19 @@ void task_dependency_add(starpu_task * task, cl_uint num, const cl_event *events
    }
 }
 
+cl_int task_submit(starpu_task * task, cl_int num_events, cl_event * events) {
+
+	task_dependency_add(task, num_events, events);
+
+	/* Retain the event before submitting: the task may complete (and
+	 * release its event) before a retain placed after submission runs */
+	gc_entity_retain(task_event(task));
+	int ret = starpu_task_submit(task);
+	if (ret != 0)
+		DEBUG_ERROR("Unable to submit a task. Error %d\n", ret);
+
+	return CL_SUCCESS;
+}
+
 
 /*********************************
  * CPU task helper

+ 10 - 0
socl/src/task.h

@@ -20,8 +20,18 @@
 #include "socl.h"
 
 starpu_task * task_create(cl_command_type type);
+starpu_task * task_create_with_event(cl_command_type type, cl_event event);
 void task_dependency_add(starpu_task * task, cl_uint num, const cl_event *events);
 starpu_task * task_create_cpu(cl_command_type type, void (*callback)(void*), void *arg, int free_arg);
+
+/**
+ * Return the event associated with a task
+ */
 cl_event task_event(starpu_task *task);
 
+/**
+ * Submit "task" with "events" dependencies
+ */
+cl_int task_submit(starpu_task * task, cl_int num_events, cl_event * events);
+
 #endif /* SOCL_TASK_H */

+ 24 - 0
socl/src/util.c

@@ -24,3 +24,27 @@ int starpu_worker_get_range() {
 
    return oid;
 }
+
+void * memdupa(const void *p, size_t size) {
+	void * s = malloc(size);
+	memcpy(s,p,size);
+	return s;
+}
+
+void ** memdup_deep_safea(const void **p, unsigned n, size_t size) {
+	void ** s = (void**)malloc(sizeof(void*) * n);
+	unsigned i;
+	for (i=0; i<n; i++) {
+		s[i] = memdup_safe((void*)p[i], size);
+	}
+	return s;
+}
+
+void ** memdup_deep_varsize_safea(const void **p, unsigned n, size_t * size) {
+	void ** s = (void**)malloc(sizeof(void*) * n);
+	unsigned i;
+	for (i=0; i<n; i++) {
+		s[i] = memdup_safe((void*)p[i], size[i]);
+	}
+	return s;
+}

+ 24 - 0
socl/src/util.h

@@ -19,4 +19,28 @@
 
 int starpu_worker_get_range();
 
+/**
+ * Duplicate a memory area into a freshly allocated buffer.
+ * Consider using memdup or memdup_safe instead.
+ */
+void * memdupa(const void *p, size_t size);
+
+#define memdup(p, size) ((typeof(p))memdupa((const void*)(p), (size)))
+#define memdup_safe(p, size) ((p) == NULL ? NULL : memdup((p), (size)))
+
+/**
+ * Duplicate an array of pointers by performing a deep copy
+ */
+void ** memdup_deep_safea(const void **p, unsigned n, size_t size);
+
+#define memdup_deep_safe(p, n, size) ((typeof(p))memdup_deep_safea((const void **)(p), (n), (size)))
+
+/**
+ * Duplicate an array of pointers by performing a deep copy
+ * Sizes are different for each cell
+ */
+void ** memdup_deep_varsize_safea(const void **p, unsigned n, size_t * size);
+
+#define memdup_deep_varsize_safe(p, n, size) ((typeof(p))memdup_deep_varsize_safea((const void **)(p), (n), (size)))
+
 #endif /* SOCL_UTIL_H */
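
A small sketch of how the helpers above behave (GNU C, since memdup relies on the typeof extension; note the macros cast to typeof(p), so pass decayed pointers rather than arrays). Names here are illustrative:

	size_t size_buf[2] = { sizeof(int), sizeof(float) };
	size_t *arg_sizes = size_buf;               /* decay to size_t* for typeof */

	/* memdup_safe: NULL in, NULL out; otherwise a malloc'd copy */
	size_t *sizes_copy = memdup_safe(arg_sizes, 2 * sizeof(size_t));

	int a = 1; float b = 2.0f;
	void *arg_buf[2] = { &a, &b };
	void **args = arg_buf;                      /* decay to void** */

	/* Deep copy: cell i is duplicated with its own size sizes_copy[i] */
	void **args_copy = memdup_deep_varsize_safe(args, 2, sizes_copy);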