Browse Source

SOCL: enhanced entity management

* SOCL does not require StarPU CPU tasks anymore. CPU workers are automatically disabled to enhance performance of OpenCL CPU devices
* In debug mode, entities that have not been released are listed on program termination
Sylvain Henry 12 years ago
parent
commit
75ae9dba3b

+ 1 - 1
socl/src/cl_createbuffer.c

@@ -85,7 +85,7 @@ soclCreateBuffer(cl_context   context,
 
 
    //Alloc cl_mem structure
-   mem = (cl_mem)gc_entity_alloc(sizeof(struct _cl_mem), release_callback_memobject);
+   mem = (cl_mem)gc_entity_alloc(sizeof(struct _cl_mem), release_callback_memobject, "buffer");
    if (mem == NULL) {
       if (errcode_ret != NULL)
          *errcode_ret = CL_OUT_OF_HOST_MEMORY;

+ 1 - 1
socl/src/cl_createcommandqueue.c

@@ -44,7 +44,7 @@ soclCreateCommandQueue(cl_context                   context,
    cl_command_queue cq;
 
    cq = (cl_command_queue)gc_entity_alloc(sizeof(struct _cl_command_queue),
-                                          release_callback_command_queue);
+                                          release_callback_command_queue, "command_queue");
    if (cq == NULL) {
       if (errcode_ret != NULL)
          *errcode_ret = CL_OUT_OF_HOST_MEMORY;

+ 1 - 1
socl/src/cl_createcontext.c

@@ -77,7 +77,7 @@ soclCreateContext(const cl_context_properties * properties,
 
 
    cl_context ctx;
-   ctx = (cl_context)gc_entity_alloc(sizeof(struct _cl_context), release_callback_context);
+   ctx = (cl_context)gc_entity_alloc(sizeof(struct _cl_context), release_callback_context, "context");
    if (ctx == NULL) {
       if (errcode_ret != NULL)
          *errcode_ret = CL_OUT_OF_HOST_MEMORY;

+ 1 - 3
socl/src/cl_createkernel.c

@@ -68,9 +68,7 @@ static void release_callback_kernel(void * e) {
   for (i=0; i<kernel->num_args; i++) {
     switch (kernel->arg_type[i]) {
       case Null:
-        break;
       case Buffer:
-        gc_entity_unstore((cl_mem*)&kernel->arg_value[i]);
         break;
       case Immediate:
         free(kernel->arg_value[i]);
@@ -119,7 +117,7 @@ soclCreateKernel(cl_program    program,
    //TODO: check programs (see opencl specs)
 
    /* Create Kernel structure */
-   k = (cl_kernel)gc_entity_alloc(sizeof(struct _cl_kernel), release_callback_kernel);
+   k = (cl_kernel)gc_entity_alloc(sizeof(struct _cl_kernel), release_callback_kernel, "kernel");
    if (k == NULL) {
       if (errcode_ret != NULL)
          *errcode_ret = CL_OUT_OF_HOST_MEMORY;

+ 1 - 1
socl/src/cl_createprogramwithsource.c

@@ -83,7 +83,7 @@ soclCreateProgramWithSource(cl_context      context,
    }
 
    /* Alloc cl_program structure */
-   p = (cl_program)gc_entity_alloc(sizeof(struct _cl_program), release_callback_program);
+   p = (cl_program)gc_entity_alloc(sizeof(struct _cl_program), release_callback_program, "program");
    if (p == NULL) {
       if (errcode_ret != NULL)
          *errcode_ret = CL_OUT_OF_HOST_MEMORY;

+ 10 - 4
socl/src/cl_enqueuecopybuffer.c

@@ -31,6 +31,8 @@ static void soclEnqueueCopyBuffer_opencl_task(void *descr[], void *args) {
    clEnqueueCopyBuffer(cq, src,dst, cmd->src_offset, cmd->dst_offset, cmd->cb, 0, NULL, &ev);
    clWaitForEvents(1, &ev);
    clReleaseEvent(ev);
+
+   gc_entity_release_cmd(cmd);
 }
 
 static void soclEnqueueCopyBuffer_cpu_task(void *descr[], void *args) {
@@ -40,6 +42,8 @@ static void soclEnqueueCopyBuffer_cpu_task(void *descr[], void *args) {
    void * dst = (void*)STARPU_VARIABLE_GET_PTR(descr[1]);
 
    memcpy(dst+cmd->dst_offset, src+cmd->src_offset, cmd->cb);
+
+   gc_entity_release_cmd(cmd);
 }
 
 static struct starpu_perfmodel copy_buffer_perfmodel = {
@@ -65,12 +69,12 @@ cl_int command_copy_buffer_submit(command_copy_buffer cmd) {
 	task->cl = &codelet_copybuffer;
 
 	/* Execute the task on a specific worker? */
-	if (cmd->_command.cq->device != NULL) {
+	if (cmd->_command.event->cq->device != NULL) {
 	  task->execute_on_a_specific_worker = 1;
-	  task->workerid = cmd->_command.cq->device->worker_id;
+	  task->workerid = cmd->_command.event->cq->device->worker_id;
 	}
 
-	task->cl_arg = cmd;
+	gc_entity_store_cmd(&task->cl_arg, cmd);
 	task->cl_arg_size = sizeof(*cmd);
 
 	cmd->dst_buffer->scratch = 0;
@@ -94,9 +98,11 @@ soclEnqueueCopyBuffer(cl_command_queue  cq,
 {
 	command_copy_buffer cmd = command_copy_buffer_create(src_buffer, dst_buffer, src_offset, dst_offset, cb);
 
+   cl_event ev = command_event_get(cmd);
+
 	command_queue_enqueue(cq, cmd, num_events, events);
 
-	RETURN_EVENT(cmd, event);
+	RETURN_EVENT(ev, event);
 
 	return CL_SUCCESS;
 }

+ 8 - 14
socl/src/cl_enqueuemapbuffer.c

@@ -16,19 +16,12 @@
 
 #include "socl.h"
 
-static void mapbuffer_callback(void *args) {
-	command_map_buffer cmd = (command_map_buffer)args;
-
-	starpu_tag_notify_from_apps(cmd->event->id);
-	cmd->event->status = CL_COMPLETE;
-}
-
 static void mapbuffer_task(void *args) {
 	command_map_buffer cmd = (command_map_buffer)args;
 
 	enum starpu_access_mode mode = (cmd->map_flags == CL_MAP_READ ? STARPU_R : STARPU_RW);
 
-	starpu_data_acquire_cb(cmd->buffer->handle, mode, mapbuffer_callback, cmd);
+	starpu_data_acquire_cb(cmd->buffer->handle, mode, command_completed_task_callback, cmd);
 }
 
 cl_int command_map_buffer_submit(command_map_buffer cmd) {
@@ -36,7 +29,9 @@ cl_int command_map_buffer_submit(command_map_buffer cmd) {
 		.name = "SOCL_MAP_BUFFER"
 	};
 
-	cpu_task_submit(cmd, mapbuffer_task, cmd, 0, &codelet, 0, NULL);
+   gc_entity_retain(cmd);
+
+	cpu_task_submit(cmd, mapbuffer_task, cmd, 0, 0, &codelet, 0, NULL);
 
 	return CL_SUCCESS;
 }
@@ -53,18 +48,17 @@ soclEnqueueMapBuffer(cl_command_queue cq,
                    cl_event *       event,
                    cl_int *         errcode_ret) CL_API_SUFFIX__VERSION_1_0
 {
-	cl_event ev = event_create();
 
-	command_map_buffer cmd = command_map_buffer_create(buffer, map_flags, offset, cb, ev);
+	command_map_buffer cmd = command_map_buffer_create(buffer, map_flags, offset, cb);
+
+   cl_event ev = command_event_get(cmd);
 
 	command_queue_enqueue(cq, cmd, num_events, events);
 
 	if (errcode_ret != NULL)
 		*errcode_ret = CL_SUCCESS;
 
-	RETURN_CUSTOM_EVENT(ev,event);
+	MAY_BLOCK_THEN_RETURN_EVENT(ev,blocking,event);
 
-	MAY_BLOCK_CUSTOM(blocking,ev);
-	
 	return (void*)(starpu_variable_get_local_ptr(buffer->handle) + offset);
 }

+ 3 - 1
socl/src/cl_enqueuemarker.c

@@ -25,9 +25,11 @@ soclEnqueueMarker(cl_command_queue  cq,
 	
 	command_marker cmd = command_marker_create();
 
+   cl_event ev = command_event_get(cmd);
+
 	command_queue_enqueue(cq, cmd, 0, NULL);
 
-	RETURN_EVENT(cmd, event);
+	RETURN_EVENT(ev, event);
 
 	return CL_SUCCESS;
 }

+ 5 - 46
socl/src/cl_enqueuendrangekernel.c

@@ -80,40 +80,6 @@ void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
    }
 }
 
-static void cleaning_task_callback(void *args) {
-	command_ndrange_kernel cmd = (command_ndrange_kernel)args;
-
-	free(cmd->arg_sizes);
-	free(cmd->arg_types);
-
-	unsigned int i;
-	for (i=0; i<cmd->num_args; i++) {
-		free(cmd->args[i]);
-	}
-	free(cmd->args);
-
-	for (i=0; i<cmd->num_buffers; i++)
-		gc_entity_unstore(&cmd->buffers[i]);
-
-	free(cmd->buffers);
-
-	if (cmd->global_work_offset != NULL) {
-	  free((void*)cmd->global_work_offset);
-	  cmd->global_work_offset = NULL;
-	}
-
-	if (cmd->global_work_size != NULL) {
-	  free((void*)cmd->global_work_size);
-	  cmd->global_work_size = NULL;
-	}
-
-	if (cmd->local_work_size != NULL) {
-	  free((void*)cmd->local_work_size);
-	  cmd->local_work_size = NULL;
-	}
-
-}
-
 /**
  * Real kernel enqueuing command
  */
@@ -126,9 +92,9 @@ cl_int command_ndrange_kernel_submit(command_ndrange_kernel cmd) {
 	task->cl_arg_size = sizeof(cmd);
 
 	/* Execute the task on a specific worker? */
-	if (cmd->_command.cq->device != NULL) {
+	if (cmd->_command.event->cq->device != NULL) {
 	  task->execute_on_a_specific_worker = 1;
-	  task->workerid = cmd->_command.cq->device->worker_id;
+	  task->workerid = cmd->_command.event->cq->device->worker_id;
 	}
 
 	struct starpu_codelet * codelet = task->cl;
@@ -172,15 +138,6 @@ cl_int command_ndrange_kernel_submit(command_ndrange_kernel cmd) {
 
 	task_submit(task, cmd);
 
-	/* Enqueue a cleaning task */
-	//FIXME: execute this in the callback?
-	cl_event ev = command_event_get(cmd);
-
-	static struct starpu_codelet cdl = {
-		.name = "SOCL_NDRANGE_CLEANING_TASK"
-	};
-	cpu_task_submit(cmd, cleaning_task_callback, cmd, 0, &cdl, 1, &ev);
-
 	return CL_SUCCESS;
 }
 
@@ -199,9 +156,11 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
 	command_ndrange_kernel cmd = command_ndrange_kernel_create(kernel, work_dim,
 			global_work_offset, global_work_size, local_work_size);
 
+   cl_event ev = command_event_get(cmd);
+
 	command_queue_enqueue(cq, cmd, num_events, events);
 
-	RETURN_EVENT(cmd, event);
+	RETURN_EVENT(ev, event);
 
 	return CL_SUCCESS;
 }

+ 9 - 6
socl/src/cl_enqueuereadbuffer.c

@@ -26,6 +26,8 @@ static void soclEnqueueReadBuffer_cpu_task(void *descr[], void *args) {
    //They should use buffer mapping facilities instead.
    if (ptr+cmd->offset != cmd->ptr)
       memcpy(cmd->ptr, ptr+cmd->offset, cmd->cb);
+
+   gc_entity_release_cmd(cmd);
 }
 
 static void soclEnqueueReadBuffer_opencl_task(void *descr[], void *args) {
@@ -47,6 +49,7 @@ static void soclEnqueueReadBuffer_opencl_task(void *descr[], void *args) {
    clWaitForEvents(1, &ev);
    clReleaseEvent(ev);
 
+   gc_entity_release_cmd(cmd);
 }
 
 static struct starpu_perfmodel read_buffer_perfmodel = {
@@ -71,12 +74,12 @@ cl_int command_read_buffer_submit(command_read_buffer cmd) {
 	task->cl = &codelet_readbuffer;
 
 	/* Execute the task on a specific worker? */
-	if (cmd->_command.cq->device != NULL) {
+	if (cmd->_command.event->cq->device != NULL) {
 	  task->execute_on_a_specific_worker = 1;
-	  task->workerid = cmd->_command.cq->device->worker_id;
+	  task->workerid = cmd->_command.event->cq->device->worker_id;
 	}
 
-	task->cl_arg = cmd;
+	gc_entity_store_cmd(&task->cl_arg, cmd);
 	task->cl_arg_size = sizeof(*cmd);
 
 	task_submit(task, cmd);
@@ -99,11 +102,11 @@ soclEnqueueReadBuffer(cl_command_queue  cq,
 
 	command_read_buffer cmd = command_read_buffer_create(buffer, offset, cb, ptr);
 
-	command_queue_enqueue(cq, cmd, num_events, events);
+   cl_event ev = command_event_get(cmd);
 
-	RETURN_EVENT(cmd, event);
+	command_queue_enqueue(cq, cmd, num_events, events);
 
-	MAY_BLOCK(blocking);
+	MAY_BLOCK_THEN_RETURN_EVENT(ev, blocking, event);
 
 	return CL_SUCCESS;
 }

+ 3 - 1
socl/src/cl_enqueuetask.c

@@ -24,10 +24,12 @@ soclEnqueueTask(cl_command_queue cq,
               cl_event *        event) CL_API_SUFFIX__VERSION_1_0
 {
 	command_ndrange_kernel cmd = command_task_create(kernel);
+
+   cl_event ev = command_event_get(cmd);
 	
 	command_queue_enqueue(cq, cmd, num_events, events);
 
-	RETURN_EVENT(cmd, event);
+	RETURN_EVENT(ev, event);
 
 	return CL_SUCCESS;
 }

+ 4 - 3
socl/src/cl_enqueueunmapmemobject.c

@@ -20,11 +20,10 @@ cl_int command_unmap_mem_object_submit(command_unmap_mem_object cmd) {
 	/* Aliases */
 	cl_mem buffer = cmd->buffer;
 
-	//FIXME: use a callback
 	static struct starpu_codelet codelet = {
 		.name = "SOCL_UNMAP_MEM_OBJECT"
 	};
-	cpu_task_submit(cmd, (void(*)(void*))starpu_data_release, buffer->handle, 0, &codelet, 0, NULL);
+	cpu_task_submit(cmd, (void(*)(void*))starpu_data_release, buffer->handle, 0, 1, &codelet, 0, NULL);
 
 	return CL_SUCCESS;
 }
@@ -39,9 +38,11 @@ soclEnqueueUnmapMemObject(cl_command_queue cq,
 {
 	command_unmap_mem_object cmd = command_unmap_mem_object_create(buffer, ptr);
 
+   cl_event ev = command_event_get(cmd);
+
 	command_queue_enqueue(cq, cmd, num_events, events);
 
-	RETURN_EVENT(cmd, event);
+	RETURN_EVENT(ev, event);
 
 	return CL_SUCCESS;
 }

+ 10 - 6
socl/src/cl_enqueuewritebuffer.c

@@ -29,6 +29,8 @@ static void soclEnqueueWriteBuffer_cpu_task(void *descr[], void *args) {
    // Maybe we should report the bug here... for now, we just avoid memcpy crash due to overlapping regions...
    if (ptr+cmd->offset != cmd->ptr)
       memcpy(ptr+cmd->offset, cmd->ptr, cmd->cb);
+
+   gc_entity_release_cmd(cmd);
 }
 
 static void soclEnqueueWriteBuffer_opencl_task(void *descr[], void *args) {
@@ -50,6 +52,8 @@ static void soclEnqueueWriteBuffer_opencl_task(void *descr[], void *args) {
 
    clWaitForEvents(1, &ev);
    clReleaseEvent(ev);
+
+   gc_entity_release_cmd(cmd);
 }
 
 static struct starpu_perfmodel write_buffer_perfmodel = {
@@ -91,13 +95,13 @@ cl_int command_write_buffer_submit(command_write_buffer cmd) {
 	else 
 		task->cl = &codelet_writebuffer;
 
-	task->cl_arg = cmd;
+	gc_entity_store_cmd(&task->cl_arg, cmd);
 	task->cl_arg_size = sizeof(*cmd);
 
 	/* Execute the task on a specific worker? */
-	if (cmd->_command.cq->device != NULL) {
+	if (cmd->_command.event->cq->device != NULL) {
 	  task->execute_on_a_specific_worker = 1;
-	  task->workerid = cmd->_command.cq->device->worker_id;
+	  task->workerid = cmd->_command.event->cq->device->worker_id;
 	}
 
 	//The buffer now contains meaningful data
@@ -121,11 +125,11 @@ soclEnqueueWriteBuffer(cl_command_queue cq,
 { 
 	command_write_buffer cmd = command_write_buffer_create(buffer, offset, cb, ptr);
 
-	command_queue_enqueue(cq, cmd, num_events, events);
+   cl_event ev = command_event_get(cmd);
 
-	RETURN_EVENT(cmd, event);
+	command_queue_enqueue(cq, cmd, num_events, events);
 
-	MAY_BLOCK(blocking);
+	MAY_BLOCK_THEN_RETURN_EVENT(ev, blocking, event);
 
 	return CL_SUCCESS;
 }

+ 3 - 1
socl/src/cl_finish.c

@@ -21,9 +21,11 @@ soclFinish(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0 {
 
 	command_barrier cmd = command_barrier_create();
 
+   cl_event ev = command_event_get(cmd);
+
 	command_queue_enqueue(cq, cmd, 0, NULL);
 
-	MAY_BLOCK(CL_TRUE)
+	MAY_BLOCK_THEN_RETURN_EVENT(ev, CL_TRUE, (cl_event*)NULL);
 
 	return CL_SUCCESS;
 }

+ 2 - 3
socl/src/cl_setkernelarg.c

@@ -38,8 +38,7 @@ soclSetKernelArg(cl_kernel  kernel,
          break;
       case Buffer:
          kernel->arg_type[arg_index] = Null;
-         gc_entity_unstore((cl_mem*)kernel->arg_value[arg_index]);
-	 free(kernel->arg_value[arg_index]);
+         free(kernel->arg_value[arg_index]);
          kernel->arg_value[arg_index] = NULL;
          break;
       case Immediate:
@@ -62,7 +61,7 @@ soclSetKernelArg(cl_kernel  kernel,
          DEBUG_MSG("Found buffer %d \n", buf->id);
          kernel->arg_type[arg_index] = Buffer;
          kernel->arg_value[arg_index] = malloc(sizeof(void*));
-	 gc_entity_store((cl_mem*)kernel->arg_value[arg_index], buf);
+         *(cl_mem*)kernel->arg_value[arg_index] = buf; //We do not use gc_entity_store here because kernels do not hold reference on buffers (see OpenCL spec)
       }
       else {
          /* Argument must be an immediate buffer  */

+ 6 - 0
socl/src/cl_waitforevents.c

@@ -22,6 +22,12 @@ soclWaitForEvents(cl_uint           num_events,
 {
    unsigned int i;
    DEBUG_MSG("Waiting for events: ");
+   for (i=0; i<num_events; i++) {
+      char * sep = i == (num_events-1) ? "" : ", ";
+      DEBUG_MSG_NOHEAD("%d%s", event_list[i]->id, sep);
+   }
+   DEBUG_MSG_NOHEAD("\n");
+
    for (i=0; i<num_events; i++)
       starpu_tag_wait(event_list[i]->id);
 

+ 56 - 13
socl/src/command.c

@@ -20,19 +20,61 @@
 /* Forward extern declaration */
 extern void soclEnqueueNDRangeKernel_task(void *descr[], void *args);
 
+cl_event command_event_get_ex(cl_command cmd) {
+   cl_event ev = cmd->event;
+   gc_entity_retain(ev);
+   return ev;
+}
+
+static void command_release_callback(void *a) {
+  cl_command cmd = (cl_command)a;
+
+  // Call command specific release callback
+  if (cmd->release_callback != NULL)
+     cmd->release_callback(cmd);
+
+  // Generic command destructor
+  cl_uint i;
+  for (i=0; i<cmd->num_events; i++) {
+     gc_entity_unstore(&cmd->events[i]);
+  }
+  cmd->num_events = 0;
+  free(cmd->events);
+
+  /* Remove from command queue */
+  cl_command_queue cq = cmd->event->cq;
+  if (cq != NULL) {
+    /* Lock command queue */
+    pthread_mutex_lock(&cq->mutex);
+
+    /* Remove barrier if applicable */
+    if (cq->barrier == cmd)
+      cq->barrier = NULL;
+
+    /* Remove from the list of out-of-order commands */
+    cq->commands = command_list_remove(cq->commands, cmd);
+
+    /* Unlock command queue */
+    pthread_mutex_unlock(&cq->mutex);
+  }
+
+  // Events may survive to commands that created them
+  cmd->event->command = NULL;
+  gc_entity_unstore(&cmd->event); 
+}
+
 void command_init_ex(cl_command cmd, cl_command_type typ, void (*cb)(void*)) {
-	gc_entity_init(&cmd->_entity, cb);
+	gc_entity_init(&cmd->_entity, command_release_callback, "command");
+   cmd->release_callback = cb;
 	cmd->typ = typ;
 	cmd->num_events = 0;
 	cmd->events = NULL;
-	cmd->event = event_create();
+	cmd->event = event_create(); // we do not use gc_entity_store here because if nobody requires the event, it should be destroyed with the command
 	cmd->event->command = cmd;
-	cmd->cq = NULL;
 	cmd->task = NULL;
 	cmd->submitted = 0;
 }
 
-
 void command_submit_ex(cl_command cmd) {
 #define SUBMIT(typ,name) case typ:\
 	name##_submit((name)cmd);\
@@ -115,9 +157,16 @@ void command_ndrange_kernel_release(void * arg) {
 	free(cmd->arg_sizes);
 	free(cmd->arg_types);
 	unsigned int i;
-	for (i=0; i<cmd->num_args; i++)
+	for (i=0; i<cmd->num_args; i++) {
 		free(cmd->args[i]);
+      cmd->args[i] = NULL;
+   }
 	free(cmd->args);
+
+	for (i=0; i<cmd->num_buffers; i++)
+		gc_entity_unstore(&cmd->buffers[i]);
+
+	free(cmd->buffers);
 }
 
 command_ndrange_kernel command_ndrange_kernel_create (
@@ -185,21 +234,16 @@ command_marker command_marker_create () {
 	return cmd;
 }
 
-void command_map_buffer_release(void * arg) {
-	command_map_buffer cmd = (command_map_buffer)arg;
-
+void command_map_buffer_release(void * UNUSED(arg)) {
 	/* We DO NOT unstore (release) the buffer as unmap will do it
 	  gc_entity_unstore(&cmd->buffer); */
-
-	gc_entity_unstore(&cmd->event);
 }
 
 command_map_buffer command_map_buffer_create(
 		cl_mem buffer,
 		cl_map_flags map_flags,
 		size_t offset,
-		size_t cb,
-		cl_event event
+		size_t cb
 		) {
 
 	command_map_buffer cmd = malloc(sizeof(struct command_map_buffer_t));
@@ -209,7 +253,6 @@ command_map_buffer command_map_buffer_create(
 	dup(map_flags);
 	dup(offset);
 	dup(cb);
-	gc_entity_store(&cmd->event, event);
 
 	return cmd;
 }

+ 17 - 9
socl/src/command.h

@@ -21,6 +21,9 @@
 
 typedef struct cl_command_t * cl_command;
 
+#define gc_entity_store_cmd(dest,cmd) gc_entity_store(dest, &cmd->_command)
+#define gc_entity_release_cmd(cmd) gc_entity_release(&cmd->_command)
+
 /**
  * Initialize a command structure
  *
@@ -31,6 +34,8 @@ void command_init_ex(cl_command cmd, cl_command_type typ, void (*cb)(void*));
 #define command_init(cmd,typ,cb) \
 	command_init_ex((cl_command)cmd,typ,cb)
 
+void command_release(cl_command cmd);
+
 /** Submit a command for execution */
 void command_submit_ex(cl_command cmd);
 #define command_submit(cmd) \
@@ -52,17 +57,22 @@ struct cl_command_t {
 	cl_uint 	num_events;	/* Number of dependencies */
 	cl_event * 	events;		/* Dependencies */
 	cl_event  	event;		/* Event for this command */
-	cl_command_queue cq;		/* Command queue the command is enqueued in */
 	starpu_task	task;		/* Associated StarPU task, if any */
 	char		submitted;	/* True if the command has been submitted to StarPU */
+   void (*release_callback)(void*); /* Command specific destructor */
 };
 
 #define command_type_get(cmd) (((cl_command)cmd)->typ)
-#define command_event_get(cmd) (((cl_command)cmd)->event)
-#define command_num_events_get(cmd) (((cl_command)cmd)->num_events)
-#define command_events_get(cmd) (((cl_command)cmd)->events)
-#define command_task_get(cmd) (((cl_command)cmd)->task)
-#define command_cq_get(cmd) (((cl_command)cmd)->cq)
+
+cl_event command_event_get_ex(cl_command cmd);
+#define command_event_get(cmd) command_event_get_ex(&cmd->_command)
+
+#define command_num_events_get_ex(cmd) (cmd->num_events)
+#define command_num_events_get(cmd) ((cmd)->_command.num_events)
+#define command_events_get_ex(cmd) ((cmd)->events)
+#define command_events_get(cmd) ((cmd)->_command.events)
+#define command_task_get(cmd) ((cmd)->_command.task)
+#define command_cq_get(cmd) ((cmd)->_command.cq)
 
 #define CL_COMMAND struct cl_command_t _command;
 
@@ -122,7 +132,6 @@ typedef struct command_map_buffer_t {
 	cl_map_flags map_flags;
 	size_t offset;
 	size_t cb;
-	cl_event event;
 } * command_map_buffer;
 
 
@@ -163,8 +172,7 @@ command_map_buffer command_map_buffer_create(
 		cl_mem buffer,
 		cl_map_flags map_flags,
 		size_t offset,
-		size_t cb,
-		cl_event event);
+		size_t cb);
 
 command_unmap_mem_object command_unmap_mem_object_create(
 		cl_mem buffer,

+ 11 - 9
socl/src/command_queue.c

@@ -33,7 +33,6 @@ void command_queue_enqueue_ex(cl_command_queue cq, cl_command cmd, cl_uint num_e
 	int is_barrier = (cmd->typ == CL_COMMAND_BARRIER || !(cq->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE));
 
 	/* Add references to the command queue */
-	gc_entity_store(&cmd->cq, cq);
 	gc_entity_store(&cmd->event->cq, cq);
 
 	/* Lock command queue */
@@ -61,13 +60,14 @@ void command_queue_enqueue_ex(cl_command_queue cq, cl_command cmd, cl_uint num_e
 	int n = 0;
 
 	/* Add dependency to last barrier if applicable */
-	if (cq->barrier != NULL) deps[n++] = cq->barrier->event;
+	if (cq->barrier != NULL) 
+      gc_entity_store(&deps[n++], cq->barrier->event);
 
 	/* Add dependencies to out-of-order events (if any) */
 	if (is_barrier) {
 		command_list cl = cq->commands;
 		while (cl != NULL) {
-			deps[n++] = cl->cmd->event;
+			gc_entity_store(&deps[n++], cl->cmd->event);
 			cl = cl->next;
 		}
 	}
@@ -75,16 +75,13 @@ void command_queue_enqueue_ex(cl_command_queue cq, cl_command cmd, cl_uint num_e
 	/* Add explicit dependencies */
 	unsigned i;
 	for (i=0; i<num_events; i++) {
-		deps[n++] = events[i];
+		gc_entity_store(&deps[n++], events[i]);
 	}
 
 	/* Make all dependencies explicit for the command */
 	cmd->num_events = ndeps;
 	cmd->events = deps;
 
-	/* Increment event ref count */
-	gc_entity_retain(cmd->event);
-
 	/* Insert command in the queue */
 	if (is_barrier) {
 		/* Remove out-of-order commands */
@@ -97,9 +94,14 @@ void command_queue_enqueue_ex(cl_command_queue cq, cl_command cmd, cl_uint num_e
 		cq->commands = command_list_cons(cmd, cq->commands);
 	}
 
+	/* Submit command 
+    * We need to do it before unlocking because we don't want events to get
+    * released while we use them to set dependencies
+    */
+   command_submit_ex(cmd);
+
 	/* Unlock command queue */
 	pthread_mutex_unlock(&cq->mutex);
 
-	/* Submit command */
-	command_submit_ex(cmd);
+   gc_entity_release(cmd);
 }

+ 2 - 25
socl/src/event.c

@@ -33,7 +33,7 @@ int event_unique_id() {
  */
 cl_event event_create(void) {
    cl_event ev;
-   ev = gc_entity_alloc(sizeof(struct _cl_event), release_callback_event);
+   ev = gc_entity_alloc(sizeof(struct _cl_event), release_callback_event, "event");
 
    ev->id = event_unique_id();
    ev->status = CL_SUBMITTED;
@@ -47,30 +47,7 @@ cl_event event_create(void) {
 static void release_callback_event(void * e) {
   cl_event event = (cl_event)e;
 
-  cl_command_queue cq = event->cq;
-  cl_command cmd = event->command;
-
-  /* Remove from command queue */
-  if (cq != NULL) {
-    /* Lock command queue */
-    pthread_mutex_lock(&cq->mutex);
-
-    /* Remove barrier if applicable */
-    if (cq->barrier == event->command)
-      cq->barrier = NULL;
-
-    /* Remove from the list of out-of-order commands */
-    cq->commands = command_list_remove(cq->commands, cmd);
-
-    /* Unlock command queue */
-    pthread_mutex_unlock(&cq->mutex);
-
-    gc_entity_unstore(&cq);
-  }
-
-  free(cmd->events);
-  cmd->events = NULL;
-  cmd->num_events = 0;
+  gc_entity_unstore(&event->cq);
 
   /* Destruct object */
   //FIXME

+ 31 - 7
socl/src/gc.c

@@ -106,15 +106,19 @@ void gc_stop(void) {
   pthread_join(gc_thread, NULL);
 }
 
-int gc_entity_release_ex(entity e) {
+int gc_entity_release_ex(entity e, const char * caller) {
 
   /* Decrement reference count */
   int refs = __sync_sub_and_fetch(&e->refs, 1);
 
+  DEBUG_MSG("[%s] Decrementing refcount of %s %p to %d\n", caller, e->name, e, refs);
+
+  assert(refs >= 0);
+
   if (refs != 0)
     return 0;
 
-  DEBUG_MSG("Releasing entity %p\n", e);
+  DEBUG_MSG("[%s] Releasing %s %p\n", caller, e->name, e);
 
   GC_LOCK;
 
@@ -140,13 +144,17 @@ int gc_entity_release_ex(entity e) {
 /**
  * Initialize entity
  */
-void gc_entity_init(void *arg, void (*release_callback)(void*)) {
+void gc_entity_init(void *arg, void (*release_callback)(void*), char * name) {
+
+  DEBUG_MSG("Initializing entity %p (%s)\n", arg, name);
+
   struct entity * e = (entity)arg;
 
   e->dispatch = &socl_master_dispatch;
   e->refs = 1;
   e->release_callback = release_callback;
   e->prev = NULL;
+  e->name = name;
 
   GC_LOCK;
 
@@ -161,17 +169,19 @@ void gc_entity_init(void *arg, void (*release_callback)(void*)) {
 /**
  * Allocate and initialize entity
  */
-void * gc_entity_alloc(unsigned int size, void (*release_callback)(void*)) {
+void * gc_entity_alloc(unsigned int size, void (*release_callback)(void*), char * name) {
   void * e = malloc(size);
-  gc_entity_init(e, release_callback);
+  gc_entity_init(e, release_callback, name);
   return e;
 }
 
 /** Retain entity */
-void gc_entity_retain(void *arg) {
+void gc_entity_retain_ex(void *arg, const char * caller) {
 	struct entity * e = (entity)arg;
 
-	__sync_fetch_and_add(&e->refs, 1);
+	int refs = __sync_add_and_fetch(&e->refs, 1);
+
+   DEBUG_MSG("[%s] Incrementing refcount of %s %p to %d\n", caller, e->name, e, refs);
 }
 
 int gc_active_entity_count(void) {
@@ -186,6 +196,20 @@ int gc_active_entity_count(void) {
 	return i;
 }
 
+void gc_print_remaining_entities(void) {
+   DEBUG_MSG("Remaining entities:\n");
+
+   GC_LOCK;
+
+   entity e = entities;
+   while (e != NULL) {
+      DEBUG_MSG("  - %s %p\n", e->name, e);
+      e = e->next;
+   }
+
+   GC_UNLOCK;
+}
+
 #undef GC_LOCK
 #undef GC_UNLOCK
 #undef GC_UNLOCK_NO_SIGNAL

+ 12 - 7
socl/src/gc.h

@@ -22,21 +22,26 @@
 void gc_start(void);
 void gc_stop(void);
 
-void gc_entity_init(void *arg, void (*release_callback)(void*));
-void * gc_entity_alloc(unsigned int size, void (*release_callback)(void*));
-void gc_entity_retain(void *arg);
+void gc_entity_init(void *arg, void (*release_callback)(void*), char*name);
+
+void * gc_entity_alloc(unsigned int size, void (*release_callback)(void*), char * name);
+
+void gc_entity_retain_ex(void *arg, const char *);
+#define gc_entity_retain(arg) gc_entity_retain_ex(arg, __func__)
 
 /** Decrement reference counter and release entity if applicable */
-int gc_entity_release_ex(entity e);
+int gc_entity_release_ex(entity e, const char*);
 
 int gc_active_entity_count(void);
+void gc_print_remaining_entities(void);
 
-#define gc_entity_release(a) gc_entity_release_ex(&(a)->_entity)
+#define gc_entity_release(a) gc_entity_release_ex(&(a)->_entity, __func__)
 
 #define gc_entity_store(dest,e) \
   do {\
-    gc_entity_retain(e); \
-    *dest = e;\
+    void * _e = e;\
+    gc_entity_retain(_e); \
+    *dest = _e;\
   } while(0);
 
 #define gc_entity_unstore(dest) \

+ 4 - 6
socl/src/init.c

@@ -32,6 +32,7 @@ void socl_init_starpu(void) {
     struct starpu_conf conf;
     starpu_conf_init(&conf);
     conf.ncuda = 0;
+    conf.ncpus = 0;
 
 
     _starpu_init_failed = starpu_init(&conf);
@@ -40,11 +41,6 @@ void socl_init_starpu(void) {
        DEBUG_MSG("Error when calling starpu_init: %d\n", _starpu_init_failed);
     }
     else {
-       if (starpu_cpu_worker_get_count() == 0)
-       {
-	    DEBUG_MSG("StarPU did not find any CPU device. SOCL needs at least 1 CPU.\n");
-	    _starpu_init_failed = -ENODEV;
-       }
        if (starpu_opencl_worker_get_count() == 0)
        {
 	    DEBUG_MSG("StarPU didn't find any OpenCL device. Try disabling CUDA support in StarPU (export STARPU_NCUDA=0).\n");
@@ -88,8 +84,10 @@ void soclShutdown() {
 
       int active_entities = gc_active_entity_count();
 
-      if (active_entities != 0)
+      if (active_entities != 0) {
          DEBUG_MSG("Unreleased entities: %d\n", active_entities);
+         gc_print_remaining_entities();
+      }
 
       if( _starpu_init )
          starpu_shutdown();

+ 12 - 19
socl/src/socl.h

@@ -58,6 +58,9 @@ struct entity {
   /* Callback called on release */
   void (*release_callback)(void*entity);
 
+  /* Entity identifier (used for debugging purpose) */
+  char * name;
+
   /* Next entity in garbage collector queue */
   entity prev;
   entity next;
@@ -90,29 +93,19 @@ struct _cl_device_id {
    int worker_id;
 };
 
-#define RETURN_EVENT(cmd, event) \
-	if (event != NULL) { \
-		cl_event ev = command_event_get(cmd);\
-		gc_entity_retain(ev);\
+#define RETURN_EVENT(ev, event) \
+	if ((event) != NULL) { \
 		*event = ev; \
-	}
-
-#define RETURN_CUSTOM_EVENT(src, tgt) \
-	if (tgt != NULL) { \
-		gc_entity_retain(src); \
-		*tgt = src; \
-	}
+	} \
+   else {\
+      gc_entity_release(ev);\
+   }
 
-#define MAY_BLOCK(blocking) \
+#define MAY_BLOCK_THEN_RETURN_EVENT(ev,blocking,event) \
 	if ((blocking) == CL_TRUE) {\
-		cl_event ev = command_event_get(cmd);\
 		soclWaitForEvents(1, &ev);\
-	}
-
-#define MAY_BLOCK_CUSTOM(blocking,event) \
-	if ((blocking) == CL_TRUE) {\
-		soclWaitForEvents(1, &(event));\
-	}
+	}\
+   RETURN_EVENT(ev,event);\
 
 /* Constants */
 const char * SOCL_PROFILE;

+ 39 - 19
socl/src/task.c

@@ -18,14 +18,14 @@
 #include "gc.h"
 #include "event.h"
 
-static void task_release_callback(void *arg) {
-  cl_command cmd = (cl_command)arg;
+void command_completed(cl_command cmd) {
   starpu_task task = cmd->task;
   
-  cl_event ev = command_event_get(cmd);
+  cl_event ev = command_event_get_ex(cmd);
   ev->status = CL_COMPLETE;
 
   /* Trigger the tag associated to the command event */
+  DEBUG_MSG("Trigger event %d\n", ev->id);
   starpu_tag_notify_from_apps(ev->id);
 
   if (task->profiling_info != NULL && (intptr_t)task->profiling_info != -ENOSYS) {
@@ -34,12 +34,17 @@ static void task_release_callback(void *arg) {
   }
 
   gc_entity_release(ev);
-
-  /* Release the command */
-  //FIXME
-  //free(cmd);
 }
 
+void command_completed_task_callback(void *arg) {
+  cl_command cmd = (cl_command)arg;
+
+  command_completed(cmd);
+
+  /* Release the command stored task callback parameter */
+  gc_entity_release(cmd);
+
+}
 
 /*
  * Create a StarPU task
@@ -68,6 +73,7 @@ void task_depends_on(starpu_task task, cl_uint num_events, cl_event *events) {
 
     starpu_tag_t * tags = malloc(num_events * sizeof(starpu_tag_t));	
 
+    DEBUG_MSG("Task %p depends on events:", task);
     for (i=0; i<num_events; i++) {
        tags[i] = events[i]->id;
        DEBUG_MSG_NOHEAD(" %u", events[i]->id);
@@ -85,14 +91,17 @@ cl_int task_submit_ex(starpu_task task, cl_command cmd) {
   /* Associated the task to the command */
   cmd->task = task;
 
-  task_depends_on(task, command_num_events_get(cmd), command_events_get(cmd));
+  cl_uint num_events = command_num_events_get_ex(cmd);
+  cl_event * events = command_events_get_ex(cmd);
+
+  task_depends_on(task, num_events, events);
 
-  task->callback_func = task_release_callback;
-  task->callback_arg = cmd;
+  task->callback_func = command_completed_task_callback;
+  gc_entity_store(&task->callback_arg, cmd);
 
   /* Submit task */
   int ret = (task->cl != NULL && task->cl->where == STARPU_OPENCL ?
-        starpu_task_submit_to_ctx(task, cmd->cq->context->sched_ctx) :
+        starpu_task_submit_to_ctx(task, cmd->event->cq->context->sched_ctx) :
         starpu_task_submit(task));
 
   if (ret != 0)
@@ -110,41 +119,52 @@ struct cputask_arg {
   void (*callback)(void*);
   void * arg;
   int free_arg;
+  cl_command cmd;
+  int complete_cmd;
 };
 
-static void cputask_task(__attribute__((unused)) void *descr[], void *args) {
+static void cputask_task(void *args) {
   struct cputask_arg * arg = (struct cputask_arg*)args;
 
   arg->callback(arg->arg);
 
+  if (arg->complete_cmd)
+     command_completed(arg->cmd);
+
   if (arg->free_arg) {
     assert(arg->arg != NULL);
     free(arg->arg);
     arg->arg = NULL;
   }
 
+  gc_entity_unstore(&arg->cmd);
   free(arg);
 
 }
 
-void cpu_task_submit_ex(cl_command cmd, void (*callback)(void*), void *arg, int free_arg, struct starpu_codelet * codelet, unsigned num_events, cl_event * events) {
+void cpu_task_submit_ex(cl_command cmd, void (*callback)(void*), void *arg, int free_arg, int complete_cmd, struct starpu_codelet * codelet, unsigned num_events, cl_event * events) {
   
   struct cputask_arg * a = malloc(sizeof(struct cputask_arg));
   a->callback = callback;
   a->arg = arg;
   a->free_arg = free_arg;
+  gc_entity_store(&a->cmd, cmd);
+  a->complete_cmd = complete_cmd;
 
-  codelet->where = STARPU_CPU;
-  codelet->cpu_funcs[0] = &cputask_task;
+  codelet->where = STARPU_OPENCL | STARPU_CPU | STARPU_CUDA;
 
   starpu_task task = task_create();
-  task->cl = codelet;
-  task->cl_arg = a;
-
   if (num_events != 0) {
      task_depends_on(task, num_events, events);
   }
 
-  task_submit(task, cmd);
+  task->callback_func = cputask_task;
+  task->callback_arg = a;
+
+  cmd->task = task;
+
+  int ret = starpu_task_submit(task);
+  if (ret != 0)
+     DEBUG_ERROR("Unable to submit a task. Error %d\n", ret);
 }
 

+ 4 - 1
socl/src/task.h

@@ -21,6 +21,9 @@
 
 starpu_task task_create();
 void task_dependency_add(starpu_task task, cl_uint num_events, cl_event *events);
+void command_completed(cl_command cmd);
+
+void command_completed_task_callback(void *);
 
 /* Execute callback(arg) in a CPU task (with no buffer)
  * Associate this task to the command cmd (i.e. when this task completes, the command is completed)
@@ -28,7 +31,7 @@ void task_dependency_add(starpu_task task, cl_uint num_events, cl_event *events)
  * The codelet is used to give a fixed name to the task without allocating a
  * new codelet structure each time. This function will fill the other fields
  * as appropriate */
-void cpu_task_submit_ex(cl_command cmd, void (*callback)(void*), void *arg, int free_arg, struct starpu_codelet *, unsigned num_events, cl_event * events); 
+void cpu_task_submit_ex(cl_command cmd, void (*callback)(void*), void *arg, int free_arg, int release_cmd, struct starpu_codelet *, unsigned num_events, cl_event * events); 
 
 #define cpu_task_submit(cmd, args...) cpu_task_submit_ex((cl_command)cmd, args)