| 
					
				 | 
			
			
				@@ -50,7 +50,7 @@ static cl_command_queue in_transfer_queues[STARPU_MAXOPENCLDEVS]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 static cl_command_queue out_transfer_queues[STARPU_MAXOPENCLDEVS]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 static cl_command_queue peer_transfer_queues[STARPU_MAXOPENCLDEVS]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 static cl_command_queue alloc_queues[STARPU_MAXOPENCLDEVS]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-static cl_event task_events[STARPU_MAXOPENCLDEVS]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+static cl_event task_events[STARPU_MAXOPENCLDEVS][STARPU_MAX_PIPELINE]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 #endif 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 void 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -597,6 +597,8 @@ int _starpu_opencl_driver_init(struct _starpu_worker *worker) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	snprintf(worker->name, sizeof(worker->name), "OpenCL %u (%s %.1f GiB)", devid, devname, size); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	snprintf(worker->short_name, sizeof(worker->short_name), "OpenCL %u", devid); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	worker->pipeline_length = starpu_get_env_number_default("STARPU_OPENCL_PIPELINE", 2); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, worker->bindid); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	_STARPU_TRACE_WORKER_INIT_END(worker->workerid); 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -619,16 +621,17 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	struct starpu_task *task; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 #ifndef STARPU_SIMGRID 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	task = starpu_task_get_current(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	if (task) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	if (worker->ntasks) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		cl_int status; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		size_t size; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		int err; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		/* On-going asynchronous task, check for its termination first */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		err = clGetEventInfo(task_events[worker->devid], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, &size); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		task = worker->current_tasks[worker->first_task]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		err = clGetEventInfo(task_events[worker->devid][worker->first_task], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, &size); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		STARPU_ASSERT(size == sizeof(cl_int)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -640,16 +643,38 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 			return 0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		task_events[worker->devid][worker->first_task] = 0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		/* Asynchronous task completed! */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		_STARPU_TRACE_END_EXECUTING(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), worker); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		/* See next task if any */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		if (worker->ntasks) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			task = worker->current_tasks[worker->first_task]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			j = _starpu_get_job_associated_to_task(task); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				/* An asynchronous task, it was already queued, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				 * it's now running, record its start time.  */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, starpu_profiling_status_get()); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			else 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				/* A synchronous task, we have finished flushing the pipeline, we can now at last execute it.  */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				_STARPU_TRACE_END_PROGRESS(memnode); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				_STARPU_TRACE_EVENT("sync_task"); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				_starpu_opencl_execute_job(task, worker); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				_STARPU_TRACE_EVENT("end_sync_task"); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				_STARPU_TRACE_START_PROGRESS(memnode); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				worker->pipeline_stuck = 0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		_STARPU_TRACE_END_EXECUTING(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 #endif /* STARPU_SIMGRID */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	__starpu_datawizard_progress(memnode, 1, 1); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	_STARPU_TRACE_END_PROGRESS(memnode); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	task = _starpu_get_worker_task(worker, workerid, memnode); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	if (task == NULL) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -665,8 +690,20 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		return 0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	_starpu_opencl_execute_job(task, worker); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	worker->current_tasks[(worker->first_task  + worker->ntasks)%STARPU_MAX_PIPELINE] = task; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	worker->ntasks++; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	if (worker->ntasks > 1 && !(task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		/* We have to execute a non-asynchronous task but we 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		 * still have tasks in the pipeline...  Record it to 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		 * prevent more tasks from coming, and do it later */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		worker->pipeline_stuck = 1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		return 0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	_STARPU_TRACE_END_PROGRESS(memnode); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	_starpu_opencl_execute_job(task, worker); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	_STARPU_TRACE_START_PROGRESS(memnode); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	return 0; 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -772,7 +809,6 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	STARPU_ASSERT(cl); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	_starpu_set_current_task(j->task); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	worker->current_task = j->task; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	ret = _starpu_fetch_task_input(j); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	if (ret != 0) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -783,7 +819,11 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		return -EAGAIN; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	if (worker->ntasks == 1) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		/* We are alone in the pipeline, the kernel will start now, record it */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined"); 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -821,7 +861,9 @@ static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	int profiling = starpu_profiling_status_get(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	_starpu_set_current_task(NULL); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	worker->current_task = NULL; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	worker->current_tasks[worker->first_task] = NULL; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	worker->ntasks--; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0, profiling); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -870,7 +912,7 @@ static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		 * 2 macros detect the function availability in the 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		 * ICD and not in the device implementation. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		 */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		err = clEnqueueMarker(queue, &task_events[worker->devid]); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		err = clEnqueueMarker(queue, &task_events[worker->devid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE]); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		_STARPU_TRACE_START_EXECUTING(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	} 
			 |