лет назад: 11 · 7aa2a1ddf3
--- a/doc/doxygen/chapters/05check_list_performance.doxy
+++ b/doc/doxygen/chapters/05check_list_performance.doxy
@@ -60,14 +60,14 @@ StarPU already does appropriate calls for the CUBLAS library.
 
				 If the kernel can be made to only use this local stream or other self-allocated
			
 
				 streams, i.e. the whole kernel submission can be made asynchronous, then
			
 
				 one should enable asynchronous execution of the kernel.  That means setting
			
 
				-the STARPU_CUDA_FLAG flag in cuda_flags[] in the codelet, and dropping the
			
 
				+the STARPU_CUDA_ASYNC flag in cuda_flags[] in the codelet, and dropping the
			
 
				 cudaStreamSynchronize() call at the end of the cuda_func function, so that it
			
 
				 returns immediately after having queued the kernel to the local stream. That way, StarPU will be
			
 
				 able to submit and complete data transfers while kernels are executing, instead of only at each
			
 
				 kernel submission. The kernel just has to make sure that StarPU can use the
			
 
				 local stream to synchronize with the kernel startup and completion.
			
 
				 
			
 
				-Using the STARPU_CUDA_FLAG flag also permits to enabled concurrent kernel
			
 
				+Using the STARPU_CUDA_ASYNC flag also makes it possible to enable concurrent kernel
			
 
				 execution, on cards which support it (Kepler and later, notably). This is
			
 
				 enabled by setting the STARPU_NWORKER_PER_CUDA environment variable to the
			
 
				 number of kernels to execute concurrently.  This is useful when kernels are
			
--- a/examples/pipeline/pipeline.c
+++ b/examples/pipeline/pipeline.c
@@ -101,6 +101,7 @@ void pipeline_cublas_axpy(void *descr[], void *arg)
 
				 	int n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				 
			
 
				 	cublasSaxpy(n, 1., x, 1, y, 1);
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -685,11 +685,16 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 
				 		int err;
			
 
				 		cl_command_queue queue;
			
 
				 		starpu_opencl_get_queue(args->devid, &queue);
			
 
				-#ifdef HAVE_CLENQUEUEMARKERWITHWAITLIST
			
 
				-		err = clEnqueueMarkerWithWaitList(queue, 0, NULL, &task_events[args->devid]);
			
 
				-#else
			
 
				+		/* The function clEnqueueMarker is deprecated since
			
 
				+		 * OpenCL version 1.2. We would like to use the new
			
 
				+		 * function clEnqueueMarkerWithWaitList. We could do
			
 
				+		 * it by checking its availability through our own
			
 
				+		 * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
			
 
				+		 * and the OpenCL macro CL_VERSION_1_2. However these
			
 
				+		 * 2 macros detect the function availability in the
			
 
				+		 * ICD and not in the device implementation, so we keep using clEnqueueMarker.
			
 
				+		 */
			
 
				 		err = clEnqueueMarker(queue, &task_events[args->devid]);
			
 
				-#endif
			
 
				 		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 	}
			
 
				 	else