Olivier Aumage 11 éve
szülő
commit
7aa2a1ddf3

+ 2 - 2
doc/doxygen/chapters/05check_list_performance.doxy

@@ -60,14 +60,14 @@ StarPU already does appropriate calls for the CUBLAS library.
 If the kernel can be made to only use this local stream or other self-allocated
 streams, i.e. the whole kernel submission can be made asynchronous, then
 one should enable asynchronous execution of the kernel.  That means setting
-the STARPU_CUDA_FLAG flag in cuda_flags[] in the codelet, and dropping the
+the STARPU_CUDA_ASYNC flag in cuda_flags[] in the codelet, and dropping the
 cudaStreamSynchronize() call at the end of the cuda_func function, so that it
 returns immediately after having queued the kernel to the local stream. That way, StarPU will be
 able to submit and complete data transfers while kernels are executing, instead of only at each
 kernel submission. The kernel just has to make sure that StarPU can use the
 local stream to synchronize with the kernel startup and completion.
 
-Using the STARPU_CUDA_FLAG flag also permits to enabled concurrent kernel
+Using the STARPU_CUDA_ASYNC flag also makes it possible to enable concurrent kernel
 execution, on cards which support it (Kepler and later, notably). This is
 enabled by setting the STARPU_NWORKER_PER_CUDA environment variable to the
 number of kernels to execute concurrently.  This is useful when kernels are

+ 1 - 0
examples/pipeline/pipeline.c

@@ -101,6 +101,7 @@ void pipeline_cublas_axpy(void *descr[], void *arg)
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 
 	cublasSaxpy(n, 1., x, 1, y, 1);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 9 - 4
src/drivers/opencl/driver_opencl.c

@@ -685,11 +685,16 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		int err;
 		cl_command_queue queue;
 		starpu_opencl_get_queue(args->devid, &queue);
-#ifdef HAVE_CLENQUEUEMARKERWITHWAITLIST
-		err = clEnqueueMarkerWithWaitList(queue, 0, NULL, &task_events[args->devid]);
-#else
+		/* The function clEnqueueMarker is deprecated since
+		 * OpenCL version 1.2. We would like to use the newer
+		 * function clEnqueueMarkerWithWaitList, and could
+		 * check for it through our own configure macro
+		 * HAVE_CLENQUEUEMARKERWITHWAITLIST and the OpenCL
+		 * macro CL_VERSION_1_2. However, these two macros only
+		 * detect availability in the ICD, not in the device
+		 * implementation, so we keep using clEnqueueMarker.
+		 */
 		err = clEnqueueMarker(queue, &task_events[args->devid]);
-#endif
 		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 	else