4 роки тому · ca3df41ccc
--- a/doc/doxygen/chapters/210_check_list_performance.doxy
+++ b/doc/doxygen/chapters/210_check_list_performance.doxy
@@ -99,6 +99,21 @@ cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
				 as well as the use of \c cudaMemcpyAsync(), etc. for each CUDA operation one needs
			
 
				 to use a version that takes a stream parameter.
			
 
				 
			
 
				+If the kernel uses its own non-default stream, one can synchronize this stream
			
 
				+with the StarPU-provided stream this way:
			
 
				+
			
 
				+\code{.c}
			
 
				+cudaEvent_t event;
			
 
				+call_kernel_with_its_own_stream();
			
 
				+cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
			
 
				+cudaEventRecord(event, get_kernel_stream());
			
 
				+cudaStreamWaitEvent(starpu_cuda_get_local_stream(), event, 0);
			
 
				+cudaEventDestroy(event);
			
 
				+\endcode
			
 
				+
			
 
				+This code makes the StarPU-provided stream wait for a new event, which will be
			
 
				+triggered by the completion of the kernel.
			
 
				+
			
 
				 Unfortunately, some CUDA libraries do not have stream variants of
			
 
				 kernels. This will seriously lower the potential for overlapping.
			
 
				 If some CUDA calls are made without specifying this local stream,
			
@@ -129,21 +144,6 @@ able to submit and complete data transfers while kernels are executing, instead
 
				 kernel submission. The kernel just has to make sure that StarPU can use the
			
 
				 local stream to synchronize with the kernel startup and completion.
			
 
				 
			
 
				-If the kernel uses its own non-default stream, one can synchronize this stream
			
 
				-with the StarPU-provided stream this way:
			
 
				-
			
 
				-\code{.c}
			
 
				-cudaEvent_t event;
			
 
				-call_kernel_with_its_own_stream()
			
 
				-cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
			
 
				-cudaEventRecord(event, get_kernel_stream());
			
 
				-cudaStreamWaitEvent(starpu_cuda_get_local_stream(), event, 0);
			
 
				-cudaEventDestroy(event);
			
 
				-\endcode
			
 
				-
			
 
				-This code makes the StarPU-provided stream wait for a new event, which will be
			
 
				-triggered by the completion of the kernel.
			
 
				-
			
 
				 Using the flag ::STARPU_CUDA_ASYNC also makes it possible to enable concurrent kernel
			
 
				 execution, on cards which support it (Kepler and later, notably). This is
			
 
				 enabled by setting the environment variable \ref STARPU_NWORKER_PER_CUDA to the