Просмотр исходного кода

Trace async transfers differently to watch for cuda spurious blocking

Samuel Thibault лет назад: 13
Родитель
Сommit
209ebef2ac

+ 15 - 6
src/common/fxt.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -90,14 +90,17 @@
 #define	STARPU_FUT_START_DRIVER_COPY	0x5133
 #define	STARPU_FUT_END_DRIVER_COPY	0x5134
 
-#define	STARPU_FUT_START_PROGRESS	0x5135
-#define	STARPU_FUT_END_PROGRESS		0x5136
+#define	STARPU_FUT_START_DRIVER_COPY_ASYNC	0x5135
+#define	STARPU_FUT_END_DRIVER_COPY_ASYNC	0x5136
 
-#define STARPU_FUT_USER_EVENT		0x5137
+#define	STARPU_FUT_START_PROGRESS	0x5137
+#define	STARPU_FUT_END_PROGRESS		0x5138
 
-#define STARPU_FUT_SET_PROFILING	0x5138
+#define STARPU_FUT_USER_EVENT		0x5139
 
-#define STARPU_FUT_TASK_WAIT_FOR_ALL	0x5139
+#define STARPU_FUT_SET_PROFILING	0x513a
+
+#define STARPU_FUT_TASK_WAIT_FOR_ALL	0x513b
 
 #ifdef STARPU_USE_FXT
 #include <sys/syscall.h> /* pour les définitions de SYS_xxx */
@@ -277,6 +280,12 @@ do {										\
 #define STARPU_TRACE_END_DRIVER_COPY(src_node, dst_node, size, com_id)	\
 	FUT_DO_PROBE4(STARPU_FUT_END_DRIVER_COPY, src_node, dst_node, size, com_id)
 
+#define STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node)	\
+	FUT_DO_PROBE2(STARPU_FUT_START_DRIVER_COPY_ASYNC, src_node, dst_node)
+
+#define STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node)	\
+	FUT_DO_PROBE2(STARPU_FUT_END_DRIVER_COPY, src_node, dst_node)
+
 #define STARPU_TRACE_WORK_STEALING(empty_q, victim_q)		\
 	FUT_DO_PROBE2(STARPU_FUT_WORK_STEALING, empty_q, victim_q)
 

+ 7 - 7
src/datawizard/interfaces/bcsr_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -474,15 +474,15 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->nzval, (void *)dst_bcsr->nzval, nnz*r*c*elemsize, 0, NULL);
+	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->nzval, src_node, (void *)dst_bcsr->nzval, dst_node, nnz*r*c*elemsize, 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->colind, (void *)dst_bcsr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->colind, src_node, (void *)dst_bcsr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->rowptr, (void *)dst_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_bcsr->rowptr, src_node, (void *)dst_bcsr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -505,15 +505,15 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-	err = _starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->nzval, (cl_mem)dst_bcsr->nzval, nnz*r*c*elemsize, 0, NULL);
+	err = _starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->nzval, src_node, (cl_mem)dst_bcsr->nzval, dst_node, nnz*r*c*elemsize, 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->colind, (cl_mem)dst_bcsr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	err = _starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->colind, src_node, (cl_mem)dst_bcsr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->rowptr, (cl_mem)dst_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+	err = _starpu_opencl_copy_ram_to_opencl((void *)src_bcsr->rowptr, src_node, (cl_mem)dst_bcsr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
 		STARPU_OPENCL_REPORT_ERROR(err);
 

+ 12 - 6
src/datawizard/interfaces/block_interface.c

@@ -468,8 +468,10 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 		/* Is that a single contiguous buffer ? */
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
+			STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
 					nx*ny*nz*elemsize, kind, stream);
+			STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
@@ -486,9 +488,11 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 		}
 		else {
 			/* Are all plans contiguous */
+			STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
 					(char *)src_block->ptr, src_block->ldz*elemsize,
 					nx*ny*elemsize, nz, kind, stream);
+			STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
@@ -512,9 +516,11 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
 			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
 
+			STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
                                                   (char *)src_ptr, src_block->ldy*elemsize,
                                                   nx*elemsize, ny, kind, stream);
+			STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 
 			if (STARPU_UNLIKELY(cures))
 			{
@@ -597,7 +603,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 		/* Is that a single contiguous buffer ? */
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
-                        err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
+                        err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_block->ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
                                                                            src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
                                                                            dst_block->offset, (cl_event*)_event, &ret);
                         if (STARPU_UNLIKELY(err))
@@ -617,7 +623,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
                         unsigned j;
                         for(j=0 ; j<src_block->ny ; j++) {
                                 void *ptr = (void*)src_block->ptr+(layer*src_block->ldz*src_block->elemsize)+(j*src_block->ldy*src_block->elemsize);
-                                err = _starpu_opencl_copy_ram_to_opencl(ptr, (cl_mem)dst_block->dev_handle,
+                                err = _starpu_opencl_copy_ram_to_opencl(ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
                                                                         src_block->nx*src_block->elemsize,
                                                                         layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize
                                                                         + dst_block->offset, NULL);
@@ -636,7 +642,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
                         //                        size_t host_row_pitch=region[0];
                         //                        size_t host_slice_pitch=region[1] * host_row_pitch;
                         //
-                        //                        _starpu_opencl_copy_rect_ram_to_opencl((void *)src_block->ptr, (cl_mem)dst_block->dev_handle,
+                        //                        _starpu_opencl_copy_rect_ram_to_opencl((void *)src_block->ptr, src_node, (cl_mem)dst_block->dev_handle, dst_node,
                         //                                                               buffer_origin, host_origin, region,
                         //                                                               buffer_row_pitch, buffer_slice_pitch,
                         //                                                               host_row_pitch, host_slice_pitch, NULL);
@@ -661,7 +667,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 		/* Is that a single contiguous buffer ? */
 		if (((src_block->nx*src_block->ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
-                        err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
+                        err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_block->dev_handle, src_node, (void*)dst_block->ptr, dst_node,
                                                                            src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
                                                                            src_block->offset, (cl_event*)_event, &ret);
                         if (STARPU_UNLIKELY(err))
@@ -682,7 +688,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
                         unsigned j;
                         for(j=0 ; j<src_block->ny ; j++) {
                                 void *ptr = (void *)dst_block->ptr+(layer*dst_block->ldz*dst_block->elemsize)+(j*dst_block->ldy*dst_block->elemsize);
-                                err = _starpu_opencl_copy_opencl_to_ram((void*)src_block->dev_handle, ptr,
+                                err = _starpu_opencl_copy_opencl_to_ram((void*)src_block->dev_handle, src_node, ptr, dst_node,
                                                                         src_block->nx*src_block->elemsize,
                                                                         layer*src_block->ldz*src_block->elemsize+j*src_block->ldy*src_block->elemsize+
                                                                         src_block->offset, NULL);
@@ -695,7 +701,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
                         //                        size_t host_row_pitch=region[0];
                         //                        size_t host_slice_pitch=region[1] * host_row_pitch;
                         //
-                        //                        _starpu_opencl_copy_rect_opencl_to_ram((cl_mem)src_block->dev_handle, (void *)dst_block->ptr,
+                        //                        _starpu_opencl_copy_rect_opencl_to_ram((cl_mem)src_block->dev_handle, src_node, (void *)dst_block->ptr, dst_node,
                         //                                                               buffer_origin, host_origin, region,
                         //                                                               buffer_row_pitch, buffer_slice_pitch,
                         //                                                               host_row_pitch, host_slice_pitch, NULL);

+ 17 - 7
src/datawizard/interfaces/csr_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
@@ -427,10 +427,12 @@ static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_
 
 	int synchronous_fallback = 0;
 
+	STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 	cures = cudaMemcpyAsync((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind, stream);
 	if (cures)
 	{
 		synchronous_fallback = 1;
+		STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -444,6 +446,7 @@ static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_
 	if (synchronous_fallback || cures != cudaSuccess)
 	{
 		synchronous_fallback = 1;
+		STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -457,6 +460,7 @@ static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_
 	if (synchronous_fallback || cures != cudaSuccess)
 	{
 		synchronous_fallback = 1;
+		STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -468,6 +472,7 @@ static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_
 		return 0;
 	}
 	else {
+		STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		return -EAGAIN;
 	}
 }
@@ -526,10 +531,12 @@ static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, uns
 
 	int synchronous_fallback = 0;
 
+	STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 	cures = cudaMemcpyPeerAsync((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize, stream);
 	if (cures)
 	{
 		synchronous_fallback = 1;
+		STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -543,6 +550,7 @@ static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, uns
 	if (synchronous_fallback || cures != cudaSuccess)
 	{
 		synchronous_fallback = 1;
+		STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -556,6 +564,7 @@ static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, uns
 	if (synchronous_fallback || cures != cudaSuccess)
 	{
 		synchronous_fallback = 1;
+		STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -567,6 +576,7 @@ static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, uns
 		return 0;
 	}
 	else {
+		STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		return -EAGAIN;
 	}
 #else
@@ -626,15 +636,15 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-        err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->nzval, (void *)dst_csr->nzval, nnz*elemsize, 0, NULL);
+        err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->nzval, src_node, (void *)dst_csr->nzval, dst_node, nnz*elemsize, 0, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->colind, (void *)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->colind, src_node, (void *)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-        err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->rowptr, (void *)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+        err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->rowptr, src_node, (void *)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
@@ -654,15 +664,15 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTR
 
         int err;
 
-        err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->nzval, (cl_mem)dst_csr->nzval, nnz*elemsize, 0, NULL);
+        err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->nzval, src_node, (cl_mem)dst_csr->nzval, dst_node, nnz*elemsize, 0, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->colind, (cl_mem)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
+	err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->colind, src_node, (cl_mem)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), 0, NULL);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 
-        err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->rowptr, (cl_mem)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
+        err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->rowptr, src_node, (cl_mem)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), 0, NULL);
 	if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
 

+ 10 - 2
src/datawizard/interfaces/matrix_interface.c

@@ -385,7 +385,9 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 
 	if (is_async)
 	{
+		STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpy3DAsync(&p, stream);
+		STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		if (!cures)
 			return -EAGAIN;
 	}
@@ -397,9 +399,11 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 
 	if (is_async)
 	{
+		STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 			(char *)src_matrix->ptr, src_matrix->ld*elemsize,
 			src_matrix->nx*elemsize, src_matrix->ny, kind, stream);
+		STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		if (!cures)
 			return -EAGAIN;
 	}
@@ -466,7 +470,9 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 
 //	if (is_async)
 //	{
+//		STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 //		cures = cudaMemcpy3DPeerAsync(&p, stream);
+//		STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 //		if (!cures)
 //			return -EAGAIN;
 //	}
@@ -478,7 +484,9 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 
 	if (is_async)
 	{
+		STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeerAsync((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize, stream);
+		STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		if (!cures)
 			return -EAGAIN;
 	}
@@ -547,7 +555,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 	/* XXX non contiguous matrices are not supported with OpenCL yet ! (TODO) */
 	STARPU_ASSERT((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx));
 
-	err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_matrix->ptr, (cl_mem)dst_matrix->dev_handle,
+	err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_matrix->ptr, src_node, (cl_mem)dst_matrix->dev_handle, dst_node,
                                                            src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
                                                            dst_matrix->offset, (cl_event*)_event, &ret);
         if (STARPU_UNLIKELY(err))
@@ -567,7 +575,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 	/* XXX non contiguous matrices are not supported with OpenCL yet ! (TODO) */
 	STARPU_ASSERT((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx));
 
-        err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_matrix->dev_handle, (void*)dst_matrix->ptr,
+        err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_matrix->dev_handle, src_node, (void*)dst_matrix->ptr, dst_node,
                                                            src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
                                                            src_matrix->offset, (cl_event*)_event, &ret);
 

+ 6 - 2
src/datawizard/interfaces/variable_interface.c

@@ -346,7 +346,9 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 	starpu_variable_interface_t *dst_variable = dst_interface;
 
 	cudaError_t cures;
+	STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 	cures = cudaMemcpyAsync((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind, stream);
+	STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 	if (cures)
 	{
 		/* do it in a synchronous fashion */
@@ -393,7 +395,9 @@ static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					v
 		size_t length = src_variable->elemsize;
 
 		cudaError_t cures;
+		STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeerAsync((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, length, stream);
+		STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		if (cures)
 		{
 			/* sychronous fallback */
@@ -426,7 +430,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 	starpu_variable_interface_t *dst_variable = dst_interface;
         int err,ret;
 
-        err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_variable->ptr, (cl_mem)dst_variable->ptr, src_variable->elemsize,
+        err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_variable->ptr, src_node, (cl_mem)dst_variable->ptr, dst_node, src_variable->elemsize,
                                                            0, (cl_event*)_event, &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);
@@ -442,7 +446,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 	starpu_variable_interface_t *dst_variable = dst_interface;
         int err, ret;
 
-	err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_variable->ptr, (void*)dst_variable->ptr, src_variable->elemsize,
+	err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_variable->ptr, src_node, (void*)dst_variable->ptr, dst_node, src_variable->elemsize,
                                                            0, (cl_event*)_event, &ret);
 
         if (STARPU_UNLIKELY(err))

+ 6 - 2
src/datawizard/interfaces/vector_interface.c

@@ -356,9 +356,11 @@ static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
 
 	if (is_async)
 	{
+		STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 		cures = cudaMemcpyPeerAsync((char *)dst_vector->ptr, dst_dev,
 						(char *)src_vector->ptr, src_dev,
 						length, stream);
+		STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 		if (!cures)
 			return -EAGAIN;
 	}
@@ -413,7 +415,9 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 
 	cudaError_t cures;
 
+	STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
 	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, stream);
+	STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
 	if (cures)
 	{
 		/* do it in a synchronous fashion */
@@ -468,7 +472,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 	starpu_vector_interface_t *dst_vector = dst_interface;
         int err, ret;
 
-	err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_vector->ptr, (cl_mem)dst_vector->dev_handle,
+	err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_vector->ptr, src_node, (cl_mem)dst_vector->dev_handle, dst_node,
                                                            src_vector->nx*src_vector->elemsize,
                                                            dst_vector->offset, (cl_event*)_event, &ret);
         if (STARPU_UNLIKELY(err))
@@ -486,7 +490,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 	starpu_vector_interface_t *dst_vector = dst_interface;
         int err, ret;
 
-	err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_vector->dev_handle, (void*)dst_vector->ptr, src_vector->nx*src_vector->elemsize,
+	err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_vector->dev_handle, src_node, (void*)dst_vector->ptr, dst_node, src_vector->nx*src_vector->elemsize,
                                                            src_vector->offset, (cl_event *)_event, &ret);
         if (STARPU_UNLIKELY(err))
                 STARPU_OPENCL_REPORT_ERROR(err);

+ 33 - 0
src/debug/traces/starpu_fxt.c

@@ -572,6 +572,29 @@ static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 	}
 }
 
+static void handle_start_driver_copy_async(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	unsigned dst = ev->param[1];
+
+	char *prefix = options->file_prefix;
+
+	if (!options->no_bus)
+		if (out_paje_file)
+			fprintf(out_paje_file, "10       %f     MS      %sMEMNODE%u      CoA\n", get_event_time_stamp(ev, options), prefix, dst);
+
+}
+
+static void handle_end_driver_copy_async(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	unsigned dst = ev->param[1];
+
+	char *prefix = options->file_prefix;
+
+	if (!options->no_bus)
+		if (out_paje_file)
+			fprintf(out_paje_file, "10       %f     MS      %sMEMNODE%u      Co\n", get_event_time_stamp(ev, options), prefix, dst);
+}
+
 static void handle_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
 {
 	unsigned memnode = ev->param[0];
@@ -921,6 +944,16 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				handle_end_driver_copy(&ev, options);
 				break;
 
+			case STARPU_FUT_START_DRIVER_COPY_ASYNC:
+				if (!options->no_bus)
+				handle_start_driver_copy_async(&ev, options);
+				break;
+
+			case STARPU_FUT_END_DRIVER_COPY_ASYNC:
+				if (!options->no_bus)
+				handle_end_driver_copy_async(&ev, options);
+				break;
+
 			case STARPU_FUT_WORK_STEALING:
 				/* XXX */
 				break;

+ 1 - 0
src/debug/traces/starpu_paje.c

@@ -147,6 +147,7 @@ void starpu_fxt_write_paje_header(FILE *file)
 	6       Ar       MS      AllocatingReuse       \".1 .1 .8\"		\n \
 	6       R       MS      Reclaiming         \".0 .1 .4\"		\n \
 	6       Co       MS     DriverCopy         \".3 .5 .1\"		\n \
+	6       CoA      MS     DriverCopyAsync         \".1 .3 .1\"		\n \
 	6       No       MS     Nothing         \".0 .0 .0\"		\n \
 	5       MPIL     MPIP	P	P      MPIL\n \
 	5       L       P	Mn	Mn      L\n");

+ 31 - 7
src/drivers/opencl/driver_opencl.c

@@ -81,7 +81,6 @@ static void unlimit_gpu_mem_if_needed(int devid)
 
 size_t starpu_opencl_get_global_mem_size(int devid)
 {
-	cl_int err;
 	cl_ulong totalGlobalMem;
 
 	/* Request the size of the current device's memory */
@@ -190,14 +189,19 @@ cl_int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags fla
         return CL_SUCCESS;
 }
 
-cl_int _starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event, int *ret)
+cl_int _starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
 {
         cl_int err;
         struct starpu_worker_s *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+
+        if (event)
+                STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
+        if (event)
+                STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (STARPU_LIKELY(err == CL_SUCCESS)) {
                 *ret = (event == NULL) ? 0 : -EAGAIN;
                 return CL_SUCCESS;
@@ -218,27 +222,35 @@ cl_int _starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, cl_mem buffer, si
         }
 }
 
-cl_int _starpu_opencl_copy_ram_to_opencl(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event)
+cl_int _starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event)
 {
         cl_int err;
         struct starpu_worker_s *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        if (event)
+                STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
+        if (event)
+                STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
         return CL_SUCCESS;
 }
 
-cl_int _starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event, int *ret)
+cl_int _starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
 {
         cl_int err;
         struct starpu_worker_s *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        if (event)
+                STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
+        if (event)
+                STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (STARPU_LIKELY(err == CL_SUCCESS)) {
                 *ret = (event == NULL) ? 0 : -EAGAIN;
                 return CL_SUCCESS;
@@ -260,21 +272,25 @@ cl_int _starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, void *ptr, si
         return CL_SUCCESS;
 }
 
-cl_int _starpu_opencl_copy_opencl_to_ram(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event)
+cl_int _starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event)
 {
         cl_int err;
         struct starpu_worker_s *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        if (event)
+                STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
+        if (event)
+                STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
         return CL_SUCCESS;
 }
 
 #if 0
-cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, void *ptr, const size_t buffer_origin[3], const size_t host_origin[3],
+cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, const size_t buffer_origin[3], const size_t host_origin[3],
                                               const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
                                               size_t host_row_pitch, size_t host_slice_pitch, cl_event *event)
 {
@@ -283,14 +299,18 @@ cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, void *ptr, const si
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        if (event)
+                STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueReadBufferRect(transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
                                       buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
+        if (event)
+                STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
         return CL_SUCCESS;
 }
 
-cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, cl_mem buffer, const size_t buffer_origin[3], const size_t host_origin[3],
+cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, const size_t buffer_origin[3], const size_t host_origin[3],
                                               const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
                                               size_t host_row_pitch, size_t host_slice_pitch, cl_event *event)
 {
@@ -299,8 +319,12 @@ cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, cl_mem buffer, const si
         cl_bool blocking;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
+        if (event)
+                STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
         err = clEnqueueWriteBufferRect(transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
                                        buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
+        if (event)
+                STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
         return CL_SUCCESS;

+ 7 - 7
src/drivers/opencl/driver_opencl.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -43,25 +43,25 @@ extern
 cl_int _starpu_opencl_allocate_memory(void **addr, size_t size, cl_mem_flags flags);
 
 extern
-cl_int _starpu_opencl_copy_ram_to_opencl(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event);
+cl_int _starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event);
 
 extern
-cl_int _starpu_opencl_copy_opencl_to_ram(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event);
+cl_int _starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node, void *ptr, unsigned dst_node, size_t size, size_t offset, cl_event *event);
 
 extern
-cl_int _starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, cl_mem buffer, size_t size, size_t offset, cl_event *event, int *ret);
+cl_int _starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
 
 extern
-cl_int _starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, void *ptr, size_t size, size_t offset, cl_event *event, int *ret);
+cl_int _starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, unsigned src_node, void *ptr, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
 
 #if 0
 extern
-cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, void *ptr, const size_t buffer_origin[3], const size_t host_origin[3],
+cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, unsigned src_node, void *ptr, unsigned dst_node, const size_t buffer_origin[3], const size_t host_origin[3],
                                               const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
                                               size_t host_row_pitch, size_t host_slice_pitch, cl_event *event);
 
 extern
-cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, cl_mem buffer, const size_t buffer_origin[3], const size_t host_origin[3],
+cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, const size_t buffer_origin[3], const size_t host_origin[3],
                                               const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
                                               size_t host_row_pitch, size_t host_slice_pitch, cl_event *event);
 #endif