浏览代码

Add starpu_interface_copy2d, 3d, and 4d

to easily request data copies from data interfaces
Samuel Thibault 5 年之前
父节点
当前提交
a5e0deb768

+ 3 - 1
ChangeLog

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2012-2014,2016-2018                      Inria
-# Copyright (C) 2009-2019                                Université de Bordeaux
+# Copyright (C) 2009-2020                                Université de Bordeaux
 # Copyright (C) 2010-2020                                CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -29,6 +29,8 @@ New features:
 
 Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.
+  * Add starpu_interface_copy2d, 3d, and 4d to easily request data copies from
+    data interfaces.
 
 StarPU 1.3.4 (git revision xxx)
 ==============================================

+ 85 - 2
include/starpu_data_interfaces.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2019                                Université de Bordeaux
+ * Copyright (C) 2009-2020                                Université de Bordeaux
  * Copyright (C) 2011-2014,2016,2017                      Inria
  * Copyright (C) 2010-2015,2017,2019                           CNRS
  *
@@ -689,7 +689,90 @@ int starpu_data_interface_get_next_id(void);
    be passed to starpu_interface_copy(). this returns <c>-EAGAIN</c> if the
    transfer is still ongoing, or 0 if the transfer is already completed.
 */
-int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data);
+int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node,
+			  uintptr_t dst, size_t dst_offset, unsigned dst_node,
+			  size_t size, void *async_data);
+
+/**
+   Copy \p numblocks blocks of \p blocksize bytes from byte offset \p src_offset
+   of \p src on \p src_node to byte offset \p dst_offset of \p dst on \p
+   dst_node.
+
+   The blocks start at addresses which are \p ld_src (resp. \p ld_dst) bytes
+   apart in the source (resp. destination) interface.
+
+   If blocksize == ld_src == ld_dst, the transfer is optimized into a single
+   starpu_interface_copy call.
+
+   This is to be used in the starpu_data_copy_methods::any_to_any copy
+   method for 2D data, which is provided with \p async_data to be passed to
+   starpu_interface_copy(). This returns <c>-EAGAIN</c> if the transfer is still
+   ongoing, or 0 if the transfer is already completed.
+*/
+int starpu_interface_copy2d(uintptr_t src, size_t src_offset, unsigned src_node,
+			    uintptr_t dst, size_t dst_offset, unsigned dst_node,
+			    size_t blocksize,
+			    size_t numblocks, size_t ld_src, size_t ld_dst,
+			    void *async_data);
+
+/**
+   Copy \p numblocks1 * \p numblocks2 blocks of \p blocksize bytes from byte
+   offset \p src_offset of \p src on \p src_node to byte offset \p dst_offset of
+   \p dst on \p dst_node.
+
+   The blocks are grouped by \p numblocks1 blocks whose start addresses are
+   \p ld1_src (resp. \p ld1_dst) bytes apart in the source (resp. destination)
+   interface.
+
+   Such groups are grouped by \p numblocks2 groups whose start addresses are
+   \p ld2_src (resp. \p ld2_dst) bytes apart in the source (resp. destination)
+   interface.
+
+   If the blocks are contiguous, the transfers will be optimized.
+
+   This is to be used in the starpu_data_copy_methods::any_to_any copy
+   method for 3D data, which is provided with \p async_data to be passed to
+   starpu_interface_copy(). This returns <c>-EAGAIN</c> if the transfer is still
+   ongoing, or 0 if the transfer is already completed.
+*/
+int starpu_interface_copy3d(uintptr_t src, size_t src_offset, unsigned src_node,
+			    uintptr_t dst, size_t dst_offset, unsigned dst_node,
+			    size_t blocksize,
+			    size_t numblocks1, size_t ld1_src, size_t ld1_dst,
+			    size_t numblocks2, size_t ld2_src, size_t ld2_dst,
+			    void *async_data);
+
+/**
+   Copy \p numblocks1 * \p numblocks2 * \p numblocks3 blocks of \p blocksize
+   bytes from byte offset \p src_offset of \p src on \p src_node to byte offset
+   \p dst_offset of \p dst on \p dst_node.
+
+   The blocks are grouped by \p numblocks1 blocks whose start addresses are
+   \p ld1_src (resp. \p ld1_dst) bytes apart in the source (resp. destination)
+   interface.
+
+   Such groups are grouped by \p numblocks2 groups whose start addresses are
+   \p ld2_src (resp. \p ld2_dst) bytes apart in the source (resp. destination)
+   interface.
+
+   Such groups are grouped by \p numblocks3 groups whose start addresses are
+   \p ld3_src (resp. \p ld3_dst) bytes apart in the source (resp. destination)
+   interface.
+
+   If the blocks are contiguous, the transfers will be optimized.
+
+   This is to be used in the starpu_data_copy_methods::any_to_any copy
+   method for 4D data, which is provided with \p async_data to be passed to
+   starpu_interface_copy(). This returns <c>-EAGAIN</c> if the transfer is still
+   ongoing, or 0 if the transfer is already completed.
+*/
+int starpu_interface_copy4d(uintptr_t src, size_t src_offset, unsigned src_node,
+			    uintptr_t dst, size_t dst_offset, unsigned dst_node,
+			    size_t blocksize,
+			    size_t numblocks1, size_t ld1_src, size_t ld1_dst,
+			    size_t numblocks2, size_t ld2_src, size_t ld2_dst,
+			    size_t numblocks3, size_t ld3_src, size_t ld3_dst,
+			    void *async_data);
 
 /**
    When an asynchonous implementation of the data transfer is implemented, the call

+ 113 - 0
src/datawizard/copy_driver.c

@@ -316,6 +316,119 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 	}
 }
 
+/* Copy numblocks blocks of blocksize bytes each. Block i starts at byte
+ * offset src_offset + i*ld_src in src and is written at byte offset
+ * dst_offset + i*ld_dst in dst.
+ *
+ * Returns -EAGAIN if any underlying starpu_interface_copy() call returned a
+ * non-zero value, 0 otherwise. */
+int starpu_interface_copy2d(uintptr_t src, size_t src_offset, unsigned src_node,
+			    uintptr_t dst, size_t dst_offset, unsigned dst_node,
+			    size_t blocksize,
+			    size_t numblocks, size_t ld_src, size_t ld_dst,
+			    void *async_data)
+{
+	int ret = 0;
+	/* Same type as numblocks, so the loop cannot wrap around before
+	 * reaching its bound when numblocks exceeds UINT_MAX. */
+	size_t i;
+
+	STARPU_ASSERT_MSG(ld_src >= blocksize, "block size %lu is bigger than ld %lu in source", (unsigned long) blocksize, (unsigned long) ld_src);
+	STARPU_ASSERT_MSG(ld_dst >= blocksize, "block size %lu is bigger than ld %lu in destination", (unsigned long) blocksize, (unsigned long) ld_dst);
+
+	if (ld_src == blocksize && ld_dst == blocksize)
+		/* Optimize contiguous case: blocks are back to back on both
+		 * sides, so one linear copy covers all of them. */
+		return starpu_interface_copy(src, src_offset, src_node,
+					     dst, dst_offset, dst_node,
+					     blocksize * numblocks, async_data);
+
+	/* TODO: introduce and call node_ops->copy2d_data_to when available */
+
+	for (i = 0; i < numblocks; i++)
+	{
+		/* Keep issuing the remaining blocks even if one of them is
+		 * still in flight; just remember to report it. */
+		if (starpu_interface_copy(src, src_offset + i*ld_src, src_node,
+					  dst, dst_offset + i*ld_dst, dst_node,
+					  blocksize, async_data))
+			ret = -EAGAIN;
+	}
+
+	return ret;
+}
+
+/* Copy numblocks_1 * numblocks_2 blocks of blocksize bytes each.
+ *
+ * Within a group, the numblocks_1 blocks start ld1_src (resp. ld1_dst) bytes
+ * apart; the numblocks_2 groups start ld2_src (resp. ld2_dst) bytes apart.
+ *
+ * Returns -EAGAIN if any underlying copy returned a non-zero value, 0
+ * otherwise. */
+int starpu_interface_copy3d(uintptr_t src, size_t src_offset, unsigned src_node,
+			    uintptr_t dst, size_t dst_offset, unsigned dst_node,
+			    size_t blocksize,
+			    size_t numblocks_1, size_t ld1_src, size_t ld1_dst,
+			    size_t numblocks_2, size_t ld2_src, size_t ld2_dst,
+			    void *async_data)
+{
+	int ret = 0;
+	/* Same type as numblocks_2, so the loop cannot wrap around before
+	 * reaching its bound. */
+	size_t i;
+
+	STARPU_ASSERT_MSG(ld1_src >= blocksize, "block size %lu is bigger than ld %lu in source", (unsigned long) blocksize, (unsigned long) ld1_src);
+	STARPU_ASSERT_MSG(ld1_dst >= blocksize, "block size %lu is bigger than ld %lu in destination", (unsigned long) blocksize, (unsigned long) ld1_dst);
+
+	/* The cast must apply to the whole product so that the argument type
+	 * matches the %lu conversion. */
+	STARPU_ASSERT_MSG(ld2_src >= numblocks_1 * ld1_src, "block group size %lu is bigger than group ld %lu in source", (unsigned long) (numblocks_1 * ld1_src), (unsigned long) ld2_src);
+	STARPU_ASSERT_MSG(ld2_dst >= numblocks_1 * ld1_dst, "block group size %lu is bigger than group ld %lu in destination", (unsigned long) (numblocks_1 * ld1_dst), (unsigned long) ld2_dst);
+
+	if (ld1_src == blocksize && ld2_src == blocksize * numblocks_1 &&
+	    ld1_dst == blocksize && ld2_dst == blocksize * numblocks_1)
+		/* Optimize contiguous case: blocks are back to back within a
+		 * group and groups are back to back, on both sides. */
+		return starpu_interface_copy(src, src_offset, src_node,
+					     dst, dst_offset, dst_node,
+					     blocksize * numblocks_1 * numblocks_2,
+					     async_data);
+
+	/* TODO: introduce and call node_ops->copy3d_data_to when available */
+
+	for (i = 0; i < numblocks_2; i++)
+	{
+		/* One 2D copy per group; copy2d applies its own contiguous
+		 * optimization to each group. */
+		if (starpu_interface_copy2d(src, src_offset + i*ld2_src, src_node,
+					    dst, dst_offset + i*ld2_dst, dst_node,
+					    blocksize, numblocks_1, ld1_src, ld1_dst,
+					    async_data))
+			ret = -EAGAIN;
+	}
+
+	return ret;
+}
+
+/* Copy numblocks_1 * numblocks_2 * numblocks_3 blocks of blocksize bytes each.
+ *
+ * Blocks start ld1_src (resp. ld1_dst) bytes apart within a group of
+ * numblocks_1 blocks; such groups start ld2_src (resp. ld2_dst) bytes apart
+ * within a group of numblocks_2 groups; and those start ld3_src (resp.
+ * ld3_dst) bytes apart.
+ *
+ * Returns -EAGAIN if any underlying copy returned a non-zero value, 0
+ * otherwise. */
+int starpu_interface_copy4d(uintptr_t src, size_t src_offset, unsigned src_node,
+			    uintptr_t dst, size_t dst_offset, unsigned dst_node,
+			    size_t blocksize,
+			    size_t numblocks_1, size_t ld1_src, size_t ld1_dst,
+			    size_t numblocks_2, size_t ld2_src, size_t ld2_dst,
+			    size_t numblocks_3, size_t ld3_src, size_t ld3_dst,
+			    void *async_data)
+{
+	int ret = 0;
+	/* Same type as numblocks_3, so the loop cannot wrap around before
+	 * reaching its bound. */
+	size_t i;
+
+	STARPU_ASSERT_MSG(ld1_src >= blocksize, "block size %lu is bigger than ld %lu in source", (unsigned long) blocksize, (unsigned long) ld1_src);
+	STARPU_ASSERT_MSG(ld1_dst >= blocksize, "block size %lu is bigger than ld %lu in destination", (unsigned long) blocksize, (unsigned long) ld1_dst);
+
+	/* The casts must apply to the whole products so that the argument
+	 * types match the %lu conversions. */
+	STARPU_ASSERT_MSG(ld2_src >= numblocks_1 * ld1_src, "block group size %lu is bigger than group ld %lu in source", (unsigned long) (numblocks_1 * ld1_src), (unsigned long) ld2_src);
+	STARPU_ASSERT_MSG(ld2_dst >= numblocks_1 * ld1_dst, "block group size %lu is bigger than group ld %lu in destination", (unsigned long) (numblocks_1 * ld1_dst), (unsigned long) ld2_dst);
+
+	STARPU_ASSERT_MSG(ld3_src >= numblocks_2 * ld2_src, "block group group size %lu is bigger than group group ld %lu in source", (unsigned long) (numblocks_2 * ld2_src), (unsigned long) ld3_src);
+	STARPU_ASSERT_MSG(ld3_dst >= numblocks_2 * ld2_dst, "block group group size %lu is bigger than group group ld %lu in destination", (unsigned long) (numblocks_2 * ld2_dst), (unsigned long) ld3_dst);
+
+	if (ld1_src == blocksize &&
+	    ld2_src == blocksize * numblocks_1 &&
+	    ld3_src == blocksize * numblocks_1 * numblocks_2 &&
+	    ld1_dst == blocksize &&
+	    ld2_dst == blocksize * numblocks_1 &&
+	    ld3_dst == blocksize * numblocks_1 * numblocks_2)
+		/* Optimize contiguous case: every level is tightly packed on
+		 * both sides, so one linear copy covers everything. */
+		return starpu_interface_copy(src, src_offset, src_node,
+					     dst, dst_offset, dst_node,
+					     blocksize * numblocks_1 * numblocks_2 * numblocks_3,
+					     async_data);
+
+	/* Probably won't ever have a 4D interface in drivers :) */
+
+	for (i = 0; i < numblocks_3; i++)
+	{
+		/* One 3D copy per outermost group; copy3d applies its own
+		 * contiguous optimization to each of them. */
+		if (starpu_interface_copy3d(src, src_offset + i*ld3_src, src_node,
+					    dst, dst_offset + i*ld3_dst, dst_node,
+					    blocksize,
+					    numblocks_1, ld1_src, ld1_dst,
+					    numblocks_2, ld2_src, ld2_dst,
+					    async_data))
+			ret = -EAGAIN;
+	}
+
+	return ret;
+}
+
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
 {
 #ifdef STARPU_SIMGRID

+ 7 - 43
src/datawizard/interfaces/block_interface.c

@@ -783,49 +783,13 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 	uint32_t ldy_dst = dst_block->ldy;
 	uint32_t ldz_dst = dst_block->ldz;
 
-	if (IS_CONTIGUOUS_BLOCK(nx, ny, nz, ldy_src, ldz_src) &&
-	    IS_CONTIGUOUS_BLOCK(nx, ny, nz, ldy_dst, ldz_dst))
-	{
-		/* Optimise non-partitioned and z-partitioned case */
-		if (starpu_interface_copy(src_block->dev_handle, src_block->offset, src_node,
-		                          dst_block->dev_handle, dst_block->offset, dst_node,
-		                          nx*ny*nz*elemsize, async_data))
-				ret = -EAGAIN;
-	}
-	else
-	{
-		unsigned z;
-		for (z = 0; z < nz; z++)
-		{
-			if (IS_CONTIGUOUS_MATRIX(nx, ny, ldy_src) &&
-			    IS_CONTIGUOUS_MATRIX(nx, ny, ldy_dst))
-			{
-				/* Optimise y-partitioned case */
-				uint32_t src_offset = z*ldz_src*elemsize;
-				uint32_t dst_offset = z*ldz_dst*elemsize;
-
-				if (starpu_interface_copy(src_block->dev_handle, src_block->offset + src_offset, src_node,
-							  dst_block->dev_handle, dst_block->offset + dst_offset, dst_node,
-							  nx*ny*elemsize, async_data))
-					ret = -EAGAIN;
-			}
-			else
-			{
-				unsigned y;
-				for (y = 0; y < ny; y++)
-				{
-					/* Eerf, x-partitioned case */
-					uint32_t src_offset = (y*ldy_src + z*ldz_src)*elemsize;
-					uint32_t dst_offset = (y*ldy_dst + z*ldz_dst)*elemsize;
-
-					if (starpu_interface_copy(src_block->dev_handle, src_block->offset + src_offset, src_node,
-								  dst_block->dev_handle, dst_block->offset + dst_offset, dst_node,
-								  nx*elemsize, async_data))
-						ret = -EAGAIN;
-				}
-			}
-		}
-	}
+	if (starpu_interface_copy3d(src_block->dev_handle, src_block->offset, src_node,
+				    dst_block->dev_handle, dst_block->offset, dst_node,
+				    nx * elemsize,
+				    ny, ldy_src * elemsize, ldy_dst * elemsize,
+				    nz, ldz_src * elemsize, ldz_dst * elemsize,
+				    async_data))
+		ret = -EAGAIN;
 
 	starpu_interface_data_copy(src_node, dst_node, nx*ny*nz*elemsize);
 

+ 6 - 22
src/datawizard/interfaces/matrix_interface.c

@@ -729,28 +729,12 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 	uint32_t ld_src = src_matrix->ld;
 	uint32_t ld_dst = dst_matrix->ld;
 
-	if (IS_CONTIGUOUS_MATRIX(nx, ny, ld_src) && ld_dst == ld_src)
-	{
-		/* Optimize unpartitioned and y-partitioned cases */
-		if (starpu_interface_copy(src_matrix->dev_handle, src_matrix->offset, src_node,
-		                          dst_matrix->dev_handle, dst_matrix->offset, dst_node,
-		                          nx*ny*elemsize, async_data))
-			ret = -EAGAIN;
-	}
-	else
-	{
-		unsigned y;
-		for (y = 0; y < ny; y++)
-		{
-			uint32_t src_offset = y*ld_src*elemsize;
-			uint32_t dst_offset = y*ld_dst*elemsize;
-
-			if (starpu_interface_copy(src_matrix->dev_handle, src_matrix->offset + src_offset, src_node,
-						  dst_matrix->dev_handle, dst_matrix->offset + dst_offset, dst_node,
-						  nx*elemsize, async_data))
-				ret = -EAGAIN;
-		}
-	}
+	if (starpu_interface_copy2d(src_matrix->dev_handle, src_matrix->offset, src_node,
+				    dst_matrix->dev_handle, dst_matrix->offset, dst_node,
+				    nx * elemsize,
+				    ny, ld_src * elemsize, ld_dst * elemsize,
+				    async_data))
+		ret = -EAGAIN;
 
 	starpu_interface_data_copy(src_node, dst_node, (size_t)nx*ny*elemsize);
 

+ 8 - 47
src/datawizard/interfaces/tensor_interface.c

@@ -834,53 +834,14 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 	uint32_t ldz_dst = dst_block->ldz;
 	uint32_t ldt_dst = dst_block->ldt;
 
-	if (IS_CONTIGUOUS_TENSOR(nx, ny, nz, nt, ldy_src, ldz_src, ldt_src) &&
-	    IS_CONTIGUOUS_TENSOR(nx, ny, nz, nt, ldy_dst, ldz_dst, ldt_dst))
-	{
-		/* Optimise non-partitioned and z-partitioned case */
-		if (starpu_interface_copy(src_block->dev_handle, src_block->offset, src_node,
-		                          dst_block->dev_handle, dst_block->offset, dst_node,
-		                          nx*ny*nz*nt*elemsize, async_data))
-				ret = -EAGAIN;
-	}
-	else
-	{
-		unsigned t;
-		for (t = 0; t < nt; t++)
-		{
-		    unsigned z;
-		    for (z = 0; z < nz; z++)
-		    {
-			if (IS_CONTIGUOUS_MATRIX(nx, ny, ldy_src) &&
-			    IS_CONTIGUOUS_MATRIX(nx, ny, ldy_dst))
-			{
-				/* Optimise y-partitioned case */
-				uint32_t src_offset = t*ldt_src*elemsize + z*ldz_src*elemsize;
-				uint32_t dst_offset = t*ldt_dst*elemsize + z*ldz_dst*elemsize;
-
-				if (starpu_interface_copy(src_block->dev_handle, src_block->offset + src_offset, src_node,
-							  dst_block->dev_handle, dst_block->offset + dst_offset, dst_node,
-							  nx*ny*elemsize, async_data))
-					ret = -EAGAIN;
-			}
-			else
-			{
-				unsigned y;
-				for (y = 0; y < ny; y++)
-				{
-					/* Eerf, x-partitioned case */
-					uint32_t src_offset = (y*ldy_src + z*ldz_src + t*ldt_src)*elemsize;
-					uint32_t dst_offset = (y*ldy_dst + z*ldz_dst + t*ldt_dst)*elemsize;
-
-					if (starpu_interface_copy(src_block->dev_handle, src_block->offset + src_offset, src_node,
-								  dst_block->dev_handle, dst_block->offset + dst_offset, dst_node,
-								  nx*elemsize, async_data))
-						ret = -EAGAIN;
-				}
-			}
-		    }
-		}
-	}
+	if (starpu_interface_copy4d(src_block->dev_handle, src_block->offset, src_node,
+				    dst_block->dev_handle, dst_block->offset, dst_node,
+				    nx * elemsize,
+				    ny, ldy_src * elemsize, ldy_dst * elemsize,
+				    nz, ldz_src * elemsize, ldz_dst * elemsize,
+				    nt, ldt_src * elemsize, ldt_dst * elemsize,
+				    async_data))
+		ret = -EAGAIN;
 
 	starpu_interface_data_copy(src_node, dst_node, nx*ny*nz*nt*elemsize);