Browse Source

Optimize unpartitioned cases for block and matrix

Samuel Thibault 12 years ago
parent
commit
9689a49f5a

+ 23 - 0
src/datawizard/interfaces/block_interface.c

@@ -675,10 +675,33 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 	uint32_t ldz_dst = dst_block->ldz;
 
 	unsigned y, z;
+
+	if (ldy_src == nx && ldy_dst == nx && ldz_src == ny && ldz_dst == ny)
+	{
+		/* Optimise non-partitioned and z-partitioned case */
+		if (starpu_interface_copy(src_block->dev_handle, src_block->offset, src_node,
+		                          dst_block->dev_handle, dst_block->offset, dst_node,
+		                          nx*ny*nz*elemsize, async_data))
+				ret = -EAGAIN;
+	}
+	else
 	for (z = 0; z < nz; z++)
 	{
+		if (ldy_src == nx && ldy_dst == nx)
+		{
+			/* Optimise y-partitioned case */
+			uint32_t src_offset = z*ldz_src*elemsize;
+			uint32_t dst_offset = z*ldz_dst*elemsize;
+
+			if (starpu_interface_copy(src_block->dev_handle, src_block->offset + src_offset, src_node,
+			                          dst_block->dev_handle, dst_block->offset + dst_offset, dst_node,
+			                          nx*ny*elemsize, async_data))
+				ret = -EAGAIN;
+		}
+		else
 		for (y = 0; y < ny; y++)
 		{
+			/* x-partitioned case: rows are not contiguous, handle them one at a time */
 			uint32_t src_offset = (y*ldy_src + z*ldz_src)*elemsize;
 			uint32_t dst_offset = (y*ldy_dst + z*ldz_dst)*elemsize;
 

+ 9 - 0
src/datawizard/interfaces/matrix_interface.c

@@ -635,6 +635,15 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 	uint32_t ld_src = src_matrix->ld;
 	uint32_t ld_dst = dst_matrix->ld;
 
+	if (ld_src == nx && ld_dst == nx)
+	{
+		/* Optimize unpartitioned and y-partitioned cases */
+		if (starpu_interface_copy(src_matrix->dev_handle, src_matrix->offset, src_node,
+		                          dst_matrix->dev_handle, dst_matrix->offset, dst_node,
+		                          nx*ny*elemsize, async_data))
+			ret = -EAGAIN;
+	}
+	else
 	for (y = 0; y < ny; y++)
 	{
 		uint32_t src_offset = y*ld_src*elemsize;