Explorar el Código

Optimize pack/unpack for contiguous matrices/blocks/tensors

Samuel Thibault hace 5 años
padre
commit
3d4ebe75e4

+ 45 - 17
src/datawizard/interfaces/block_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2019                                Université de Bordeaux
+ * Copyright (C) 2009-2020                                Université de Bordeaux
  * Copyright (C) 2011,2012,2017                           Inria
  * Copyright (C) 2010-2017,2019                           CNRS
  *
@@ -237,17 +237,31 @@ static int pack_block_handle(starpu_data_handle_t handle, unsigned node, void **
 		*ptr = (void *)starpu_malloc_on_node_flags(node, *count, 0);
 
 		char *cur = *ptr;
-		char *block_z = block;
-		for(z=0 ; z<block_interface->nz ; z++)
+
+		if (block_interface->nx * block_interface->ny == block_interface->ldz && block_interface->nx == block_interface->ldy)
+			memcpy(cur, block, block_interface->nx * block_interface->ny * block_interface->nz * block_interface->elemsize);
+		else
 		{
-			char *block_y = block_z;
-			for(y=0 ; y<block_interface->ny ; y++)
+			char *block_z = block;
+			for(z=0 ; z<block_interface->nz ; z++)
 			{
-				memcpy(cur, block_y, block_interface->nx*block_interface->elemsize);
-				cur += block_interface->nx*block_interface->elemsize;
-				block_y += block_interface->ldy * block_interface->elemsize;
+				if (block_interface->nx == block_interface->ldy)
+				{
+					memcpy(cur, block_z, block_interface->nx * block_interface->ny * block_interface->elemsize);
+					cur += block_interface->nx*block_interface->ny*block_interface->elemsize;
+				}
+				else
+				{
+					char *block_y = block_z;
+					for(y=0 ; y<block_interface->ny ; y++)
+					{
+						memcpy(cur, block_y, block_interface->nx*block_interface->elemsize);
+						cur += block_interface->nx*block_interface->elemsize;
+						block_y += block_interface->ldy * block_interface->elemsize;
+					}
+				}
+				block_z += block_interface->ldz * block_interface->elemsize;
 			}
-			block_z += block_interface->ldz * block_interface->elemsize;
 		}
 	}
 
@@ -266,17 +280,31 @@ static int unpack_block_handle(starpu_data_handle_t handle, unsigned node, void
 	uint32_t z, y;
 	char *cur = ptr;
 	char *block = (void *)block_interface->ptr;
-	char *block_z = block;
-	for(z=0 ; z<block_interface->nz ; z++)
+
+	if (block_interface->nx * block_interface->ny == block_interface->ldz && block_interface->nx == block_interface->ldy)
+		memcpy(block, cur, block_interface->nx * block_interface->ny * block_interface->nz * block_interface->elemsize);
+	else
 	{
-		char *block_y = block_z;
-		for(y=0 ; y<block_interface->ny ; y++)
+		char *block_z = block;
+		for(z=0 ; z<block_interface->nz ; z++)
 		{
-			memcpy(block_y, cur, block_interface->nx*block_interface->elemsize);
-			cur += block_interface->nx*block_interface->elemsize;
-			block_y += block_interface->ldy * block_interface->elemsize;
+			if (block_interface->nx == block_interface->ldy)
+			{
+				memcpy(block_z, cur, block_interface->nx * block_interface->ny * block_interface->elemsize);
+				cur += block_interface->nx*block_interface->ny*block_interface->elemsize;
+			}
+			else
+			{
+				char *block_y = block_z;
+				for(y=0 ; y<block_interface->ny ; y++)
+				{
+					memcpy(block_y, cur, block_interface->nx*block_interface->elemsize);
+					cur += block_interface->nx*block_interface->elemsize;
+					block_y += block_interface->ldy * block_interface->elemsize;
+				}
+			}
+			block_z += block_interface->ldz * block_interface->elemsize;
 		}
-		block_z += block_interface->ldz * block_interface->elemsize;
 	}
 
 	starpu_free_on_node_flags(node, (uintptr_t)ptr, count, 0);

+ 23 - 12
src/datawizard/interfaces/matrix_interface.c

@@ -270,17 +270,22 @@ static int pack_matrix_handle(starpu_data_handle_t handle, unsigned node, void *
 
 	if (ptr != NULL)
 	{
-		uint32_t y;
 		char *matrix = (void *)matrix_interface->ptr;
 
 		*ptr = (void *)starpu_malloc_on_node_flags(node, *count, 0);
-
 		char *cur = *ptr;
-		for(y=0 ; y<matrix_interface->ny ; y++)
+
+		if (matrix_interface->ld == matrix_interface->nx)
+			memcpy(cur, matrix, matrix_interface->nx*matrix_interface->ny*matrix_interface->elemsize);
+		else
 		{
-			memcpy(cur, matrix, matrix_interface->nx*matrix_interface->elemsize);
-			cur += matrix_interface->nx*matrix_interface->elemsize;
-			matrix += matrix_interface->ld * matrix_interface->elemsize;
+			uint32_t y;
+			for(y=0 ; y<matrix_interface->ny ; y++)
+			{
+				memcpy(cur, matrix, matrix_interface->nx*matrix_interface->elemsize);
+				cur += matrix_interface->nx*matrix_interface->elemsize;
+				matrix += matrix_interface->ld * matrix_interface->elemsize;
+			}
 		}
 	}
 
@@ -296,14 +301,20 @@ static int unpack_matrix_handle(starpu_data_handle_t handle, unsigned node, void
 
 	STARPU_ASSERT(count == matrix_interface->elemsize * matrix_interface->nx * matrix_interface->ny);
 
-	uint32_t y;
-	char *cur = ptr;
 	char *matrix = (void *)matrix_interface->ptr;
-	for(y=0 ; y<matrix_interface->ny ; y++)
+
+	if (matrix_interface->ld == matrix_interface->nx)
+		memcpy(matrix, ptr, matrix_interface->nx*matrix_interface->ny*matrix_interface->elemsize);
+	else
 	{
-		memcpy(matrix, cur, matrix_interface->nx*matrix_interface->elemsize);
-		cur += matrix_interface->nx*matrix_interface->elemsize;
-		matrix += matrix_interface->ld * matrix_interface->elemsize;
+		uint32_t y;
+		char *cur = ptr;
+		for(y=0 ; y<matrix_interface->ny ; y++)
+		{
+			memcpy(matrix, cur, matrix_interface->nx*matrix_interface->elemsize);
+			cur += matrix_interface->nx*matrix_interface->elemsize;
+			matrix += matrix_interface->ld * matrix_interface->elemsize;
+		}
 	}
 
 	starpu_free_on_node_flags(node, (uintptr_t)ptr, count, 0);

+ 76 - 27
src/datawizard/interfaces/tensor_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2019                                Université de Bordeaux
+ * Copyright (C) 2009-2020                                Université de Bordeaux
  * Copyright (C) 2011,2012,2017                           Inria
  * Copyright (C) 2010-2017,2019                           CNRS
  *
@@ -247,22 +247,46 @@ static int pack_tensor_handle(starpu_data_handle_t handle, unsigned node, void *
 		*ptr = (void *)starpu_malloc_on_node_flags(node, *count, 0);
 
 		char *cur = *ptr;
-		char *block_t = block;
-		for(t=0 ; t<tensor_interface->nt ; t++)
+		if (tensor_interface->nx * tensor_interface->ny * tensor_interface->nz == tensor_interface->ldt &&
+		    tensor_interface->nx * tensor_interface->ny == tensor_interface->ldz &&
+		    tensor_interface->nx == tensor_interface->ldy)
+			memcpy(cur, block, tensor_interface->nx * tensor_interface->ny * tensor_interface->nz * tensor_interface->nt * tensor_interface->elemsize);
+		else
 		{
-		    char *block_z = block_t;
-		    for(z=0 ; z<tensor_interface->nz ; z++)
-		    {
-			char *block_y = block_z;
-			for(y=0 ; y<tensor_interface->ny ; y++)
+			char *block_t = block;
+			for(t=0 ; t<tensor_interface->nt ; t++)
 			{
-				memcpy(cur, block_y, tensor_interface->nx*tensor_interface->elemsize);
-				cur += tensor_interface->nx*tensor_interface->elemsize;
-				block_y += tensor_interface->ldy * tensor_interface->elemsize;
+				if (tensor_interface->nx * tensor_interface->ny == tensor_interface->ldz &&
+				    tensor_interface->nx == tensor_interface->ldy)
+				{
+					memcpy(cur, block_t, tensor_interface->nx * tensor_interface->ny * tensor_interface->nz * tensor_interface->elemsize);
+					cur += tensor_interface->nx*tensor_interface->ny*tensor_interface->nz*tensor_interface->elemsize;
+				}
+				else
+				{
+					char *block_z = block_t;
+					for(z=0 ; z<tensor_interface->nz ; z++)
+					{
+						if (tensor_interface->nx == tensor_interface->ldy)
+						{
+							memcpy(cur, block_z, tensor_interface->nx * tensor_interface->ny * tensor_interface->elemsize);
+							cur += tensor_interface->nx*tensor_interface->ny*tensor_interface->elemsize;
+						}
+						else
+						{
+							char *block_y = block_z;
+							for(y=0 ; y<tensor_interface->ny ; y++)
+							{
+								memcpy(cur, block_y, tensor_interface->nx*tensor_interface->elemsize);
+								cur += tensor_interface->nx*tensor_interface->elemsize;
+								block_y += tensor_interface->ldy * tensor_interface->elemsize;
+							}
+						}
+						block_z += tensor_interface->ldz * tensor_interface->elemsize;
+					}
+				}
+				block_t += tensor_interface->ldt * tensor_interface->elemsize;
 			}
-			block_z += tensor_interface->ldz * tensor_interface->elemsize;
-		    }
-		    block_t += tensor_interface->ldt * tensor_interface->elemsize;
 		}
 	}
 
@@ -281,22 +305,47 @@ static int unpack_tensor_handle(starpu_data_handle_t handle, unsigned node, void
 	uint32_t t, z, y;
 	char *cur = ptr;
 	char *block = (void *)tensor_interface->ptr;
-	char *block_t = block;
-	for(t=0 ; t<tensor_interface->nt ; t++)
+
+	if (tensor_interface->nx * tensor_interface->ny * tensor_interface->nz == tensor_interface->ldt &&
+	    tensor_interface->nx * tensor_interface->ny == tensor_interface->ldz &&
+	    tensor_interface->nx == tensor_interface->ldy)
+		memcpy(block, cur, tensor_interface->nx * tensor_interface->ny * tensor_interface->nz * tensor_interface->nt * tensor_interface->elemsize);
+	else
 	{
-	    char *block_z = block_t;
-	    for(z=0 ; z<tensor_interface->nz ; z++)
-	    {
-		char *block_y = block_z;
-		for(y=0 ; y<tensor_interface->ny ; y++)
+		char *block_t = block;
+		for(t=0 ; t<tensor_interface->nt ; t++)
 		{
-			memcpy(block_y, cur, tensor_interface->nx*tensor_interface->elemsize);
-			cur += tensor_interface->nx*tensor_interface->elemsize;
-			block_y += tensor_interface->ldy * tensor_interface->elemsize;
+			if (tensor_interface->nx * tensor_interface->ny == tensor_interface->ldz &&
+			    tensor_interface->nx == tensor_interface->ldy)
+			{
+				memcpy(block_t, cur, tensor_interface->nx * tensor_interface->ny * tensor_interface->nz * tensor_interface->elemsize);
+				cur += tensor_interface->nx*tensor_interface->ny*tensor_interface->nz*tensor_interface->elemsize;
+			}
+			else
+			{
+				char *block_z = block_t;
+				for(z=0 ; z<tensor_interface->nz ; z++)
+				{
+					if (tensor_interface->nx == tensor_interface->ldy)
+					{
+						memcpy(block_z, cur, tensor_interface->nx * tensor_interface->ny * tensor_interface->elemsize);
+						cur += tensor_interface->nx*tensor_interface->ny*tensor_interface->elemsize;
+					}
+					else
+					{
+						char *block_y = block_z;
+						for(y=0 ; y<tensor_interface->ny ; y++)
+						{
+							memcpy(block_y, cur, tensor_interface->nx*tensor_interface->elemsize);
+							cur += tensor_interface->nx*tensor_interface->elemsize;
+							block_y += tensor_interface->ldy * tensor_interface->elemsize;
+						}
+					}
+					block_z += tensor_interface->ldz * tensor_interface->elemsize;
+				}
+			}
+			block_t += tensor_interface->ldt * tensor_interface->elemsize;
 		}
-		block_z += tensor_interface->ldz * tensor_interface->elemsize;
-	    }
-	    block_t += tensor_interface->ldt * tensor_interface->elemsize;
 	}
 
 	starpu_free_on_node_flags(node, (uintptr_t)ptr, count, 0);