| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449 | /* StarPU --- Runtime system for heterogeneous multicore architectures. * * Copyright (C) 2016,2017                                Inria * Copyright (C) 2016,2017,2019                           CNRS * Copyright (C) 2010,2013-2017                           Université de Bordeaux * * StarPU is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or (at * your option) any later version. * * StarPU is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License in COPYING.LGPL for more details. */#include "implicit-stencil.h"#include <math.h>/* Manage block and tags allocation */static struct block_description *blocks;static unsigned sizex, sizey, sizez;static unsigned nbz;static unsigned *block_sizes_z;/* *	Tags for various codelet completion *//* * common tag format: */static starpu_tag_t tag_common(int z, int dir, int type){	return (((((starpu_tag_t)type) << 4) | ((dir+1)/2)) << 32)|(starpu_tag_t)z;}/* Completion of last update tasks */starpu_tag_t TAG_FINISH(int z){	z = (z + nbz)%nbz;	starpu_tag_t tag = tag_common(z, 0, 1);	return tag;}/* Completion of the save codelet for MPI send/recv */starpu_tag_t TAG_START(int z, int dir){	z = (z + nbz)%nbz;	starpu_tag_t tag = tag_common(z, dir, 2);	return tag;}/* * common MPI tag format: */static int mpi_tag_common(int z, int dir, int layer_or_boundary, int buffer){	return (z<<12) | (layer_or_boundary << 8) | ((((1+dir)/2))<<4) | buffer;}int MPI_TAG_LAYERS(int z, int buffer){	z = (z + nbz)%nbz;    /* No direction for layers ; layer is 0 */	int tag = mpi_tag_common(z, 0, 0, buffer);	return tag;}int MPI_TAG_BOUNDARIES(int z, int dir, int buffer){	z = (z + nbz)%nbz;	int tag = mpi_tag_common(z, dir, 1, buffer);	return tag;}/* *	Block descriptors *//* Compute the size of the different blocks */static void compute_block_sizes(void){	block_sizes_z = (unsigned *) malloc(nbz*sizeof(unsigned));	STARPU_ASSERT(block_sizes_z);	/* Perhaps the last chunk is smaller */	unsigned default_block_size = (sizez+nbz-1)/nbz;	unsigned remaining = sizez;	unsigned b;	for (b = 0; b < nbz; b++)	{		block_sizes_z[b] = MIN(default_block_size, remaining);		remaining -= block_sizes_z[b];	}	STARPU_ASSERT(remaining == 0);}unsigned get_block_size(int bz){	return block_sizes_z[bz];}struct block_description *get_block_description(int z){	z = (z + nbz)%nbz;	STARPU_ASSERT(&blocks[z]);	return &blocks[z];}int get_block_mpi_node(int z){	z = (z + nbz)%nbz;	return blocks[z].mpi_node;}void create_blocks_array(unsigned _sizex, unsigned _sizey, unsigned _sizez, unsigned _nbz){	/* Store the parameters */	nbz = _nbz;	sizex = _sizex;	sizey = _sizey;	sizez = _sizez;	/* Create a grid of block descriptors */	blocks = (struct block_description *) calloc(nbz, sizeof(struct block_description));	STARPU_ASSERT(blocks);	/* What is the size of the different blocks ? */	compute_block_sizes();	unsigned bz;	for (bz = 0; bz < nbz; bz++)	{		struct block_description * block =				get_block_description(bz);		/* Which block is it ? */		block->bz = bz;		/* For simplicity, we store which are the neighbours blocks */		block->boundary_blocks[B] = get_block_description((bz-1+nbz)%nbz);		block->boundary_blocks[T] = get_block_description((bz+1)%nbz);	}}void free_blocks_array(){	free(blocks);	free(block_sizes_z);}/* *	Initialization of the blocks */void assign_blocks_to_workers(int rank){	unsigned bz;	/* NB: perhaps we could count a GPU as multiple workers */	/* how many workers are there ? */	/*unsigned nworkers = starpu_worker_get_count();*/	/* how many blocks are on that MPI node ? *///	unsigned nblocks = 0;//	for (bz = 0; bz < nbz; bz++)//	{//		struct block_description *block =//				get_block_description(bz);////		if (block->mpi_node == rank)//			nblocks++;//	}	/* how many blocks per worker ? */	/*unsigned nblocks_per_worker = (nblocks + nworkers - 1)/nworkers;*/	/* we now attribute up to nblocks_per_worker blocks per workers */	unsigned attributed = 0;	for (bz = 0; bz < nbz; bz++)	{		struct block_description *block =				get_block_description(bz);		if (block->mpi_node == rank)		{			unsigned workerid;			/* Manage initial block distribution between CPU and GPU */		#if 0			#if 1			/* GPUs then CPUs */			if (attributed < 3*18)				workerid = attributed / 18;			else				workerid = 3+ (attributed - 3*18) / 2;			#else			/* GPUs interleaved with CPUs */			if ((attributed % 20) <= 1)				workerid = 3 + attributed / 20;			else if (attributed < 60)				workerid = attributed / 20;			else				workerid = (attributed - 60)/2 + 6;			#endif		#else			/* Only GPUS */			workerid = (attributed / 21) % 3;		#endif			/*= attributed/nblocks_per_worker;*/			block->preferred_worker = workerid;			attributed++;		}	}}void assign_blocks_to_mpi_nodes(int world_size){	unsigned nzblocks_per_process = (nbz + world_size - 1) / world_size;	unsigned bz;	for (bz = 0; bz < nbz; bz++)	{		struct block_description *block =				get_block_description(bz);		block->mpi_node = bz / nzblocks_per_process;	}}static size_t allocated = 0;static void allocate_block_on_node(starpu_data_handle_t *handleptr, unsigned bz, TYPE **ptr, unsigned nx, unsigned ny, unsigned nz){	int ret;	size_t block_size = nx*ny*nz*sizeof(TYPE);	/* Allocate memory */#if 1	ret = starpu_malloc_flags((void **)ptr, block_size, STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);	STARPU_ASSERT(ret == 0);#else	*ptr = malloc(block_size);	STARPU_ASSERT(*ptr);#endif	allocated += block_size;//#ifndef STARPU_SIMGRID//	/* Fill the blocks with 0 *///	memset(*ptr, 0, block_size);//#endif	/* Register it to StarPU */	starpu_block_data_register(handleptr, STARPU_MAIN_RAM, (uintptr_t)*ptr, nx, nx*ny, nx, ny, nz, sizeof(TYPE));	starpu_data_set_coordinates(*handleptr, 1, bz);}static void free_block_on_node(starpu_data_handle_t handleptr, unsigned nx, unsigned ny, unsigned nz){	void *ptr = (void *) starpu_block_get_local_ptr(handleptr);	size_t block_size = nx*ny*nz*sizeof(TYPE);	starpu_data_unregister(handleptr);	starpu_free_flags(ptr, block_size, STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);}void display_memory_consumption(int rank, double time){	FPRINTF(stderr, "%lu B of memory were allocated on node %d in %f ms\n", (unsigned long)allocated, rank, time/1000);}void allocate_memory_on_node(int rank){	unsigned bz;	/* Correctly allocate and declare all data handles to StarPU. */	for (bz = 0; bz < nbz; bz++)	{		struct block_description *block = get_block_description(bz);		int node = block->mpi_node;		unsigned size_bz = block_sizes_z[bz];		if (node == rank)		{			/* Main blocks */			allocate_block_on_node(&block->layers_handle[0], bz, &block->layers[0],					       (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));			allocate_block_on_node(&block->layers_handle[1], bz, &block->layers[1],					       (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));			/* Boundary blocks : Top */			allocate_block_on_node(&block->boundaries_handle[T][0], bz, &block->boundaries[T][0],					       (sizex + 2*K), (sizey + 2*K), K);			allocate_block_on_node(&block->boundaries_handle[T][1], bz, &block->boundaries[T][1],					       (sizex + 2*K), (sizey + 2*K), K);			/* Boundary blocks : Bottom */			allocate_block_on_node(&block->boundaries_handle[B][0], bz, &block->boundaries[B][0],					       (sizex + 2*K), (sizey + 2*K), K);			allocate_block_on_node(&block->boundaries_handle[B][1], bz, &block->boundaries[B][1],					       (sizex + 2*K), (sizey + 2*K), K);		}		/* Register void blocks to StarPU, that StarPU-MPI will request to		 * neighbour nodes if needed for the local computation */		else		{			/* Main blocks */			starpu_block_data_register(&block->layers_handle[0], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K), sizeof(TYPE));			starpu_block_data_register(&block->layers_handle[1], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K), sizeof(TYPE));			/* Boundary blocks : Top */			starpu_block_data_register(&block->boundaries_handle[T][0], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));			starpu_block_data_register(&block->boundaries_handle[T][1], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));			/* Boundary blocks : Bottom */			starpu_block_data_register(&block->boundaries_handle[B][0], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));			starpu_block_data_register(&block->boundaries_handle[B][1], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));		}#if defined(STARPU_USE_MPI)  && !defined(STARPU_USE_MPI_MASTER_SLAVE)		/* Register all data to StarPU-MPI, even the ones that are not		 * allocated on the local node. */		/* Main blocks */		starpu_mpi_data_register(block->layers_handle[0], MPI_TAG_LAYERS(bz, 0), node);		starpu_mpi_data_register(block->layers_handle[1], MPI_TAG_LAYERS(bz, 1), node);		/* Boundary blocks : Top */		starpu_mpi_data_register(block->boundaries_handle[T][0], MPI_TAG_BOUNDARIES(bz, T, 0), node);		starpu_mpi_data_register(block->boundaries_handle[T][1], MPI_TAG_BOUNDARIES(bz, T, 1), node);		/* Boundary blocks : Bottom */		starpu_mpi_data_register(block->boundaries_handle[B][0], MPI_TAG_BOUNDARIES(bz, B, 0), node);		starpu_mpi_data_register(block->boundaries_handle[B][1], MPI_TAG_BOUNDARIES(bz, B, 1), node);#endif	}	/* Initialize all the data in parallel */	for (bz = 0; bz < nbz; bz++)	{		struct block_description *block = get_block_description(bz);		int node = block->mpi_node;		if (node == rank)		{			/* Set all the data to 0 */			create_task_memset(sizex, sizey, bz);			/* Initialize the first layer with some random data */			create_task_initlayer(sizex, sizey, bz);		}	}	starpu_task_wait_for_all();}void free_memory_on_node(int rank){	unsigned bz;	for (bz = 0; bz < nbz; bz++)	{		struct block_description *block = get_block_description(bz);		int node = block->mpi_node;		/* Main blocks */		if (node == rank)		{			free_block_on_node(block->layers_handle[0], (sizex + 2*K), (sizey + 2*K), K);			free_block_on_node(block->layers_handle[1], (sizex + 2*K), (sizey + 2*K), K);		}        else        {            starpu_data_unregister(block->layers_handle[0]);            starpu_data_unregister(block->layers_handle[1]);        }		/* Boundary blocks : Top */		if (node == rank)		{			free_block_on_node(block->boundaries_handle[T][0], (sizex + 2*K), (sizey + 2*K), K);			free_block_on_node(block->boundaries_handle[T][1], (sizex + 2*K), (sizey + 2*K), K);		}        else        {            starpu_data_unregister(block->boundaries_handle[T][0]);            starpu_data_unregister(block->boundaries_handle[T][1]);        }		/* Boundary blocks : Bottom */		if (node == rank)		{			free_block_on_node(block->boundaries_handle[B][0], (sizex + 2*K), (sizey + 2*K), K);			free_block_on_node(block->boundaries_handle[B][1], (sizex + 2*K), (sizey + 2*K), K);		}        else        {            starpu_data_unregister(block->boundaries_handle[B][0]);            starpu_data_unregister(block->boundaries_handle[B][1]);        }	}}/* check how many cells are alive */void check(int rank){	unsigned bz;	for (bz = 0; bz < nbz; bz++)	{		struct block_description *block = get_block_description(bz);		int node = block->mpi_node;		/* Main blocks */		if (node == rank)		{			unsigned size_bz = block_sizes_z[bz];#ifdef LIFE			unsigned x, y, z;			unsigned sum = 0;			for (x = 0; x < sizex; x++)				for (y = 0; y < sizey; y++)					for (z = 0; z < size_bz; z++)						sum += block->layers[0][(K+x)+(K+y)*(sizex + 2*K)+(K+z)*(sizex+2*K)*(sizey+2*K)];			printf("block %u got %u/%u alive\n", bz, sum, sizex*sizey*size_bz);#endif		}	}}
 |