/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2010-2015  Université de Bordeaux
 * Copyright (C) 2011, 2012, 2013       Centre National de la Recherche Scientifique
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/#include <starpu.h>#include <common/utils.h>#include <core/workers.h>#include <math.h>#include <core/detect_combined_workers.h>#ifdef STARPU_HAVE_HWLOC#include <hwloc.h>static void find_workers(hwloc_obj_t obj, int cpu_workers[STARPU_NMAXWORKERS], unsigned *n){	if (!obj->userdata)		/* Not something we run something on, don't care */		return;	if (obj->userdata == (void*) -1)	{		/* Intra node, recurse */		unsigned i;		for (i = 0; i < obj->arity; i++)			find_workers(obj->children[i], cpu_workers, n);		return;	}	/* Got to a PU leaf */	struct _starpu_worker_list *workers = obj->userdata;	struct _starpu_worker *worker;	for(worker = _starpu_worker_list_begin(workers); worker != _starpu_worker_list_end(workers); worker = _starpu_worker_list_next(worker))	{		/* is it a CPU worker? */		if (worker->perf_arch.devices[0].type == STARPU_CPU_WORKER && worker->perf_arch.devices[0].ncores == 1)		{			_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);			/* Add it to the combined worker */			cpu_workers[(*n)++] = worker->workerid;		}	}}static void synthesize_intermediate_workers(hwloc_obj_t *children, unsigned min, unsigned max, unsigned arity, unsigned n, unsigned synthesize_arity){	unsigned nworkers, i, j;	unsigned chunk_size = (n + synthesize_arity-1) / synthesize_arity;	unsigned chunk_start;	int cpu_workers[STARPU_NMAXWORKERS];	int ret;	if (n <= synthesize_arity)		/* Not too many children, do not synthesize */		return;	_STARPU_DEBUG("%u children > %u, synthesizing intermediate combined workers of size %u\n", n, synthesize_arity, chunk_size);	n = 0;	j = 0;	nworkers = 0;	chunk_start = 0;	for (i = 0 ; i < arity; i++)	{		if (children[i]->userdata)		{			n++;			_STARPU_DEBUG("child %u\n", i);			find_workers(children[i], cpu_workers, &nworkers);			j++;		}		/* Completed a chunk, or last bit (but not if it's just 1 subobject) */		if (j == chunk_size || (i == arity-1 && j > 1))		{			if (nworkers >= min && nworkers <= max)			{				unsigned sched_ctx_id  = 
starpu_sched_ctx_get_context();				if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)					sched_ctx_id = 0;				struct starpu_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);				_STARPU_DEBUG("Adding it\n");				ret = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);				STARPU_ASSERT(ret >= 0);				workers->add(workers,ret);			}			/* Recurse there */			synthesize_intermediate_workers(children+chunk_start, min, max, i - chunk_start, n, synthesize_arity);			/* And restart another one */			n = 0;			j = 0;			nworkers = 0;			chunk_start = i+1;		}	}}static void find_and_assign_combinations(hwloc_obj_t obj, unsigned min, unsigned max, unsigned synthesize_arity){	char name[64];	unsigned i, n, nworkers;	int cpu_workers[STARPU_NMAXWORKERS];	struct _starpu_machine_config *config = _starpu_get_machine_config();	struct _starpu_machine_topology *topology = &config->topology;	hwloc_obj_snprintf(name, sizeof(name), topology->hwtopology, obj, "#", 0);	_STARPU_DEBUG("Looking at %s\n", name);	for (n = 0, i = 0; i < obj->arity; i++)		if (obj->children[i]->userdata)			/* it has a CPU worker */			n++;	if (n == 1)	{		/* If there is only one child, we go to the next level right away */		find_and_assign_combinations(obj->children[0], min, max, synthesize_arity);		return;	}	/* Add this object */	nworkers = 0;	find_workers(obj, cpu_workers, &nworkers);	if (nworkers >= min && nworkers <= max)	{		_STARPU_DEBUG("Adding it\n");		unsigned sched_ctx_id  = starpu_sched_ctx_get_context();		if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)			sched_ctx_id = 0;		struct starpu_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);		int newworkerid = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);		STARPU_ASSERT(newworkerid >= 0);		workers->add(workers,newworkerid);	}	/* Add artificial intermediate objects recursively */	synthesize_intermediate_workers(obj->children, min, max, obj->arity, n, synthesize_arity);	/* And recurse */	for 
(i = 0; i < obj->arity; i++)		if (obj->children[i]->userdata == (void*) -1)			find_and_assign_combinations(obj->children[i], min, max, synthesize_arity);}static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers){	struct _starpu_machine_config *config = _starpu_get_machine_config();	struct _starpu_machine_topology *topology = &config->topology;	int synthesize_arity = starpu_get_env_number("STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER");	int min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");	if (min < 2)		min = 2;	int max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");	if (max == -1)		max = INT_MAX;	if (synthesize_arity == -1)		synthesize_arity = 2;	/* First, mark nodes which contain CPU workers, simply by setting their userdata field */	int i;	for (i = 0; i < nworkers; i++)	{		struct _starpu_worker *worker = _starpu_get_worker_struct(workerids[i]);		if (worker->perf_arch.devices[0].type == STARPU_CPU_WORKER && worker->perf_arch.devices[0].ncores == 1)		{			hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->pu_depth, worker->bindid);			obj = obj->parent;			while (obj)			{				obj->userdata = (void*) -1;				obj = obj->parent;			}		}	}	find_and_assign_combinations(hwloc_get_root_obj(topology->hwtopology), min, max, synthesize_arity);}#else /* STARPU_HAVE_HWLOC */static void assign_combinations_without_hwloc(struct starpu_worker_collection* worker_collection, int* workers, unsigned n, int min, int max){	int size,i,count =0;	//if the maximun number of worker is already reached	if(worker_collection->nworkers >= STARPU_NMAXWORKERS - 1)		return;	for (size = min; size <= max; size *= 2)	{		unsigned first;		for (first = 0; first < n; first += size)		{			if (first + size <= n)			{				int found_workerids[size];				for (i = 0; i < size; i++)					found_workerids[i] = workers[first + i];				/* We register this combination */				int newworkerid;				newworkerid = starpu_combined_worker_assign_workerid(size, found_workerids);				
STARPU_ASSERT(newworkerid >= 0);				count++;				worker_collection->add(worker_collection, newworkerid);				//if the maximun number of worker is reached, then return				if(worker_collection->nworkers >= STARPU_NMAXWORKERS - 1)					return;			}		}	}}static void find_and_assign_combinations_without_hwloc(int *workerids, int nworkers){	int i;	unsigned sched_ctx_id  = starpu_sched_ctx_get_context();	if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)		sched_ctx_id = 0;	int min, max;#ifdef STARPU_USE_MIC	unsigned j;	int mic_min, mic_max;#endif	struct starpu_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);	/* We put the id of all CPU workers in this array */	int cpu_workers[STARPU_NMAXWORKERS];	unsigned ncpus = 0;#ifdef STARPU_USE_MIC	unsigned nb_mics = _starpu_get_machine_config()->topology.nmicdevices;	unsigned * nmics_table;	int * mic_id;	int ** mic_workers;	mic_id = malloc(sizeof(int)*nb_mics);	nmics_table = malloc(sizeof(unsigned)*nb_mics);	mic_workers = malloc(sizeof(int*)*nb_mics);	for(j=0; j<nb_mics; j++)	{		mic_id[j] = -1;		nmics_table[j] = 0;		mic_workers[j] = malloc(sizeof(int)*STARPU_NMAXWORKERS);	}#endif /* STARPU_USE_MIC */	struct _starpu_worker *worker;	for (i = 0; i < nworkers; i++)	{		worker = _starpu_get_worker_struct(workerids[i]);		if (worker->arch == STARPU_CPU_WORKER)			cpu_workers[ncpus++] = i;#ifdef STARPU_USE_MIC		else if(worker->arch == STARPU_MIC_WORKER)		{			for(j=0; mic_id[j] != worker->devid && mic_id[j] != -1 && j<nb_mics; j++);			if(j<nb_mics)			{				if(mic_id[j] == -1)				{					mic_id[j] = worker->devid;									}				mic_workers[j][nmics_table[j]++] = i;			}		}#endif /* STARPU_USE_MIC */	}	min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");	if (min < 2)		min = 2;	max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");	if (max == -1 || max > (int) ncpus)		max = ncpus;		assign_combinations_without_hwloc(workers,cpu_workers,ncpus,min,max);#ifdef STARPU_USE_MIC	mic_min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");	
if (mic_min < 2)		mic_min = 2;	for(j=0; j<nb_mics; j++)	{		mic_max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");		if (mic_max == -1 || mic_max > (int) nmics_table[j])			mic_max = nmics_table[j];		assign_combinations_without_hwloc(workers,mic_workers[j],nmics_table[j],mic_min,mic_max);		free(mic_workers[j]);	}	free(mic_id);	free(nmics_table);	free(mic_workers);#endif /* STARPU_USE_MIC */}#endif /* STARPU_HAVE_HWLOC */static void combine_all_cpu_workers(int *workerids, int nworkers){	unsigned sched_ctx_id  = starpu_sched_ctx_get_context();	if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)		sched_ctx_id = 0;	struct starpu_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);	int cpu_workers[STARPU_NMAXWORKERS];	int ncpus = 0;	struct _starpu_worker *worker;	int i;	int min;	int max;	for (i = 0; i < nworkers; i++)	{		worker = _starpu_get_worker_struct(workerids[i]);		if (worker->arch == STARPU_CPU_WORKER)			cpu_workers[ncpus++] = workerids[i];	}	min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");	if (min < 1)		min = 1;	max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");	if (max == -1 || max > ncpus)		max = ncpus;	for (i = min; i <= max; i++)	{		int newworkerid;		newworkerid = starpu_combined_worker_assign_workerid(i, cpu_workers);		STARPU_ASSERT(newworkerid >= 0);		workers->add(workers, newworkerid);	}}void _starpu_sched_find_worker_combinations(int *workerids, int nworkers){	struct _starpu_machine_config *config = _starpu_get_machine_config();	if (config->conf->single_combined_worker > 0)		combine_all_cpu_workers(workerids, nworkers);	else	{#ifdef STARPU_HAVE_HWLOC		find_and_assign_combinations_with_hwloc(workerids, nworkers);#else		find_and_assign_combinations_without_hwloc(workerids, nworkers);#endif	}}
 |