Browse source code

inline the trivial functions accessing starpu_config

Samuel Thibault, 9 years ago
parent
commit
160b44ac45

+ 4 - 4
src/core/sched_ctx.c

@@ -1910,6 +1910,9 @@ void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid)
 
 unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id)
 {
+	if (_starpu_get_nsched_ctxs() <= 1)
+		return STARPU_NMAX_SCHED_CTXS;
+
 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
 	struct _starpu_sched_ctx_elt *e = NULL;
 	struct _starpu_sched_ctx_list_iterator list_it;
@@ -1944,11 +1947,8 @@ unsigned starpu_sched_ctx_master_get_context(int masterid)
 	return STARPU_NMAX_SCHED_CTXS;
 }
 
-struct _starpu_sched_ctx *_starpu_sched_ctx_get_sched_ctx_for_worker_and_job(struct _starpu_worker *worker, struct _starpu_job *j)
+struct _starpu_sched_ctx *__starpu_sched_ctx_get_sched_ctx_for_worker_and_job(struct _starpu_worker *worker, struct _starpu_job *j)
 {
-	if (_starpu_get_nsched_ctxs() == 1)
-		return _starpu_get_sched_ctx_struct(0);
-
 	struct _starpu_sched_ctx_elt *e = NULL;
 	struct _starpu_sched_ctx_list_iterator list_it;
 	struct _starpu_sched_ctx *sched_ctx = NULL;

+ 4 - 4
src/core/sched_ctx.h

@@ -207,9 +207,6 @@ int _starpu_wait_for_no_ready_of_sched_ctx(unsigned sched_ctx_id);
 /* Return the corresponding index of the workerid in the ctx table */
 int _starpu_get_index_in_ctx_of_workerid(unsigned sched_ctx, unsigned workerid);
 
-/* Get the total number of sched_ctxs created till now */
-unsigned _starpu_get_nsched_ctxs();
-
 /* Get the mutex corresponding to the global workerid */
 starpu_pthread_mutex_t *_starpu_get_sched_mutex(struct _starpu_sched_ctx *sched_ctx, int worker);
 
@@ -256,6 +253,9 @@ void _starpu_sched_ctx_post_exec_task_cb(int workerid, struct starpu_task *task,
 #endif //STARPU_USE_SC_HYPERVISOR
 
 /* if the worker is the master of a parallel context, and the job is meant to be executed on this parallel context, return a pointer to the context */
-struct _starpu_sched_ctx *_starpu_sched_ctx_get_sched_ctx_for_worker_and_job(struct _starpu_worker *worker, struct _starpu_job *j);
+struct _starpu_sched_ctx *__starpu_sched_ctx_get_sched_ctx_for_worker_and_job(struct _starpu_worker *worker, struct _starpu_job *j);
+
+#define _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(w,j) \
+	(_starpu_get_nsched_ctxs() <= 1 ? _starpu_get_sched_ctx_struct(0) : __starpu_sched_ctx_get_sched_ctx_for_worker_and_job((w),(j)))
 
 #endif // __SCHED_CONTEXT_H__

+ 0 - 13
src/core/task.c

@@ -285,19 +285,6 @@ int _starpu_task_test_termination(struct starpu_task *task)
 }
 #endif
 
-struct _starpu_job *_starpu_get_job_associated_to_task(struct starpu_task *task)
-{
-	STARPU_ASSERT(task);
-
-	if (!task->starpu_private)
-	{
-		struct _starpu_job *j = _starpu_job_create(task);
-		task->starpu_private = j;
-	}
-
-	return (struct _starpu_job *)task->starpu_private;
-}
-
 /* NB in case we have a regenerable task, it is possible that the job was
  * already counted. */
 int _starpu_submit_job(struct _starpu_job *j)

+ 14 - 2
src/core/task.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2015  Université de Bordeaux
+ * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2013, 2015  CNRS
  * Copyright (C) 2011, 2014 INRIA
  *
@@ -49,7 +49,19 @@ void _starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, s
 
 /* Returns the job structure (which is the internal data structure associated
  * to a task). */
-struct _starpu_job *_starpu_get_job_associated_to_task(struct starpu_task *task);
+static inline struct _starpu_job *_starpu_get_job_associated_to_task(struct starpu_task *task)
+{
+	STARPU_ASSERT(task);
+	struct _starpu_job *job = task->starpu_private;
+
+	if (STARPU_UNLIKELY(!job))
+	{
+		job = _starpu_job_create(task);
+		task->starpu_private = job;
+	}
+
+	return job;
+}
 
 /* Submits starpu internal tasks to the initial context */
 int _starpu_task_submit_internally(struct starpu_task *task);

+ 113 - 211
src/core/workers.c

@@ -53,14 +53,13 @@ static starpu_pthread_cond_t init_cond = STARPU_PTHREAD_COND_INITIALIZER;
 static int init_count = 0;
 static enum { UNINITIALIZED, CHANGING, INITIALIZED } initialized = UNINITIALIZED;
 
-static int keys_initialized;
-static starpu_pthread_key_t worker_key;
-static starpu_pthread_key_t worker_set_key;
+int _starpu_keys_initialized STARPU_ATTRIBUTE_INTERNAL;
+starpu_pthread_key_t _starpu_worker_key STARPU_ATTRIBUTE_INTERNAL;
+starpu_pthread_key_t _starpu_worker_set_key STARPU_ATTRIBUTE_INTERNAL;
 
-static struct _starpu_machine_config config;
+struct _starpu_machine_config _starpu_config STARPU_ATTRIBUTE_INTERNAL;
 
 static int check_entire_platform;
-static int disable_kernels;
 
 /* Pointers to argc and argv
  */
@@ -90,16 +89,6 @@ int _starpu_is_initialized(void)
 	return initialized == INITIALIZED;
 }
 
-struct _starpu_machine_config *_starpu_get_machine_config(void)
-{
-	return &config;
-}
-
-int _starpu_get_disable_kernels(void)
-{
-	return disable_kernels;
-}
-
 /* Makes sure that at least one of the workers of type <arch> can execute
  * <task>, for at least one of its implementations. */
 static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
@@ -184,7 +173,7 @@ uint32_t _starpu_worker_exists(struct starpu_task *task)
 	   and verify if it exists a worker able to exec the task */
 	if(task->sched_ctx == 0)
 	{
-		if (!(task->cl->where & config.worker_mask))
+		if (!(task->cl->where & _starpu_config.worker_mask))
 			return 0;
 
 		if (!task->cl->can_execute)
@@ -222,22 +211,22 @@ uint32_t _starpu_worker_exists(struct starpu_task *task)
 
 uint32_t _starpu_can_submit_cuda_task(void)
 {
-	return (STARPU_CUDA & config.worker_mask);
+	return (STARPU_CUDA & _starpu_config.worker_mask);
 }
 
 uint32_t _starpu_can_submit_cpu_task(void)
 {
-	return (STARPU_CPU & config.worker_mask);
+	return (STARPU_CPU & _starpu_config.worker_mask);
 }
 
 uint32_t _starpu_can_submit_opencl_task(void)
 {
-	return (STARPU_OPENCL & config.worker_mask);
+	return (STARPU_OPENCL & _starpu_config.worker_mask);
 }
 
 uint32_t _starpu_can_submit_scc_task(void)
 {
-	return (STARPU_SCC & config.worker_mask);
+	return (STARPU_SCC & _starpu_config.worker_mask);
 }
 
 static inline int _starpu_can_use_nth_implementation(enum starpu_worker_archtype arch, struct starpu_codelet *cl, unsigned nimpl)
@@ -307,8 +296,8 @@ int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task,
 	if(sched_ctx->parallel_sect[workerid] ) return 0;
 
 	/* TODO: check that the task operand sizes will fit on that device */
-	return (task->cl->where & config.workers[workerid].worker_mask) &&
-		_starpu_can_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl) &&
+	return (task->cl->where & _starpu_config.workers[workerid].worker_mask) &&
+		_starpu_can_use_nth_implementation(_starpu_config.workers[workerid].arch, task->cl, nimpl) &&
 		(!task->cl->can_execute || task->cl->can_execute(workerid, task, nimpl));
 }
 
@@ -325,10 +314,10 @@ int starpu_worker_can_execute_task_impl(unsigned workerid, struct starpu_task *t
 	struct starpu_codelet *cl;
 	/* TODO: check that the task operand sizes will fit on that device */
 	cl = task->cl;
-	if (!(cl->where & config.workers[workerid].worker_mask)) return 0;
+	if (!(cl->where & _starpu_config.workers[workerid].worker_mask)) return 0;
 
 	mask = 0;
-	arch = config.workers[workerid].arch;
+	arch = _starpu_config.workers[workerid].arch;
 	if (!task->cl->can_execute)
 	{
 		for (i = 0; i < STARPU_MAXIMPLEMENTATIONS; i++)
@@ -364,9 +353,9 @@ int starpu_worker_can_execute_task_first_impl(unsigned workerid, struct starpu_t
 	if(sched_ctx->parallel_sect[workerid]) return 0;
 	/* TODO: check that the task operand sizes will fit on that device */
 	cl = task->cl;
-	if (!(cl->where & config.workers[workerid].worker_mask)) return 0;
+	if (!(cl->where & _starpu_config.workers[workerid].worker_mask)) return 0;
 
-	arch = config.workers[workerid].arch;
+	arch = _starpu_config.workers[workerid].arch;
 	if (!task->cl->can_execute)
 	{
 		for (i = 0; i < STARPU_MAXIMPLEMENTATIONS; i++)
@@ -398,13 +387,13 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
 	/* TODO: check that the task operand sizes will fit on that device */
 
 	struct starpu_codelet *cl = task->cl;
-	unsigned nworkers = config.topology.nworkers;
+	unsigned nworkers = _starpu_config.topology.nworkers;
 
 	/* Is this a parallel worker ? */
 	if (workerid < nworkers)
 	{
-		return !!((task->cl->where & config.workers[workerid].worker_mask) &&
-				_starpu_can_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl) &&
+		return !!((task->cl->where & _starpu_config.workers[workerid].worker_mask) &&
+				_starpu_can_use_nth_implementation(_starpu_config.workers[workerid].arch, task->cl, nimpl) &&
 				(!task->cl->can_execute || task->cl->can_execute(workerid, task, nimpl)));
 	}
 	else
@@ -423,10 +412,10 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
 			/* TODO we should add other types of constraints */
 
 			/* Is the worker larger than requested ? */
-			int worker_size = (int)config.combined_workers[workerid - nworkers].worker_size;
-			int worker0 = config.combined_workers[workerid - nworkers].combined_workerid[0];
+			int worker_size = (int)_starpu_config.combined_workers[workerid - nworkers].worker_size;
+			int worker0 = _starpu_config.combined_workers[workerid - nworkers].combined_workerid[0];
 			return !!((worker_size <= task->cl->max_parallelism) &&
-				_starpu_can_use_nth_implementation(config.workers[worker0].arch, task->cl, nimpl) &&
+				_starpu_can_use_nth_implementation(_starpu_config.workers[worker0].arch, task->cl, nimpl) &&
 				(!task->cl->can_execute || task->cl->can_execute(workerid, task, nimpl)));
 		}
 		else
@@ -876,32 +865,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 	_STARPU_DEBUG("finished launching drivers\n");
 }
 
-void _starpu_set_local_worker_key(struct _starpu_worker *worker)
-{
-	STARPU_ASSERT(keys_initialized);
-	STARPU_PTHREAD_SETSPECIFIC(worker_key, worker);
-}
-
-struct _starpu_worker *_starpu_get_local_worker_key(void)
-{
-	if (!keys_initialized)
-		return NULL;
-	return (struct _starpu_worker *) STARPU_PTHREAD_GETSPECIFIC(worker_key);
-}
-
-void _starpu_set_local_worker_set_key(struct _starpu_worker_set *worker)
-{
-	STARPU_ASSERT(keys_initialized);
-	STARPU_PTHREAD_SETSPECIFIC(worker_set_key, worker);
-}
-
-struct _starpu_worker_set *_starpu_get_local_worker_set_key(void)
-{
-	if (!keys_initialized)
-		return NULL;
-	return (struct _starpu_worker_set *) STARPU_PTHREAD_GETSPECIFIC(worker_set_key);
-}
-
 /* Initialize the starpu_conf with default values */
 int starpu_conf_init(struct starpu_conf *conf)
 {
@@ -1026,7 +989,7 @@ void _starpu_conf_check_environment(struct starpu_conf *conf)
 
 struct starpu_tree* starpu_workers_get_tree(void)
 {
-	return config.topology.tree;
+	return _starpu_config.topology.tree;
 }
 
 #ifdef STARPU_HAVE_HWLOC
@@ -1048,9 +1011,9 @@ static void _starpu_build_tree(void)
 {
 #ifdef STARPU_HAVE_HWLOC
 	struct starpu_tree* tree = (struct starpu_tree*)malloc(sizeof(struct starpu_tree));
-	config.topology.tree = tree;
+	_starpu_config.topology.tree = tree;
 
-	hwloc_obj_t root = hwloc_get_root_obj(config.topology.hwtopology);
+	hwloc_obj_t root = hwloc_get_root_obj(_starpu_config.topology.hwtopology);
 
 /* 	char string[128]; */
 /* 	hwloc_obj_snprintf(string, sizeof(string), topology, root, "#", 0); */
@@ -1058,7 +1021,7 @@ static void _starpu_build_tree(void)
 
 	/* level, is_pu, is in the tree (it will be true only after add*/
 	starpu_tree_insert(tree, root->logical_index, 0,root->type == HWLOC_OBJ_PU, root->arity, NULL);
-	_fill_tree(tree, root, 1, config.topology.hwtopology);
+	_fill_tree(tree, root, 1, _starpu_config.topology.hwtopology);
 #endif
 }
 
@@ -1182,7 +1145,7 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	/* store the pointer to the user explicit configuration during the
 	 * initialization */
 	if (user_conf == NULL)
-		 starpu_conf_init(&config.conf);
+		 starpu_conf_init(&_starpu_config.conf);
 	else
 	{
 		if (user_conf->magic != 42)
@@ -1190,37 +1153,37 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 			_STARPU_DISP("starpu_conf structure needs to be initialized with starpu_conf_init\n");
 			return -EINVAL;
 		}
-		config.conf = *user_conf;
+		_starpu_config.conf = *user_conf;
 	}
-	_starpu_conf_check_environment(&config.conf);
+	_starpu_conf_check_environment(&_starpu_config.conf);
 
 	/* Make a copy of arrays */
-	if (config.conf.sched_policy_name)
-		config.conf.sched_policy_name = strdup(config.conf.sched_policy_name);
-	if (config.conf.mic_sink_program_path)
-		config.conf.mic_sink_program_path = strdup(config.conf.mic_sink_program_path);
-	if (config.conf.n_cuda_opengl_interoperability)
+	if (_starpu_config.conf.sched_policy_name)
+		_starpu_config.conf.sched_policy_name = strdup(_starpu_config.conf.sched_policy_name);
+	if (_starpu_config.conf.mic_sink_program_path)
+		_starpu_config.conf.mic_sink_program_path = strdup(_starpu_config.conf.mic_sink_program_path);
+	if (_starpu_config.conf.n_cuda_opengl_interoperability)
 	{
-		size_t size = config.conf.n_cuda_opengl_interoperability * sizeof(*config.conf.cuda_opengl_interoperability);
+		size_t size = _starpu_config.conf.n_cuda_opengl_interoperability * sizeof(*_starpu_config.conf.cuda_opengl_interoperability);
 		unsigned *copy = malloc(size);
-		memcpy(copy, config.conf.cuda_opengl_interoperability, size);
-		config.conf.cuda_opengl_interoperability = copy;
+		memcpy(copy, _starpu_config.conf.cuda_opengl_interoperability, size);
+		_starpu_config.conf.cuda_opengl_interoperability = copy;
 	}
-	if (config.conf.n_not_launched_drivers)
+	if (_starpu_config.conf.n_not_launched_drivers)
 	{
-		size_t size = config.conf.n_not_launched_drivers * sizeof(*config.conf.not_launched_drivers);
+		size_t size = _starpu_config.conf.n_not_launched_drivers * sizeof(*_starpu_config.conf.not_launched_drivers);
 		struct starpu_driver *copy = malloc(size);
-		memcpy(copy, config.conf.not_launched_drivers, size);
-		config.conf.not_launched_drivers = copy;
+		memcpy(copy, _starpu_config.conf.not_launched_drivers, size);
+		_starpu_config.conf.not_launched_drivers = copy;
 	}
 
-	_starpu_init_all_sched_ctxs(&config);
+	_starpu_init_all_sched_ctxs(&_starpu_config);
 	_starpu_init_progression_hooks();
 
 	_starpu_init_tags();
 
 #ifdef STARPU_USE_FXT
-	_starpu_fxt_init_profiling(config.conf.trace_buffer_size);
+	_starpu_fxt_init_profiling(_starpu_config.conf.trace_buffer_size);
 #endif
 
 	_starpu_open_debug_logfile();
@@ -1235,14 +1198,14 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
 	/* Depending on whether we are a MP sink or not, we must build the
 	 * topology with MP nodes or not. */
-	ret = _starpu_build_topology(&config, is_a_sink);
+	ret = _starpu_build_topology(&_starpu_config, is_a_sink);
 	if (ret)
 	{
 		starpu_perfmodel_free_sampling_directories();
 		STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 		init_count--;
 
-		_starpu_destroy_machine_config(&config);
+		_starpu_destroy_machine_config(&_starpu_config);
 
 #ifdef STARPU_USE_SCC
 		if (_starpu_scc_common_is_mp_initialized())
@@ -1257,28 +1220,28 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
 	_starpu_task_init();
 
-	for (worker = 0; worker < config.topology.nworkers; worker++)
-		_starpu_worker_init(&config.workers[worker], &config);
+	for (worker = 0; worker < _starpu_config.topology.nworkers; worker++)
+		_starpu_worker_init(&_starpu_config.workers[worker], &_starpu_config);
 
 	check_entire_platform = starpu_get_env_number("STARPU_CHECK_ENTIRE_PLATFORM");
-	disable_kernels = starpu_get_env_number("STARPU_DISABLE_KERNELS");
-	STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);
-	STARPU_PTHREAD_KEY_CREATE(&worker_set_key, NULL);
-	keys_initialized = 1;
+	_starpu_config.disable_kernels = starpu_get_env_number("STARPU_DISABLE_KERNELS");
+	STARPU_PTHREAD_KEY_CREATE(&_starpu_worker_key, NULL);
+	STARPU_PTHREAD_KEY_CREATE(&_starpu_worker_set_key, NULL);
+	_starpu_keys_initialized = 1;
 
 	_starpu_build_tree();
 
 	if (!is_a_sink)
 	{
-		struct starpu_sched_policy *selected_policy = _starpu_select_sched_policy(&config, config.conf.sched_policy_name);
-		_starpu_create_sched_ctx(selected_policy, NULL, -1, 1, "init", (config.conf.global_sched_ctx_min_priority != -1), config.conf.global_sched_ctx_min_priority, (config.conf.global_sched_ctx_min_priority != -1), config.conf.global_sched_ctx_max_priority, 1, config.conf.sched_policy_init);
+		struct starpu_sched_policy *selected_policy = _starpu_select_sched_policy(&_starpu_config, _starpu_config.conf.sched_policy_name);
+		_starpu_create_sched_ctx(selected_policy, NULL, -1, 1, "init", (_starpu_config.conf.global_sched_ctx_min_priority != -1), _starpu_config.conf.global_sched_ctx_min_priority, (_starpu_config.conf.global_sched_ctx_min_priority != -1), _starpu_config.conf.global_sched_ctx_max_priority, 1, _starpu_config.conf.sched_policy_init);
 	}
 
 	_starpu_initialize_registered_performance_models();
 
 	/* Launch "basic" workers (ie. non-combined workers) */
 	if (!is_a_sink)
-		_starpu_launch_drivers(&config);
+		_starpu_launch_drivers(&_starpu_config);
 
 	/* Allocate swap, if any */
 	_starpu_swap_init();
@@ -1387,10 +1350,10 @@ void _starpu_may_pause(void)
 	/* pause_depth is just protected by a memory barrier */
 	STARPU_RMB();
 
-	if (STARPU_UNLIKELY(config.pause_depth > 0))
+	if (STARPU_UNLIKELY(_starpu_config.pause_depth > 0))
 	{
 		STARPU_PTHREAD_MUTEX_LOCK(&pause_mutex);
-		if (config.pause_depth > 0)
+		if (_starpu_config.pause_depth > 0)
 		{
 			STARPU_PTHREAD_COND_WAIT(&pause_cond, &pause_mutex);
 		}
@@ -1398,29 +1361,17 @@ void _starpu_may_pause(void)
 	}
 }
 
-unsigned _starpu_machine_is_running(void)
-{
-	unsigned ret;
-	/* running is just protected by a memory barrier */
-	STARPU_RMB();
-
-	ANNOTATE_HAPPENS_AFTER(&config.running);
-	ret = config.running;
-	ANNOTATE_HAPPENS_BEFORE(&config.running);
-	return ret;
-}
-
 void starpu_pause()
 {
-	STARPU_HG_DISABLE_CHECKING(config.pause_depth);
-	config.pause_depth += 1;
+	STARPU_HG_DISABLE_CHECKING(_starpu_config.pause_depth);
+	_starpu_config.pause_depth += 1;
 }
 
 void starpu_resume()
 {
 	STARPU_PTHREAD_MUTEX_LOCK(&pause_mutex);
-	config.pause_depth -= 1;
-	if (!config.pause_depth)
+	_starpu_config.pause_depth -= 1;
+	if (!_starpu_config.pause_depth)
 	{
 		STARPU_PTHREAD_COND_BROADCAST(&pause_cond);
 	}
@@ -1452,7 +1403,7 @@ unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED, stru
 	default:
 		goto always_launch;
 	}
-	if (!_starpu_may_launch_driver(&config.conf, &driver))
+	if (!_starpu_may_launch_driver(&_starpu_config.conf, &driver))
 		return 0;
 
 always_launch:
@@ -1475,10 +1426,10 @@ always_launch:
 static void _starpu_kill_all_workers(struct _starpu_machine_config *pconfig)
 {
 	/* set the flag which will tell workers to stop */
-	ANNOTATE_HAPPENS_AFTER(&config.running);
+	ANNOTATE_HAPPENS_AFTER(&_starpu_config.running);
 	pconfig->running = 0;
 	/* running is just protected by a memory barrier */
-	ANNOTATE_HAPPENS_BEFORE(&config.running);
+	ANNOTATE_HAPPENS_BEFORE(&_starpu_config.running);
 	STARPU_WMB();
 	starpu_wake_all_blocked_workers();
 }
@@ -1507,12 +1458,12 @@ void starpu_shutdown(void)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
 
 	/* If the workers are frozen, no progress can be made. */
-	STARPU_ASSERT(config.pause_depth <= 0);
+	STARPU_ASSERT(_starpu_config.pause_depth <= 0);
 
 	starpu_task_wait_for_no_ready();
 
 	/* tell all workers to shutdown */
-	_starpu_kill_all_workers(&config);
+	_starpu_kill_all_workers(&_starpu_config);
 
 	_starpu_free_all_automatically_allocated_buffers(STARPU_MAIN_RAM);
 
@@ -1533,7 +1484,7 @@ void starpu_shutdown(void)
 	_starpu_watchdog_shutdown();
 
 	/* wait for their termination */
-	_starpu_terminate_workers(&config);
+	_starpu_terminate_workers(&_starpu_config);
 
 	{
 	     int stats = starpu_get_env_number("STARPU_MEMORY_STATS");
@@ -1547,17 +1498,17 @@ void starpu_shutdown(void)
 	_starpu_delete_all_sched_ctxs();
 	_starpu_sched_component_workers_destroy();
 
-	for (worker = 0; worker < config.topology.nworkers; worker++)
-		_starpu_worker_deinit(&config.workers[worker]);
+	for (worker = 0; worker < _starpu_config.topology.nworkers; worker++)
+		_starpu_worker_deinit(&_starpu_config.workers[worker]);
 
 	_starpu_profiling_terminate();
 
 	_starpu_disk_unregister();
 #ifdef STARPU_HAVE_HWLOC
-	starpu_tree_free(config.topology.tree);
-	free(config.topology.tree);
+	starpu_tree_free(_starpu_config.topology.tree);
+	free(_starpu_config.topology.tree);
 #endif
-	_starpu_destroy_topology(&config);
+	_starpu_destroy_topology(&_starpu_config);
 #ifdef STARPU_USE_FXT
 	_starpu_stop_fxt_profiling();
 #endif
@@ -1572,9 +1523,9 @@ void starpu_shutdown(void)
 #endif
 	_starpu_close_debug_logfile();
 
-	keys_initialized = 0;
-	STARPU_PTHREAD_KEY_DELETE(worker_key);
-	STARPU_PTHREAD_KEY_DELETE(worker_set_key);
+	_starpu_keys_initialized = 0;
+	STARPU_PTHREAD_KEY_DELETE(_starpu_worker_key);
+	STARPU_PTHREAD_KEY_DELETE(_starpu_worker_set_key);
 
 	_starpu_task_deinit();
 
@@ -1585,12 +1536,12 @@ void starpu_shutdown(void)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
 
 	/* Clear memory */
-	free((char*) config.conf.sched_policy_name);
-	free(config.conf.mic_sink_program_path);
-	if (config.conf.n_cuda_opengl_interoperability)
-		free(config.conf.cuda_opengl_interoperability);
-	if (config.conf.n_not_launched_drivers)
-		free(config.conf.not_launched_drivers);
+	free((char*) _starpu_config.conf.sched_policy_name);
+	free(_starpu_config.conf.mic_sink_program_path);
+	if (_starpu_config.conf.n_cuda_opengl_interoperability)
+		free(_starpu_config.conf.cuda_opengl_interoperability);
+	if (_starpu_config.conf.n_not_launched_drivers)
+		free(_starpu_config.conf.not_launched_drivers);
 
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event) AYU_event(AYU_FINISH, 0, NULL);
@@ -1607,17 +1558,17 @@ void starpu_shutdown(void)
 
 unsigned starpu_worker_get_count(void)
 {
-	return config.topology.nworkers;
+	return _starpu_config.topology.nworkers;
 }
 
 unsigned starpu_worker_is_blocked(int workerid)
 {
-	return config.workers[workerid].blocked;
+	return _starpu_config.workers[workerid].blocked;
 }
 
 unsigned starpu_worker_is_slave_somewhere(int workerid)
 {
-	return config.workers[workerid].is_slave_somewhere;
+	return _starpu_config.workers[workerid].is_slave_somewhere;
 }
 
 int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
@@ -1625,19 +1576,19 @@ int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 	switch (type)
 	{
 		case STARPU_CPU_WORKER:
-			return config.topology.ncpus;
+			return _starpu_config.topology.ncpus;
 
 		case STARPU_CUDA_WORKER:
-			return config.topology.ncudagpus;
+			return _starpu_config.topology.ncudagpus;
 
 		case STARPU_OPENCL_WORKER:
-			return config.topology.nopenclgpus;
+			return _starpu_config.topology.nopenclgpus;
 
 		case STARPU_MIC_WORKER:
-			return config.topology.nmicdevices;
+			return _starpu_config.topology.nmicdevices;
 
 		case STARPU_SCC_WORKER:
-			return config.topology.nsccdevices;
+			return _starpu_config.topology.nsccdevices;
 
 		default:
 			return -EINVAL;
@@ -1646,42 +1597,42 @@ int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 
 unsigned starpu_combined_worker_get_count(void)
 {
-	return config.topology.ncombinedworkers;
+	return _starpu_config.topology.ncombinedworkers;
 }
 
 unsigned starpu_cpu_worker_get_count(void)
 {
-	return config.topology.ncpus;
+	return _starpu_config.topology.ncpus;
 }
 
 unsigned starpu_cuda_worker_get_count(void)
 {
-	return config.topology.ncudagpus;
+	return _starpu_config.topology.ncudagpus;
 }
 
 unsigned starpu_opencl_worker_get_count(void)
 {
-	return config.topology.nopenclgpus;
+	return _starpu_config.topology.nopenclgpus;
 }
 
 int starpu_asynchronous_copy_disabled(void)
 {
-	return config.conf.disable_asynchronous_copy;
+	return _starpu_config.conf.disable_asynchronous_copy;
 }
 
 int starpu_asynchronous_cuda_copy_disabled(void)
 {
-	return config.conf.disable_asynchronous_cuda_copy;
+	return _starpu_config.conf.disable_asynchronous_cuda_copy;
 }
 
 int starpu_asynchronous_opencl_copy_disabled(void)
 {
-	return config.conf.disable_asynchronous_opencl_copy;
+	return _starpu_config.conf.disable_asynchronous_opencl_copy;
 }
 
 int starpu_asynchronous_mic_copy_disabled(void)
 {
-	return config.conf.disable_asynchronous_mic_copy;
+	return _starpu_config.conf.disable_asynchronous_mic_copy;
 }
 
 unsigned starpu_mic_worker_get_count(void)
@@ -1689,14 +1640,14 @@ unsigned starpu_mic_worker_get_count(void)
 	int i = 0, count = 0;
 
 	for (i = 0; i < STARPU_MAXMICDEVS; i++)
-		count += config.topology.nmiccores[i];
+		count += _starpu_config.topology.nmiccores[i];
 
 	return count;
 }
 
 unsigned starpu_scc_worker_get_count(void)
 {
-	return config.topology.nsccdevices;
+	return _starpu_config.topology.nsccdevices;
 }
 
 /* When analyzing performance, it is useful to see what is the processing unit
@@ -1704,6 +1655,7 @@ unsigned starpu_scc_worker_get_count(void)
  * processing unit actually executing it, therefore it makes no sense to use it
  * within the callbacks of SPU functions for instance. If called by some thread
  * that is not controlled by StarPU, starpu_worker_get_id returns -1. */
+#undef starpu_worker_get_id
 int starpu_worker_get_id(void)
 {
 	struct _starpu_worker * worker;
@@ -1720,6 +1672,7 @@ int starpu_worker_get_id(void)
 		return -1;
 	}
 }
+#define starpu_worker_get_id _starpu_worker_get_id
 
 int starpu_combined_worker_get_id(void)
 {
@@ -1774,33 +1727,17 @@ int starpu_combined_worker_get_rank(void)
 
 int starpu_worker_get_subworkerid(int id)
 {
-	return config.workers[id].subworkerid;
+	return _starpu_config.workers[id].subworkerid;
 }
 
 int starpu_worker_get_devid(int id)
 {
-	return config.workers[id].devid;
-}
-
-struct _starpu_worker *_starpu_get_worker_struct(unsigned id)
-{
-	return &config.workers[id];
+	return _starpu_config.workers[id].devid;
 }
 
 unsigned starpu_worker_is_combined_worker(int id)
 {
-	return id >= (int)config.topology.nworkers;
-}
-
-unsigned _starpu_get_nsched_ctxs()
-{
-	return config.topology.nsched_ctxs;
-}
-
-struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id)
-{
-	if(id == STARPU_NMAX_SCHED_CTXS) return NULL;
-	return &config.sched_ctxs[id];
+	return id >= (int)_starpu_config.topology.nworkers;
 }
 
 struct _starpu_combined_worker *_starpu_get_combined_worker_struct(unsigned id)
@@ -1810,12 +1747,12 @@ struct _starpu_combined_worker *_starpu_get_combined_worker_struct(unsigned id)
 	//_STARPU_DEBUG("basic_worker_count:%d\n",basic_worker_count);
 
 	STARPU_ASSERT(id >= basic_worker_count);
-	return &config.combined_workers[id - basic_worker_count];
+	return &_starpu_config.combined_workers[id - basic_worker_count];
 }
 
 enum starpu_worker_archtype starpu_worker_get_type(int id)
 {
-	return config.workers[id].arch;
+	return _starpu_config.workers[id].arch;
 }
 
 int starpu_worker_get_ids_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize)
@@ -1876,14 +1813,14 @@ int starpu_worker_get_by_devid(enum starpu_worker_archtype type, int devid)
 
 void starpu_worker_get_name(int id, char *dst, size_t maxlen)
 {
-	char *name = config.workers[id].name;
+	char *name = _starpu_config.workers[id].name;
 
 	snprintf(dst, maxlen, "%s", name);
 }
 
 int starpu_worker_get_bindid(int workerid)
 {
-	return config.workers[workerid].bindid;
+	return _starpu_config.workers[workerid].bindid;
 }
 
 int starpu_worker_get_workerids(int bindid, int *workerids)
@@ -1892,28 +1829,15 @@ int starpu_worker_get_workerids(int bindid, int *workerids)
 	int nw = 0;
 	unsigned id;
 	for (id = 0; id < nworkers; id++)
-		if (config.workers[id].bindid == bindid)
+		if (_starpu_config.workers[id].bindid == bindid)
 			workerids[nw++] = id;
 	return nw;
 }
 
-/* Retrieve the status which indicates what the worker is currently doing. */
-enum _starpu_worker_status _starpu_worker_get_status(int workerid)
-{
-	return config.workers[workerid].status;
-}
-
-/* Change the status of the worker which indicates what the worker is currently
- * doing (eg. executing a callback). */
-void _starpu_worker_set_status(int workerid, enum _starpu_worker_status status)
-{
-	config.workers[workerid].status = status;
-}
-
 void starpu_worker_get_sched_condition(int workerid, starpu_pthread_mutex_t **sched_mutex, starpu_pthread_cond_t **sched_cond)
 {
-	*sched_cond = &config.workers[workerid].sched_cond;
-	*sched_mutex = &config.workers[workerid].sched_mutex;
+	*sched_cond = &_starpu_config.workers[workerid].sched_cond;
+	*sched_mutex = &_starpu_config.workers[workerid].sched_mutex;
 }
 
 int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex)
@@ -1921,9 +1845,9 @@ int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *cond, starp
 #ifdef STARPU_SIMGRID
 	starpu_pthread_queue_broadcast(&_starpu_simgrid_task_queue[workerid]);
 #endif
-	if (config.workers[workerid].status == STATUS_SLEEPING)
+	if (_starpu_config.workers[workerid].status == STATUS_SLEEPING)
 	{
-		config.workers[workerid].status = STATUS_WAKING_UP;
+		_starpu_config.workers[workerid].status = STATUS_WAKING_UP;
 		STARPU_PTHREAD_COND_SIGNAL(cond);
 		return 1;
 	}
@@ -1996,9 +1920,9 @@ int starpu_worker_get_nids_ctx_free_by_type(enum starpu_worker_archtype type, in
 			int s;
 			for(s = 1; s < STARPU_NMAX_SCHED_CTXS; s++)
 			{
-				if(config.sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+				if(_starpu_config.sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
 				{
-					struct starpu_worker_collection *workers = config.sched_ctxs[s].workers;
+					struct starpu_worker_collection *workers = _starpu_config.sched_ctxs[s].workers;
 					struct starpu_sched_ctx_iterator it;
 
 					workers->init_iterator(workers, &it);
@@ -2024,16 +1948,6 @@ int starpu_worker_get_nids_ctx_free_by_type(enum starpu_worker_archtype type, in
 }
 
 
-struct _starpu_sched_ctx* _starpu_get_initial_sched_ctx(void)
-{
-	return &config.sched_ctxs[STARPU_GLOBAL_SCHED_CTX];
-}
-
-int _starpu_worker_get_nsched_ctxs(int workerid)
-{
-	return config.workers[workerid].nsched_ctxs;
-}
-
 int
 starpu_driver_run(struct starpu_driver *d)
 {
@@ -2174,18 +2088,6 @@ void _starpu_relock_mutex_if_prev_locked()
 	return;
 }
 
-void _starpu_worker_set_flag_sched_mutex_locked(int workerid, unsigned flag)
-{
-	struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
-	w->sched_mutex_locked = flag;
-}
-
-unsigned _starpu_worker_mutex_is_sched_mutex(int workerid, starpu_pthread_mutex_t *mutex)
-{
-	struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
-	return &w->sched_mutex == mutex;
-}
-
 unsigned starpu_worker_get_sched_ctx_list(int workerid, unsigned **sched_ctxs)
 {
 	unsigned s = 0;

+ 111 - 16
src/core/workers.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2015  Université de Bordeaux
+ * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
  * Copyright (C) 2011  INRIA
  *
@@ -334,6 +334,8 @@ struct _starpu_machine_config
 	/* this flag is set until the runtime is stopped */
 	unsigned running;
 
+	int disable_kernels;
+
 	/* Number of calls to starpu_pause() - calls to starpu_resume(). When >0,
 	 * StarPU should pause. */
 	int pause_depth;
@@ -349,6 +351,11 @@ struct _starpu_machine_config
 	starpu_pthread_mutex_t submitted_mutex;
 };
 
+extern struct _starpu_machine_config _starpu_config STARPU_ATTRIBUTE_INTERNAL;
+extern int _starpu_keys_initialized STARPU_ATTRIBUTE_INTERNAL;
+extern starpu_pthread_key_t _starpu_worker_key STARPU_ATTRIBUTE_INTERNAL;
+extern starpu_pthread_key_t _starpu_worker_set_key STARPU_ATTRIBUTE_INTERNAL;
+
 /* Three functions to manage argv, argc */
 void _starpu_set_argc_argv(int *argc, char ***argv);
 int *_starpu_get_argc();
@@ -361,7 +368,18 @@ void _starpu_conf_check_environment(struct starpu_conf *conf);
 void _starpu_may_pause(void);
 
 /* Is the runtime still running? Returns the running flag, which is set until the runtime is stopped. */
-unsigned _starpu_machine_is_running(void);
+static inline unsigned _starpu_machine_is_running(void)
+{
+	unsigned ret;
+	/* No lock is taken here: running is just protected by a memory barrier, issued before the flag is read */
+	STARPU_RMB();
+
+	ANNOTATE_HAPPENS_AFTER(&_starpu_config.running);
+	ret = _starpu_config.running;
+	ANNOTATE_HAPPENS_BEFORE(&_starpu_config.running);
+	return ret;
+}
+
 
 /* Check if there is a worker that may execute the task. */
 uint32_t _starpu_worker_exists(struct starpu_task *);
@@ -395,28 +413,53 @@ void _starpu_worker_start(struct _starpu_worker *worker, unsigned fut_key, unsig
 /* The _starpu_worker structure describes all the state of a StarPU worker.
  * This function sets the pthread key which stores a pointer to this structure.
  * */
-void _starpu_set_local_worker_key(struct _starpu_worker *worker);
+static inline void _starpu_set_local_worker_key(struct _starpu_worker *worker)
+{
+	STARPU_ASSERT(_starpu_keys_initialized);
+	STARPU_PTHREAD_SETSPECIFIC(_starpu_worker_key, worker);
+}
 
 /* Returns the _starpu_worker structure that describes the state of the
  * current worker. */
-struct _starpu_worker *_starpu_get_local_worker_key(void);
+static inline struct _starpu_worker *_starpu_get_local_worker_key(void)
+{
+	if (!_starpu_keys_initialized) /* key not usable yet: report no worker instead of reading it */
+		return NULL;
+	return (struct _starpu_worker *) STARPU_PTHREAD_GETSPECIFIC(_starpu_worker_key);
+}
 
 /* The _starpu_worker_set structure describes all the state of a StarPU worker_set.
  * This function sets the pthread key which stores a pointer to this structure.
  * */
-void _starpu_set_local_worker_set_key(struct _starpu_worker_set *worker_set);
+static inline void _starpu_set_local_worker_set_key(struct _starpu_worker_set *worker)
+{
+	STARPU_ASSERT(_starpu_keys_initialized);
+	STARPU_PTHREAD_SETSPECIFIC(_starpu_worker_set_key, worker);
+}
 
 /* Returns the _starpu_worker_set structure that describes the state of the
  * current worker_set. */
-struct _starpu_worker_set *_starpu_get_local_worker_set_key(void);
+static inline struct _starpu_worker_set *_starpu_get_local_worker_set_key(void)
+{
+	if (!_starpu_keys_initialized) /* key not usable yet: report no worker set instead of reading it */
+		return NULL;
+	return (struct _starpu_worker_set *) STARPU_PTHREAD_GETSPECIFIC(_starpu_worker_set_key);
+}
 
 /* Returns the _starpu_worker structure that describes the state of the
  * specified worker. */
-struct _starpu_worker *_starpu_get_worker_struct(unsigned id);
+static inline struct _starpu_worker *_starpu_get_worker_struct(unsigned id)
+{
+	return &_starpu_config.workers[id]; /* id is used as-is, without range checking */
+}
 
 /* Returns the _starpu_sched_ctx structure that describes the state of the
  * specified ctx, or NULL if id is STARPU_NMAX_SCHED_CTXS */
-struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id);
+static inline struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id)
+{
+	if(id == STARPU_NMAX_SCHED_CTXS) return NULL; /* NOTE(review): only this sentinel id is rejected; other out-of-range ids are not checked */
+	return &_starpu_config.sched_ctxs[id];
+}
 
 struct _starpu_combined_worker *_starpu_get_combined_worker_struct(unsigned id);
 
@@ -424,20 +467,35 @@ int _starpu_is_initialized(void);
 
 /* Returns the structure that describes the overall machine configuration (eg.
  * all workers and topology). */
-struct _starpu_machine_config *_starpu_get_machine_config(void);
+static inline struct _starpu_machine_config *_starpu_get_machine_config(void)
+{
+	return &_starpu_config;
+}
 
 /* Return whether kernels should be run (<=0) or not (>0) */
-int _starpu_get_disable_kernels(void);
+static inline int _starpu_get_disable_kernels(void)
+{
+	return _starpu_config.disable_kernels;
+}
 
 /* Retrieve the status which indicates what the worker is currently doing. */
-enum _starpu_worker_status _starpu_worker_get_status(int workerid);
+static inline enum _starpu_worker_status _starpu_worker_get_status(int workerid)
+{
+	return _starpu_config.workers[workerid].status;
+}
 
 /* Change the status of the worker which indicates what the worker is currently
  * doing (eg. executing a callback). */
-void _starpu_worker_set_status(int workerid, enum _starpu_worker_status status);
+static inline void _starpu_worker_set_status(int workerid, enum _starpu_worker_status status)
+{
+	_starpu_config.workers[workerid].status = status;
+}
 
 /* We keep an initial sched ctx which might be used in case no other ctx is available */
-struct _starpu_sched_ctx* _starpu_get_initial_sched_ctx(void);
+static inline struct _starpu_sched_ctx* _starpu_get_initial_sched_ctx(void)
+{
+	return &_starpu_config.sched_ctxs[STARPU_GLOBAL_SCHED_CTX];
+}
 
 int starpu_worker_get_nids_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize);
 
@@ -451,10 +509,47 @@ void _starpu_unlock_mutex_if_prev_locked();
 /* if we prev released the lock relock it */
 void _starpu_relock_mutex_if_prev_locked();
 
-void _starpu_worker_set_flag_sched_mutex_locked(int workerid, unsigned flag);
+static inline void _starpu_worker_set_flag_sched_mutex_locked(int workerid, unsigned flag)
+{
+	struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
+	w->sched_mutex_locked = flag;
+}
+
+static inline unsigned _starpu_worker_mutex_is_sched_mutex(int workerid, starpu_pthread_mutex_t *mutex)
+{
+	struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
+	return &w->sched_mutex == mutex;
+}
+
+static inline int _starpu_worker_get_nsched_ctxs(int workerid)
+{
+	return _starpu_config.workers[workerid].nsched_ctxs;
+}
 
-unsigned _starpu_worker_mutex_is_sched_mutex(int workerid, starpu_pthread_mutex_t *mutex);
+/* Get the total number of sched_ctxs created till now */
+static inline unsigned _starpu_get_nsched_ctxs(void)
+{
+	return _starpu_config.topology.nsched_ctxs;
+}
+
+/* Inlined version of starpu_worker_get_id, used when building the core: id of the calling thread's worker, or -1 if it has none.  */
+static inline int _starpu_worker_get_id(void)
+{
+	struct _starpu_worker * worker;
+
+	worker = _starpu_get_local_worker_key();
+	if (worker)
+	{
+		return worker->workerid;
+	}
+	else
+	{
+		/* no worker is associated with this thread: perhaps it is
+		 * an application thread, or this is some SPU worker */
+		return -1;
+	}
+}
+#define starpu_worker_get_id _starpu_worker_get_id
 
-int _starpu_worker_get_nsched_ctxs(int workerid);
 
 #endif // __WORKERS_H__

+ 4 - 22
src/datawizard/memory_nodes.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2015  Université de Bordeaux
+ * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -26,7 +26,7 @@
 #include "memalloc.h"
 
 static struct _starpu_memory_node_descr descr;
-static starpu_pthread_key_t memory_node_key;
+starpu_pthread_key_t _starpu_memory_node_key STARPU_ATTRIBUTE_INTERNAL;
 
 void _starpu_memory_nodes_init(void)
 {
@@ -34,7 +34,7 @@ void _starpu_memory_nodes_init(void)
 	 * added using _starpu_memory_node_register */
 	descr.nnodes = 0;
 
-	STARPU_PTHREAD_KEY_CREATE(&memory_node_key, NULL);
+	STARPU_PTHREAD_KEY_CREATE(&_starpu_memory_node_key, NULL);
 
 	unsigned i;
 	for (i = 0; i < STARPU_MAXNODES; i++)
@@ -57,25 +57,7 @@ void _starpu_memory_nodes_deinit(void)
 	_starpu_deinit_mem_chunk_lists();
 
 	STARPU_PTHREAD_RWLOCK_DESTROY(&descr.conditions_rwlock);
-	STARPU_PTHREAD_KEY_DELETE(memory_node_key);
-}
-
-void _starpu_memory_node_set_local_key(unsigned *node)
-{
-	STARPU_PTHREAD_SETSPECIFIC(memory_node_key, node);
-}
-
-unsigned _starpu_memory_node_get_local_key(void)
-{
-	unsigned *memory_node;
-	memory_node = (unsigned *) STARPU_PTHREAD_GETSPECIFIC(memory_node_key);
-
-	/* in case this is called by the programmer, we assume the RAM node
-	   is the appropriate memory node ... XXX */
-	if (STARPU_UNLIKELY(!memory_node))
-		return STARPU_MAIN_RAM;
-
-	return *memory_node;
+	STARPU_PTHREAD_KEY_DELETE(_starpu_memory_node_key);
 }
 
 void _starpu_memory_node_add_nworkers(unsigned node)

+ 20 - 3
src/datawizard/memory_nodes.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2009-2012, 2014-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2013  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -69,8 +69,25 @@ struct _starpu_memory_node_descr
 
 void _starpu_memory_nodes_init(void);
 void _starpu_memory_nodes_deinit(void);
-void _starpu_memory_node_set_local_key(unsigned *node);
-unsigned _starpu_memory_node_get_local_key(void);
+extern starpu_pthread_key_t _starpu_memory_node_key STARPU_ATTRIBUTE_INTERNAL;
+static inline void _starpu_memory_node_set_local_key(unsigned *node)
+{
+	STARPU_PTHREAD_SETSPECIFIC(_starpu_memory_node_key, node);
+}
+
+static inline unsigned _starpu_memory_node_get_local_key(void)
+{
+	unsigned *memory_node;
+	memory_node = (unsigned *) STARPU_PTHREAD_GETSPECIFIC(_starpu_memory_node_key);
+
+	/* no node was set for this thread, e.g. when this is called by the
+	   programmer: we assume main RAM is the appropriate memory node ... XXX */
+	if (STARPU_UNLIKELY(!memory_node))
+		return STARPU_MAIN_RAM;
+
+	return *memory_node;
+}
+
 void _starpu_memory_node_add_nworkers(unsigned node);
 unsigned _starpu_memory_node_get_nworkers(unsigned node);
 #ifdef STARPU_SIMGRID

+ 1 - 1
src/worker_collection/worker_list.c

@@ -268,7 +268,7 @@ static void list_init_iterator(struct starpu_worker_collection *workers, struct
 static void list_init_iterator_for_parallel_tasks(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it, struct starpu_task *task)
 {
 	list_init_iterator(workers, it);
-	if (_starpu_get_nsched_ctxs() == 1)
+	if (_starpu_get_nsched_ctxs() <= 1)
 		return;
 
 	it->possibly_parallel = task->possibly_parallel; /* 0/1 => this field indicates if we consider masters only or slaves not blocked too */

+ 1 - 1
src/worker_collection/worker_tree.c

@@ -312,7 +312,7 @@ static void tree_init_iterator(struct starpu_worker_collection *workers, struct
 
 static void tree_init_iterator_for_parallel_tasks(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it, struct starpu_task *task)
 {
-	if (_starpu_get_nsched_ctxs() == 1)
+	if (_starpu_get_nsched_ctxs() <= 1)
 	{
 		tree_init_iterator(workers, it);
 		return;