浏览代码

Do not reimplement spinlocks, use pthread_spinlock_t instead.

Cédric Augonnet 16 年之前
父节点
当前提交
ec8baec6b1

+ 0 - 1
Makefile.am

@@ -26,7 +26,6 @@ include_HEADERS = 				\
 	include/starpu_config.h			\
 	include/starpu-data-filters.h		\
 	include/starpu-data-interfaces.h	\
-	include/starpu-mutex.h			\
 	include/starpu-task.h			\
 	include/starpu-data.h			\
 	include/starpu-perfmodel.h		\

+ 0 - 33
include/starpu-mutex.h

@@ -1,33 +0,0 @@
-/*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __STARPU_MUTEX_H__
-#define __STARPU_MUTEX_H__
-
-#include <starpu_config.h>
-#include <stdint.h>
-
-typedef struct starpu_mutex_t {
-	/* we only have a trivial implementation yet ! */
-	volatile uint32_t taken __attribute__ ((aligned(16)));
-} starpu_mutex;
-
-void init_mutex(starpu_mutex *m);
-void take_mutex(starpu_mutex *m);
-int take_mutex_try(starpu_mutex *m);
-void release_mutex(starpu_mutex *m);
-
-#endif // __STARPU_MUTEX_H__

+ 2 - 2
include/starpu-perfmodel.h

@@ -18,8 +18,8 @@
 #define __STARPU_PERFMODEL_H__
 
 #include <stdio.h>
+#include <pthread.h>
 #include <starpu_config.h>
-#include <starpu-mutex.h>
 
 struct starpu_htbl32_node_s;
 struct starpu_history_list_t;
@@ -89,7 +89,7 @@ struct starpu_perfmodel_t {
 	unsigned is_loaded;
 	unsigned benchmarking;
 
-	starpu_mutex model_mutex;
+	pthread_spinlock_t model_mutex;
 };
 
 #endif // __STARPU_PERFMODEL_H__

+ 0 - 1
src/Makefile.am

@@ -84,7 +84,6 @@ libstarpu_la_SOURCES = 						\
 	common/malloc.c						\
 	common/hash.c 						\
 	common/htable32.c					\
-	common/mutex.c						\
 	common/rwlock.c						\
 	common/timing.c						\
 	core/jobs.c						\

+ 0 - 43
src/common/mutex.c

@@ -1,43 +0,0 @@
-/*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <starpu-mutex.h>
-
-void init_mutex(starpu_mutex *m)
-{
-	/* this is free at first */
-	m->taken = 0;
-}
-
-inline int take_mutex_try(starpu_mutex *m)
-{
-	uint32_t prev;
-	prev = __sync_lock_test_and_set(&m->taken, 1);
-	return (prev == 0)?0:-1;
-}
-
-inline void take_mutex(starpu_mutex *m)
-{
-	uint32_t prev;
-	do {
-		prev = __sync_lock_test_and_set(&m->taken, 1);
-	} while (prev);
-}
-
-inline void release_mutex(starpu_mutex *m)
-{
-	m->taken = 0;
-}

+ 8 - 8
src/core/dependencies/data-concurrency.c

@@ -63,7 +63,7 @@ unsigned attempt_to_submit_data_request_from_apps(data_state *data, starpu_acces
 {
 	unsigned ret;
 
-	take_mutex(&data->header_lock);
+	pthread_spin_lock(&data->header_lock);
 
 	if (data->refcnt == 0)
 	{
@@ -103,7 +103,7 @@ unsigned attempt_to_submit_data_request_from_apps(data_state *data, starpu_acces
 		}
 	}
 
-	release_mutex(&data->header_lock);
+	pthread_spin_unlock(&data->header_lock);
 	return ret;
 }
 
@@ -114,7 +114,7 @@ static unsigned attempt_to_submit_data_request_from_job(job_t j, unsigned buffer
 	data_state *data = j->task->buffers[buffer_index].handle;
 	starpu_access_mode mode = j->task->buffers[buffer_index].mode;
 
-	take_mutex(&data->header_lock);
+	pthread_spin_lock(&data->header_lock);
 
 	if (data->refcnt == 0)
 	{
@@ -154,7 +154,7 @@ static unsigned attempt_to_submit_data_request_from_job(job_t j, unsigned buffer
 		}
 	}
 
-	release_mutex(&data->header_lock);
+	pthread_spin_unlock(&data->header_lock);
 	return ret;
 }
 
@@ -189,7 +189,7 @@ unsigned submit_job_enforce_data_deps(job_t j)
 
 void notify_data_dependencies(data_state *data)
 {
-	take_mutex(&data->header_lock);
+	pthread_spin_lock(&data->header_lock);
 
 	data->refcnt--;
 
@@ -200,7 +200,7 @@ void notify_data_dependencies(data_state *data)
 
 		data->refcnt++;
 	
-		release_mutex(&data->header_lock);
+		pthread_spin_unlock(&data->header_lock);
 
 		if (r->is_requested_by_codelet)
 		{
@@ -217,10 +217,10 @@ void notify_data_dependencies(data_state *data)
 
 		data_requester_delete(r);
 		
-		take_mutex(&data->header_lock);
+		pthread_spin_lock(&data->header_lock);
 	}
 	
-	release_mutex(&data->header_lock);
+	pthread_spin_unlock(&data->header_lock);
 
 }
 

+ 33 - 30
src/core/dependencies/tags.c

@@ -24,9 +24,12 @@
 #include <starpu.h>
 
 static htbl_node_t *tag_htbl = NULL;
-static starpu_mutex tag_mutex = {
-	.taken = 0
-};
+pthread_spinlock_t tag_mutex;
+
+void initialize_tag_mutex(void)
+{
+	pthread_spin_init(&tag_mutex, 0);
+}
 
 static cg_t *create_cg(unsigned ntags, struct tag_s *tag, unsigned is_apps_cg)
 {
@@ -76,7 +79,7 @@ static struct tag_s *tag_init(starpu_tag_t id)
 	tag->succ = realloc(NULL, tag->succ_list_size*sizeof(struct _cg_t *));
 #endif
 
-	init_mutex(&tag->lock);
+	pthread_spin_init(&tag->lock, 0);
 
 	return tag;
 }
@@ -85,27 +88,27 @@ void starpu_tag_remove(starpu_tag_t id)
 {
 	struct tag_s *tag;
 
-	take_mutex(&tag_mutex);
+	pthread_spin_lock(&tag_mutex);
 
 	tag = htbl_remove_tag(tag_htbl, id);
 
-	release_mutex(&tag_mutex);
+	pthread_spin_unlock(&tag_mutex);
 
-	take_mutex(&tag->lock);
+	pthread_spin_lock(&tag->lock);
 	
 #ifdef DYNAMIC_DEPS_SIZE
 	if (tag)
 		free(tag->succ);
 #endif
 
-	release_mutex(&tag->lock);
+	pthread_spin_unlock(&tag->lock);
 
 	free(tag);
 }
 
 static struct tag_s *gettag_struct(starpu_tag_t id)
 {
-	take_mutex(&tag_mutex);
+	pthread_spin_lock(&tag_mutex);
 
 	/* search if the tag is already declared or not */
 	struct tag_s *tag;
@@ -121,7 +124,7 @@ static struct tag_s *gettag_struct(starpu_tag_t id)
 		STARPU_ASSERT(old == NULL);
 	}
 
-	release_mutex(&tag_mutex);
+	pthread_spin_unlock(&tag_mutex);
 
 	return tag;
 }
@@ -129,14 +132,14 @@ static struct tag_s *gettag_struct(starpu_tag_t id)
 /* lock should be taken */
 static void tag_set_ready(struct tag_s *tag)
 {
-//	take_mutex(&tag->lock);
+//	pthread_spin_lock(&tag->lock);
 
 	/* mark this tag as ready to run */
 	tag->state = READY;
 	/* declare it to the scheduler ! */
 	struct job_s *j = tag->job;
 
-//	release_mutex(&tag->lock);
+//	pthread_spin_unlock(&tag->lock);
 
 #ifdef NO_DATA_RW_LOCK
 	/* enforce data dependencies */
@@ -165,14 +168,14 @@ static void notify_cg(cg_t *cg)
 		}
 		else
 		{
-//			take_mutex(&cg->tag->lock);
+//			pthread_spin_lock(&cg->tag->lock);
 			struct tag_s *tag = cg->tag;
 			tag->ndeps_completed++;
 
 			if ((tag->state == BLOCKED) 
 				&& (tag->ndeps == tag->ndeps_completed))
 				tag_set_ready(cg->tag);
-//			release_mutex(&cg->tag->lock);
+//			pthread_spin_unlock(&cg->tag->lock);
 
 			free(cg);
 		}
@@ -208,7 +211,7 @@ static void tag_add_succ(struct tag_s *tag, cg_t *cg)
 		tag->succ[index] = cg;
 	}
 
-	release_mutex(&tag->lock);
+	pthread_spin_unlock(&tag->lock);
 }
 
 void notify_dependencies(struct job_s *j)
@@ -223,7 +226,7 @@ void notify_dependencies(struct job_s *j)
 		/* in case there are dependencies, wake up the proper tasks */
 		tag = j->tag;
 
-		take_mutex(&tag->lock);
+		pthread_spin_lock(&tag->lock);
 
 		tag->state = DONE;
 		TRACE_TASK_DONE(tag->id);
@@ -237,15 +240,15 @@ void notify_dependencies(struct job_s *j)
 			struct tag_s *cgtag = cg->tag;
 
 			if (!used_by_apps)
-				take_mutex(&cgtag->lock);
+				pthread_spin_lock(&cgtag->lock);
 
 			notify_cg(cg);
 
 			if (!used_by_apps)
-				release_mutex(&cgtag->lock);
+				pthread_spin_unlock(&cgtag->lock);
 		}
 
-		release_mutex(&tag->lock);
+		pthread_spin_unlock(&tag->lock);
 	}
 }
 
@@ -271,7 +274,7 @@ void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t
 	/* create the associated completion group */
 	struct tag_s *tag_child = gettag_struct(id);
 
-	take_mutex(&tag_child->lock);
+	pthread_spin_lock(&tag_child->lock);
 
 	cg_t *cg = create_cg(ndeps, tag_child, 0);
 
@@ -285,12 +288,12 @@ void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t
 		 * so cg should be among dep_id's successors*/
 		TRACE_CODELET_TAG_DEPS(id, dep_id);
 		struct tag_s *tag_dep = gettag_struct(dep_id);
-		take_mutex(&tag_dep->lock);
+		pthread_spin_lock(&tag_dep->lock);
 		tag_add_succ(tag_dep, cg);
-		release_mutex(&tag_dep->lock);
+		pthread_spin_unlock(&tag_dep->lock);
 	}
 
-	release_mutex(&tag_child->lock);
+	pthread_spin_unlock(&tag_child->lock);
 }
 
 void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
@@ -300,7 +303,7 @@ void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
 	/* create the associated completion group */
 	struct tag_s *tag_child = gettag_struct(id);
 
-	take_mutex(&tag_child->lock);
+	pthread_spin_lock(&tag_child->lock);
 
 	cg_t *cg = create_cg(ndeps, tag_child, 0);
 
@@ -318,13 +321,13 @@ void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
 		 * so cg should be among dep_id's successors*/
 		TRACE_CODELET_TAG_DEPS(id, dep_id);
 		struct tag_s *tag_dep = gettag_struct(dep_id);
-		take_mutex(&tag_dep->lock);
+		pthread_spin_lock(&tag_dep->lock);
 		tag_add_succ(tag_dep, cg);
-		release_mutex(&tag_dep->lock);
+		pthread_spin_unlock(&tag_dep->lock);
 	}
 	va_end(pa);
 
-	release_mutex(&tag_child->lock);
+	pthread_spin_unlock(&tag_child->lock);
 }
 
 /* this function may be called by the application (outside callbacks !) */
@@ -340,12 +343,12 @@ void starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 	{
 		struct tag_s *tag = gettag_struct(id[i]);
 		
-		take_mutex(&tag->lock);
+		pthread_spin_lock(&tag->lock);
 
 		if (tag->state == DONE)
 		{
 			/* that tag is done already */
-			release_mutex(&tag->lock);
+			pthread_spin_unlock(&tag->lock);
 		}
 		else
 		{
@@ -366,7 +369,7 @@ void starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 	for (i = 0; i < current; i++)
 	{
 		tag_add_succ(tag_array[i], cg);
-		release_mutex(&tag_array[i]->lock);
+		pthread_spin_unlock(&tag_array[i]->lock);
 	}
 
 	pthread_mutex_lock(&cg->cg_mutex);

+ 4 - 2
src/core/dependencies/tags.h

@@ -18,7 +18,7 @@
 #define __TAGS_H__
 
 #include <stdint.h>
-#include <starpu-mutex.h>
+#include <pthread.h>
 #include <core/jobs.h>
 
 /* we do not necessarily want to allocate room for 256 dependencies, but we
@@ -53,7 +53,7 @@ typedef enum {
 struct job_s;
 
 struct tag_s {
-	starpu_mutex lock;
+	pthread_spinlock_t lock;
 	starpu_tag_t id; /* an identifier for the task */
 	tag_state state;
 	unsigned nsuccs; /* how many successors ? */
@@ -85,6 +85,8 @@ typedef struct _cg_t {
 	pthread_cond_t cg_cond;
 } cg_t;
 
+void initialize_tag_mutex(void);
+
 void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);
 
 void notify_dependencies(struct job_s *j);

+ 2 - 2
src/core/jobs.c

@@ -165,7 +165,7 @@ static unsigned not_all_task_deps_are_fulfilled(job_t j)
 
 	struct tag_s *tag = j->tag;
 
-	take_mutex(&tag->lock);
+	pthread_spin_lock(&tag->lock);
 
 	if (tag->ndeps != tag->ndeps_completed)
 	{
@@ -178,7 +178,7 @@ static unsigned not_all_task_deps_are_fulfilled(job_t j)
 		ret = 0;
 	}
 
-	release_mutex(&tag->lock);
+	pthread_spin_unlock(&tag->lock);
 	return ret;
 }
 

+ 1 - 1
src/core/perfmodel/perfmodel.h

@@ -22,7 +22,7 @@
 //#include <core/jobs.h>
 #include <common/htable32.h>
 //#include <core/workers.h>
-#include <starpu-mutex.h>
+#include <pthread.h>
 #include <stdio.h>
 
 struct starpu_buffer_descr_t;

+ 9 - 9
src/core/perfmodel/perfmodel_history.c

@@ -22,7 +22,7 @@
 #include <core/perfmodel/perfmodel.h>
 #include <core/jobs.h>
 #include <core/workers.h>
-#include <starpu-mutex.h>
+#include <pthread.h>
 #include <datawizard/datawizard.h>
 #include <core/perfmodel/regression.h>
 #include <common/config.h>
@@ -307,7 +307,7 @@ void load_history_based_model(struct starpu_perfmodel_t *model, unsigned scan_hi
 
 	/* XXX we assume the lock is implicitely initialized (taken = 0) */
 	//init_mutex(&model->model_mutex);
-	take_mutex(&model->model_mutex);
+	pthread_spin_lock(&model->model_mutex);
 
 	/* perhaps some other thread got in before ... */
 	if (!model->is_loaded)
@@ -364,7 +364,7 @@ void load_history_based_model(struct starpu_perfmodel_t *model, unsigned scan_hi
 		model->is_loaded = 1;
 	}
 
-	release_mutex(&model->model_mutex);
+	pthread_spin_unlock(&model->model_mutex);
 }
 
 double regression_based_job_expected_length(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct job_s *j)
@@ -405,9 +405,9 @@ double history_based_job_expected_length(struct starpu_perfmodel_t *model, enum
 	if (!history)
 		return -1.0;
 
-	take_mutex(&model->model_mutex);
+	pthread_spin_lock(&model->model_mutex);
 	entry = htbl_search_32(history, key);
-	release_mutex(&model->model_mutex);
+	pthread_spin_unlock(&model->model_mutex);
 
 	exp = entry?entry->mean:-1.0;
 
@@ -439,7 +439,7 @@ void update_perfmodel_history(job_t j, enum starpu_perf_archtype arch, double me
 			reg_model = &per_arch_model->regression;
 			list = &per_arch_model->list;
 
-			take_mutex(&model->model_mutex);
+			pthread_spin_lock(&model->model_mutex);
 	
 				entry = htbl_search_32(history, key);
 	
@@ -494,13 +494,13 @@ void update_perfmodel_history(job_t j, enum starpu_perf_archtype arch, double me
 			reg_model->beta = num/denom;
 			reg_model->alpha = expl((reg_model->sumlny - reg_model->beta*reg_model->sumlnx)/n);
 			
-			release_mutex(&model->model_mutex);
+			pthread_spin_unlock(&model->model_mutex);
 		}
 
 #ifdef MODEL_DEBUG
 		FILE * debug_file = per_arch_model->debug_file;
 
-		take_mutex(&model->model_mutex);
+		pthread_spin_lock(&model->model_mutex);
 
 		STARPU_ASSERT(j->footprint_is_computed);
 
@@ -519,7 +519,7 @@ void update_perfmodel_history(job_t j, enum starpu_perf_archtype arch, double me
 		fprintf(debug_file, "\n");	
 
 
-		release_mutex(&model->model_mutex);
+		pthread_spin_unlock(&model->model_mutex);
 #endif
 	}
 }

+ 1 - 1
src/core/policies/sched_policy.c

@@ -121,7 +121,7 @@ void init_sched_policy(struct machine_config_s *config, struct starpu_conf *user
 	pthread_cond_init(&policy.sched_activity_cond, NULL);
 	pthread_mutex_init(&policy.sched_activity_mutex, NULL);
 	pthread_key_create(&policy.local_queue_key, NULL);
-	init_mutex(&descr.attached_queues_mutex);
+	pthread_spin_init(&descr.attached_queues_mutex, 0);
 	descr.total_queues_count = 0;
 
 	policy.init_sched(config, &policy);

+ 6 - 4
src/core/workers.c

@@ -369,6 +369,8 @@ void starpu_init(struct starpu_conf *user_conf)
 
 	init_workers_binding(&config);
 
+	initialize_tag_mutex();
+
 	/* initialize the scheduler */
 
 	/* initialize the queue containing the jobs */
@@ -445,7 +447,7 @@ static void operate_on_all_queues_attached_to_node(unsigned nodeid, queue_op op)
 	unsigned q_id;
 	struct jobq_s *q;
 
-	take_mutex(&descr.attached_queues_mutex);
+	pthread_spin_lock(&descr.attached_queues_mutex);
 
 	unsigned nqueues = descr.queues_count[nodeid];
 
@@ -465,7 +467,7 @@ static void operate_on_all_queues_attached_to_node(unsigned nodeid, queue_op op)
 		}
 	}
 
-	release_mutex(&descr.attached_queues_mutex);
+	pthread_spin_unlock(&descr.attached_queues_mutex);
 }
 
 inline void lock_all_queues_attached_to_node(unsigned node)
@@ -488,7 +490,7 @@ static void operate_on_all_queues(queue_op op)
 	unsigned q_id;
 	struct jobq_s *q;
 
-	take_mutex(&descr.attached_queues_mutex);
+	pthread_spin_lock(&descr.attached_queues_mutex);
 
 	unsigned nqueues = descr.total_queues_count;
 
@@ -508,7 +510,7 @@ static void operate_on_all_queues(queue_op op)
 		}
 	}
 
-	release_mutex(&descr.attached_queues_mutex);
+	pthread_spin_unlock(&descr.attached_queues_mutex);
 }
 
 static void kill_all_workers(struct machine_config_s *config)

+ 19 - 19
src/datawizard/coherency.c

@@ -96,7 +96,7 @@ static void update_data_state(data_state *state, uint32_t requesting_node,
 int _fetch_data(data_state *state, uint32_t requesting_node,
 			uint8_t read, uint8_t write)
 {
-	while (take_mutex_try(&state->header_lock)) {
+	while (pthread_spin_trylock(&state->header_lock)) {
 		datawizard_progress(requesting_node);
 	}
 
@@ -107,7 +107,7 @@ int _fetch_data(data_state *state, uint32_t requesting_node,
 	if ((local_state == OWNER) || (local_state == SHARED && !write))
 	{
 		/* the local node already got its data */
-		release_mutex(&state->header_lock);
+		pthread_spin_unlock(&state->header_lock);
 		msi_cache_hit(requesting_node);
 		return 0;
 	}
@@ -126,7 +126,7 @@ int _fetch_data(data_state *state, uint32_t requesting_node,
 
 		}
 		
-		release_mutex(&state->header_lock);
+		pthread_spin_unlock(&state->header_lock);
 		msi_cache_hit(requesting_node);
 		return 0;
 	}
@@ -150,13 +150,13 @@ int _fetch_data(data_state *state, uint32_t requesting_node,
 
 	update_data_state(state, requesting_node, write);
 
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 
 	return 0;
 
 enomem:
 	/* there was not enough local memory to fetch the data */
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 	return -ENOMEM;
 }
 
@@ -181,11 +181,11 @@ static int fetch_data(data_state *state, starpu_access_mode mode)
 	}
 #endif
 
-	while (take_mutex_try(&state->header_lock))
+	while (pthread_spin_trylock(&state->header_lock))
 		datawizard_progress(requesting_node);
 
 	state->per_node[requesting_node].refcnt++;
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 
 	ret = _fetch_data(state, requesting_node, read, write);
 	if (ret != 0)
@@ -194,11 +194,11 @@ static int fetch_data(data_state *state, starpu_access_mode mode)
 	return 0;
 enomem:
 	/* we did not get the data so remove the lock anyway */
-	while (take_mutex_try(&state->header_lock))
+	while (pthread_spin_trylock(&state->header_lock))
 		datawizard_progress(requesting_node);
 
 	state->per_node[requesting_node].refcnt--;
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 
 #ifndef NO_DATA_RW_LOCK
 	release_rw_lock(&state->data_lock);
@@ -229,11 +229,11 @@ static void release_data(data_state *state, uint32_t default_wb_mask)
 		write_through_data(state, requesting_node, wb_mask);
 	}
 
-	while (take_mutex_try(&state->header_lock))
+	while (pthread_spin_trylock(&state->header_lock))
 		datawizard_progress(requesting_node);
 
 	state->per_node[requesting_node].refcnt--;
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 
 #ifndef NO_DATA_RW_LOCK
 	/* this is intended to make data accessible again */
@@ -294,7 +294,7 @@ void push_codelet_output(starpu_buffer_descr *descrs, unsigned nbuffers, uint32_
 
 int request_data_allocation(data_state *state, uint32_t node)
 {
-	take_mutex(&state->header_lock);
+	pthread_spin_lock(&state->header_lock);
 
 	int ret;
 	ret = allocate_per_node_buffer(state, node);
@@ -303,7 +303,7 @@ int request_data_allocation(data_state *state, uint32_t node)
 	/* XXX quick and dirty hack */
 	state->per_node[node].automatically_allocated = 0;	
 
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 
 	return 0;
 }
@@ -382,7 +382,7 @@ void starpu_sync_data_with_mem(data_state *state)
 
 static inline void do_notify_data_modification(data_state *state, uint32_t modifying_node)
 {
-	take_mutex(&state->header_lock);
+	pthread_spin_lock(&state->header_lock);
 
 	unsigned node = 0;
 	for (node = 0; node < MAXNODES; node++)
@@ -391,7 +391,7 @@ static inline void do_notify_data_modification(data_state *state, uint32_t modif
 			(node == modifying_node?OWNER:INVALID);
 	}
 
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 }
 
 #ifdef NO_DATA_RW_LOCK
@@ -453,13 +453,13 @@ unsigned is_data_present_or_requested(data_state *state, uint32_t node)
 	unsigned ret = 0;
 
 // XXX : this is just a hint, so we don't take the lock ...
-//	take_mutex(&state->header_lock);
+//	pthread_spin_lock(&state->header_lock);
 
 	if (state->per_node[node].state != INVALID 
 		|| state->per_node[node].requested)
 		ret = 1;
 
-//	release_mutex(&state->header_lock);
+//	pthread_spin_unlock(&state->header_lock);
 
 	return ret;
 }
@@ -467,10 +467,10 @@ unsigned is_data_present_or_requested(data_state *state, uint32_t node)
 inline void set_data_requested_flag_if_needed(data_state *state, uint32_t node)
 {
 // XXX : this is just a hint, so we don't take the lock ...
-//	take_mutex(&state->header_lock);
+//	pthread_spin_lock(&state->header_lock);
 
 	if (state->per_node[node].state == INVALID) 
 		state->per_node[node].requested = 1;
 
-//	release_mutex(&state->header_lock);
+//	pthread_spin_unlock(&state->header_lock);
 }

+ 2 - 2
src/datawizard/coherency.h

@@ -26,7 +26,7 @@
 
 #include <starpu.h>
 
-#include <starpu-mutex.h>
+#include <pthread.h>
 #include <common/rwlock.h>
 #include <common/timing.h>
 #include <common/fxt.h>
@@ -106,7 +106,7 @@ typedef struct starpu_data_state_t {
 	rw_lock	data_lock;
 #endif
 	/* protect meta data */
-	starpu_mutex header_lock;
+	pthread_spinlock_t header_lock;
 
 	uint32_t nnodes; /* the number of memory nodes that may use it */
 	struct starpu_data_state_t *children;

+ 2 - 2
src/datawizard/copy-driver.c

@@ -30,7 +30,7 @@ void wake_all_blocked_workers_on_node(unsigned nodeid)
 	/* wake up all queues on that node */
 	unsigned q_id;
 
-	take_mutex(&descr.attached_queues_mutex);
+	pthread_spin_lock(&descr.attached_queues_mutex);
 
 	unsigned nqueues = descr.queues_count[nodeid];
 	for (q_id = 0; q_id < nqueues; q_id++)
@@ -44,7 +44,7 @@ void wake_all_blocked_workers_on_node(unsigned nodeid)
 		pthread_mutex_unlock(&q->activity_mutex);
 	}
 
-	release_mutex(&descr.attached_queues_mutex);
+	pthread_spin_unlock(&descr.attached_queues_mutex);
 }
 
 void wake_all_blocked_workers(void)

+ 2 - 2
src/datawizard/data_request.c

@@ -70,7 +70,7 @@ int post_data_request(data_state *state, uint32_t src_node, uint32_t dst_node)
 	 * TODO: handle the situation of a possible invalidation caused by
 	 * memory eviction mechanism. This could be done by the means of a
 	 * specific state (or flag) in the MSI protocol. */
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 #endif
 
 //	/* wait for the request to be performed */
@@ -96,7 +96,7 @@ int post_data_request(data_state *state, uint32_t src_node, uint32_t dst_node)
 	pthread_mutex_unlock(&data_requests_list_mutex[src_node]);
 
 #ifdef NO_DATA_RW_LOCK
-	take_mutex(&state->header_lock);
+	pthread_spin_lock(&state->header_lock);
 #endif
 
 	retvalue = r->retval;

+ 10 - 10
src/datawizard/hierarchy.c

@@ -51,10 +51,10 @@ void register_new_data(data_state *state, uint32_t home_node, uint32_t wb_mask)
 	state->req_list = data_requester_list_new();
 	state->refcnt = 0;
 #endif
-	init_mutex(&state->header_lock);
+	pthread_spin_init(&state->header_lock, 0);
 
 	/* first take care to properly lock the data */
-	take_mutex(&state->header_lock);
+	pthread_spin_lock(&state->header_lock);
 
 	/* we assume that all nodes may use that data */
 	state->nnodes = MAXNODES;
@@ -90,7 +90,7 @@ void register_new_data(data_state *state, uint32_t home_node, uint32_t wb_mask)
 	}
 
 	/* now the data is available ! */
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 }
 
 /*
@@ -167,7 +167,7 @@ void starpu_partition_data(data_state *initial_data, starpu_filter *f)
 	int i;
 
 	/* first take care to properly lock the data header */
-	take_mutex(&initial_data->header_lock);
+	pthread_spin_lock(&initial_data->header_lock);
 
 	/* there should not be mutiple filters applied on the same data */
 	STARPU_ASSERT(initial_data->nchildren == 0);
@@ -202,7 +202,7 @@ void starpu_partition_data(data_state *initial_data, starpu_filter *f)
 		children->req_list = data_requester_list_new();
 		children->refcnt = 0;
 #endif
-		init_mutex(&children->header_lock);
+		pthread_spin_init(&children->header_lock, 0);
 
 		unsigned node;
 		for (node = 0; node < MAXNODES; node++)
@@ -217,7 +217,7 @@ void starpu_partition_data(data_state *initial_data, starpu_filter *f)
 	}
 
 	/* now let the header */
-	release_mutex(&initial_data->header_lock);
+	pthread_spin_unlock(&initial_data->header_lock);
 }
 
 void starpu_unpartition_data(data_state *root_data, uint32_t gathering_node)
@@ -225,7 +225,7 @@ void starpu_unpartition_data(data_state *root_data, uint32_t gathering_node)
 	int child;
 	unsigned node;
 
-	take_mutex(&root_data->header_lock);
+	pthread_spin_lock(&root_data->header_lock);
 
 #ifdef NO_DATA_RW_LOCK
 #warning starpu_unpartition_data is not supported with NO_DATA_RW_LOCK yet ...
@@ -302,13 +302,13 @@ void starpu_unpartition_data(data_state *root_data, uint32_t gathering_node)
 	root_data->nchildren = 0;
 
 	/* now the parent may be used again so we release the lock */
-	release_mutex(&root_data->header_lock);
+	pthread_spin_unlock(&root_data->header_lock);
 }
 
 void starpu_advise_if_data_is_important(data_state *state, unsigned is_important)
 {
 
-	take_mutex(&state->header_lock);
+	pthread_spin_lock(&state->header_lock);
 
 	/* first take all the children lock (in order !) */
 	int child;
@@ -322,6 +322,6 @@ void starpu_advise_if_data_is_important(data_state *state, unsigned is_important
 	state->is_not_important = !is_important;
 
 	/* now the parent may be used again so we release the lock */
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 
 }

+ 15 - 15
src/datawizard/memalloc.c

@@ -18,7 +18,7 @@
 #include <datawizard/footprint.h>
 
 extern mem_node_descr descr;
-static starpu_mutex mc_mutex[MAXNODES]; 
+pthread_spinlock_t mc_mutex[MAXNODES]; 
 static mem_chunk_list_t mc_list[MAXNODES];
 static mem_chunk_list_t mc_list_to_free[MAXNODES];
 
@@ -29,7 +29,7 @@ void init_mem_chunk_lists(void)
 	unsigned i;
 	for (i = 0; i < MAXNODES; i++)
 	{
-		init_mutex(&mc_mutex[i]);
+		pthread_spin_init(&mc_mutex[i], 0);
 		mc_list[i] = mem_chunk_list_new();
 		mc_list_to_free[i] = mem_chunk_list_new();
 	}
@@ -50,7 +50,7 @@ static void lock_all_subtree(data_state *data)
 	if (data->nchildren == 0)
 	{
 		/* this is a leaf */	
-		while (take_mutex_try(&data->header_lock))
+		while (pthread_spin_trylock(&data->header_lock))
 			datawizard_progress(get_local_memory_node());
 	}
 	else {
@@ -68,7 +68,7 @@ static void unlock_all_subtree(data_state *data)
 	if (data->nchildren == 0)
 	{
 		/* this is a leaf */	
-		release_mutex(&data->header_lock);
+		pthread_spin_unlock(&data->header_lock);
 	}
 	else {
 		/* lock all sub-subtrees children */
@@ -291,7 +291,7 @@ static unsigned try_to_reuse_mem_chunk(mem_chunk_t mc, unsigned node, data_state
  * list of mem chunk that need to be liberated */
 static unsigned try_to_find_reusable_mem_chunk(unsigned node, data_state *data, uint32_t footprint)
 {
-	take_mutex(&mc_mutex[node]);
+	pthread_spin_lock(&mc_mutex[node]);
 
 	/* go through all buffers for which there was a removal request */
 	mem_chunk_t mc, next_mc;
@@ -312,7 +312,7 @@ static unsigned try_to_find_reusable_mem_chunk(unsigned node, data_state *data,
 			{
 				reuse_mem_chunk(node, data, mc, 0);
 
-				release_mutex(&mc_mutex[node]);
+				pthread_spin_unlock(&mc_mutex[node]);
 				return 1;
 			}
 		}
@@ -334,13 +334,13 @@ static unsigned try_to_find_reusable_mem_chunk(unsigned node, data_state *data,
 //			fprintf(stderr, "found a candidate ...\n");
 			if (try_to_reuse_mem_chunk(mc, node, data, 1))
 			{
-				release_mutex(&mc_mutex[node]);
+				pthread_spin_unlock(&mc_mutex[node]);
 				return 1;
 			}
 		}
 	}
 
-	release_mutex(&mc_mutex[node]);
+	pthread_spin_unlock(&mc_mutex[node]);
 
 	return 0;
 }
@@ -356,7 +356,7 @@ static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unu
 
 	size_t liberated = 0;
 
-	take_mutex(&mc_mutex[node]);
+	pthread_spin_lock(&mc_mutex[node]);
 
 	/* remove all buffers for which there was a removal request */
 	mem_chunk_t mc, next_mc;
@@ -392,7 +392,7 @@ static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unu
 
 //	fprintf(stderr, "got %d MB back\n", (int)liberated/(1024*1024));
 
-	release_mutex(&mc_mutex[node]);
+	pthread_spin_unlock(&mc_mutex[node]);
 
 	return liberated;
 }
@@ -414,14 +414,14 @@ static void register_mem_chunk(data_state *state, uint32_t dst_node, size_t size
 	/* the interface was already filled by ops->allocate_data_on_node */
 	memcpy(&mc->interface, &state->interface[dst_node], sizeof(starpu_data_interface_t));
 
-	take_mutex(&mc_mutex[dst_node]);
+	pthread_spin_lock(&mc_mutex[dst_node]);
 	mem_chunk_list_push_front(mc_list[dst_node], mc);
-	release_mutex(&mc_mutex[dst_node]);
+	pthread_spin_unlock(&mc_mutex[dst_node]);
 }
 
 void request_mem_chunk_removal(data_state *state, unsigned node)
 {
-	take_mutex(&mc_mutex[node]);
+	pthread_spin_lock(&mc_mutex[node]);
 
 	/* iterate over the list of memory chunks and remove the entry */
 	mem_chunk_t mc, next_mc;
@@ -441,7 +441,7 @@ void request_mem_chunk_removal(data_state *state, unsigned node)
 			/* put it in the list of buffers to be removed */
 			mem_chunk_list_push_front(mc_list_to_free[node], mc);
 
-			release_mutex(&mc_mutex[node]);
+			pthread_spin_unlock(&mc_mutex[node]);
 
 			return;
 		}
@@ -449,7 +449,7 @@ void request_mem_chunk_removal(data_state *state, unsigned node)
 
 	/* there was no corresponding buffer ... */
 
-	release_mutex(&mc_mutex[node]);
+	pthread_spin_unlock(&mc_mutex[node]);
 }
 
 static size_t liberate_memory_on_node(mem_chunk_t mc, uint32_t node)

+ 4 - 4
src/datawizard/memory_nodes.c

@@ -97,7 +97,7 @@ void memory_node_attach_queue(struct jobq_s *q, unsigned nodeid)
 	unsigned queue;
 	unsigned nqueues_total, nqueues;
 	
-	take_mutex(&descr.attached_queues_mutex);
+	pthread_spin_lock(&descr.attached_queues_mutex);
 
 	/* we only insert the queue if it's not already in the list */
 	nqueues = descr.queues_count[nodeid];
@@ -106,7 +106,7 @@ void memory_node_attach_queue(struct jobq_s *q, unsigned nodeid)
 		if (descr.attached_queues_per_node[nodeid][queue] == q)
 		{
 			/* the queue is already in the list */
-			release_mutex(&descr.attached_queues_mutex);
+			pthread_spin_unlock(&descr.attached_queues_mutex);
 			return;
 		}
 	}
@@ -122,7 +122,7 @@ void memory_node_attach_queue(struct jobq_s *q, unsigned nodeid)
 		if (descr.attached_queues_all[queue] == q)
 		{
 			/* the queue is already in the global list */
-			release_mutex(&descr.attached_queues_mutex);
+			pthread_spin_unlock(&descr.attached_queues_mutex);
 			return;
 		}
 	} 
@@ -131,7 +131,7 @@ void memory_node_attach_queue(struct jobq_s *q, unsigned nodeid)
 	descr.attached_queues_all[nqueues_total] = q;
 	descr.total_queues_count++;
 
-	release_mutex(&descr.attached_queues_mutex);
+	pthread_spin_unlock(&descr.attached_queues_mutex);
 }
 
 

+ 1 - 1
src/datawizard/memory_nodes.h

@@ -38,7 +38,7 @@ typedef struct {
 	/* the list of queues that are attached to a given node */
 	// XXX 32 is set randomly !
 	// TODO move this 2 lists outside mem_node_descr
-	struct starpu_mutex_t attached_queues_mutex;
+	pthread_spinlock_t attached_queues_mutex;
 	struct jobq_s *attached_queues_per_node[MAXNODES][32];
 	struct jobq_s *attached_queues_all[MAXNODES*32];
 	/* the number of queues attached to each node */

+ 3 - 3
src/datawizard/progress.c

@@ -23,7 +23,7 @@ extern pthread_key_t local_workers_key;
 
 #ifdef USE_GORDON
 extern void handle_terminated_job_per_worker(struct worker_s *worker);
-extern struct starpu_mutex_t terminated_list_mutexes[32]; 
+extern pthread_spinlock_t terminated_list_mutexes[32]; 
 #endif
 
 void datawizard_progress(uint32_t memory_node)
@@ -40,9 +40,9 @@ void datawizard_progress(uint32_t memory_node)
 		unsigned worker;
 		for (worker = 0; worker < set->nworkers; worker++)
 		{
-			take_mutex(&terminated_list_mutexes[0]);
+			pthread_spin_lock(&terminated_list_mutexes[0]);
 			handle_terminated_job_per_worker(&set->workers[worker]);
-			release_mutex(&terminated_list_mutexes[0]);
+			pthread_spin_unlock(&terminated_list_mutexes[0]);
 		}
 	}
 #endif

+ 2 - 2
src/datawizard/write_back.c

@@ -25,7 +25,7 @@ void write_through_data(data_state *state, uint32_t requesting_node,
 		return;
 	}
 
-	while (take_mutex_try(&state->header_lock))
+	while (pthread_spin_trylock(&state->header_lock))
 		datawizard_progress(requesting_node);
 
 	/* first commit all changes onto the nodes specified by the mask */
@@ -59,7 +59,7 @@ void write_through_data(data_state *state, uint32_t requesting_node,
 		state->per_node[requesting_node].state = SHARED;
 	}
 
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 }
 
 void data_set_wb_mask(data_state *data, uint32_t wb_mask)

+ 10 - 10
src/drivers/gordon/driver_gordon.c

@@ -31,7 +31,7 @@ pthread_t progress_thread;
 pthread_cond_t progress_cond;
 pthread_mutex_t progress_mutex;
 
-struct starpu_mutex_t terminated_list_mutexes[32]; 
+pthread_spinlock_t terminated_list_mutexes[32]; 
 
 struct gordon_task_wrapper_s {
 	/* who has executed that ? */
@@ -205,13 +205,13 @@ static void handle_terminated_jobs(struct worker_set_s *arg)
 	unsigned spu;
 	for (spu = 0; spu < arg->nworkers; spu++)
 	{
-		take_mutex(&terminated_list_mutexes[spu]);
+		pthread_spin_lock(&terminated_list_mutexes[spu]);
 		handle_terminated_job_per_worker(&arg->workers[spu]);
-		release_mutex(&terminated_list_mutexes[spu]);
-		//if (!take_mutex_try(&terminated_list_mutexes[spu]))
+		pthread_spin_unlock(&terminated_list_mutexes[spu]);
+		//if (!pthread_spin_trylock(&terminated_list_mutexes[spu]))
 		//{
 		//	handle_terminated_job_per_worker(&arg->workers[spu]);
-		//	release_mutex(&terminated_list_mutexes[spu]);
+		//	pthread_spin_unlock(&terminated_list_mutexes[spu]);
 		//}
 	}
 }
@@ -237,7 +237,7 @@ static void gordon_callback_list_func(void *arg)
 	unsigned task_cnt = 0;
 
 	/* XXX 0 was hardcoded */
-	take_mutex(&terminated_list_mutexes[0]);
+	pthread_spin_lock(&terminated_list_mutexes[0]);
 	while (!job_list_empty(wrapper_list))
 	{
 		job_t j = job_list_pop_back(wrapper_list);
@@ -257,7 +257,7 @@ static void gordon_callback_list_func(void *arg)
 	/* the job list was allocated by the gordon driver itself */
 	job_list_delete(wrapper_list);
 
-	release_mutex(&terminated_list_mutexes[0]);
+	pthread_spin_unlock(&terminated_list_mutexes[0]);
 
 	wake_all_blocked_workers();
 	free(task_wrapper->gordon_job);
@@ -279,9 +279,9 @@ static void gordon_callback_func(void *arg)
 //	fprintf(stderr, "gordon callback : push job j %p\n", task_wrapper->j);
 
 	/* XXX 0 was hardcoded */
-	take_mutex(&terminated_list_mutexes[0]);
+	pthread_spin_lock(&terminated_list_mutexes[0]);
 	job_list_push_back(worker->terminated_jobs, task_wrapper->j);
-	release_mutex(&terminated_list_mutexes[0]);
+	pthread_spin_unlock(&terminated_list_mutexes[0]);
 	wake_all_blocked_workers();
 	free(task_wrapper);
 }
@@ -465,7 +465,7 @@ void *gordon_worker(void *arg)
 	unsigned spu;
 	for (spu = 0; spu < gordon_set_arg->nworkers; spu++)
 	{
-		init_mutex(&terminated_list_mutexes[spu]);
+		pthread_spin_init(&terminated_list_mutexes[spu], 0);
 	}