浏览代码

Do not reimplement spinlocks, use pthread_spinlock_t instead.

Cédric Augonnet 16 年之前
父节点
当前提交
ec8baec6b1

+ 0 - 1
Makefile.am

@@ -26,7 +26,6 @@ include_HEADERS = 				\
 	include/starpu_config.h			\
 	include/starpu-data-filters.h		\
 	include/starpu-data-interfaces.h	\
-	include/starpu-mutex.h			\
 	include/starpu-task.h			\
 	include/starpu-data.h			\
 	include/starpu-perfmodel.h		\

+ 0 - 33
include/starpu-mutex.h

@@ -1,33 +0,0 @@
-/*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __STARPU_MUTEX_H__
-#define __STARPU_MUTEX_H__
-
-#include <starpu_config.h>
-#include <stdint.h>
-
-typedef struct starpu_mutex_t {
-	/* we only have a trivial implementation yet ! */
-	volatile uint32_t taken __attribute__ ((aligned(16)));
-} starpu_mutex;
-
-void init_mutex(starpu_mutex *m);
-void take_mutex(starpu_mutex *m);
-int take_mutex_try(starpu_mutex *m);
-void release_mutex(starpu_mutex *m);
-
-#endif // __STARPU_MUTEX_H__

+ 2 - 2
include/starpu-perfmodel.h

@@ -18,8 +18,8 @@
 #define __STARPU_PERFMODEL_H__
 
 #include <stdio.h>
+#include <pthread.h>
 #include <starpu_config.h>
-#include <starpu-mutex.h>
 
 struct starpu_htbl32_node_s;
 struct starpu_history_list_t;
@@ -89,7 +89,7 @@ struct starpu_perfmodel_t {
 	unsigned is_loaded;
 	unsigned benchmarking;
 
-	starpu_mutex model_mutex;
+	pthread_spinlock_t model_mutex;
 };
 
 #endif // __STARPU_PERFMODEL_H__

+ 0 - 1
src/Makefile.am

@@ -84,7 +84,6 @@ libstarpu_la_SOURCES = 						\
 	common/malloc.c						\
 	common/hash.c 						\
 	common/htable32.c					\
-	common/mutex.c						\
 	common/rwlock.c						\
 	common/timing.c						\
 	core/jobs.c						\

+ 0 - 43
src/common/mutex.c

@@ -1,43 +0,0 @@
-/*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <starpu-mutex.h>
-
-void init_mutex(starpu_mutex *m)
-{
-	/* this is free at first */
-	m->taken = 0;
-}
-
-inline int take_mutex_try(starpu_mutex *m)
-{
-	uint32_t prev;
-	prev = __sync_lock_test_and_set(&m->taken, 1);
-	return (prev == 0)?0:-1;
-}
-
-inline void take_mutex(starpu_mutex *m)
-{
-	uint32_t prev;
-	do {
-		prev = __sync_lock_test_and_set(&m->taken, 1);
-	} while (prev);
-}
-
-inline void release_mutex(starpu_mutex *m)
-{
-	m->taken = 0;
-}

+ 8 - 8
src/core/dependencies/data-concurrency.c

@@ -63,7 +63,7 @@ unsigned attempt_to_submit_data_request_from_apps(data_state *data, starpu_acces
 {
 	unsigned ret;
 
-	take_mutex(&data->header_lock);
+	pthread_spin_lock(&data->header_lock);
 
 	if (data->refcnt == 0)
 	{
@@ -103,7 +103,7 @@ unsigned attempt_to_submit_data_request_from_apps(data_state *data, starpu_acces
 		}
 	}
 
-	release_mutex(&data->header_lock);
+	pthread_spin_unlock(&data->header_lock);
 	return ret;
 }
 
@@ -114,7 +114,7 @@ static unsigned attempt_to_submit_data_request_from_job(job_t j, unsigned buffer
 	data_state *data = j->task->buffers[buffer_index].handle;
 	starpu_access_mode mode = j->task->buffers[buffer_index].mode;
 
-	take_mutex(&data->header_lock);
+	pthread_spin_lock(&data->header_lock);
 
 	if (data->refcnt == 0)
 	{
@@ -154,7 +154,7 @@ static unsigned attempt_to_submit_data_request_from_job(job_t j, unsigned buffer
 		}
 	}
 
-	release_mutex(&data->header_lock);
+	pthread_spin_unlock(&data->header_lock);
 	return ret;
 }
 
@@ -189,7 +189,7 @@ unsigned submit_job_enforce_data_deps(job_t j)
 
 void notify_data_dependencies(data_state *data)
 {
-	take_mutex(&data->header_lock);
+	pthread_spin_lock(&data->header_lock);
 
 	data->refcnt--;
 
@@ -200,7 +200,7 @@ void notify_data_dependencies(data_state *data)
 
 		data->refcnt++;
 	
-		release_mutex(&data->header_lock);
+		pthread_spin_unlock(&data->header_lock);
 
 		if (r->is_requested_by_codelet)
 		{
@@ -217,10 +217,10 @@ void notify_data_dependencies(data_state *data)
 
 		data_requester_delete(r);
 		
-		take_mutex(&data->header_lock);
+		pthread_spin_lock(&data->header_lock);
 	}
 	
-	release_mutex(&data->header_lock);
+	pthread_spin_unlock(&data->header_lock);
 
 }
 

+ 33 - 30
src/core/dependencies/tags.c

@@ -24,9 +24,12 @@
 #include <starpu.h>
 
 static htbl_node_t *tag_htbl = NULL;
-static starpu_mutex tag_mutex = {
-	.taken = 0
-};
+pthread_spinlock_t tag_mutex;
+
+void initialize_tag_mutex(void)
+{
+	pthread_spin_init(&tag_mutex, 0);
+}
 
 static cg_t *create_cg(unsigned ntags, struct tag_s *tag, unsigned is_apps_cg)
 {
@@ -76,7 +79,7 @@ static struct tag_s *tag_init(starpu_tag_t id)
 	tag->succ = realloc(NULL, tag->succ_list_size*sizeof(struct _cg_t *));
 #endif
 
-	init_mutex(&tag->lock);
+	pthread_spin_init(&tag->lock, 0);
 
 	return tag;
 }
@@ -85,27 +88,27 @@ void starpu_tag_remove(starpu_tag_t id)
 {
 	struct tag_s *tag;
 
-	take_mutex(&tag_mutex);
+	pthread_spin_lock(&tag_mutex);
 
 	tag = htbl_remove_tag(tag_htbl, id);
 
-	release_mutex(&tag_mutex);
+	pthread_spin_unlock(&tag_mutex);
 
-	take_mutex(&tag->lock);
+	pthread_spin_lock(&tag->lock);
 	
 #ifdef DYNAMIC_DEPS_SIZE
 	if (tag)
 		free(tag->succ);
 #endif
 
-	release_mutex(&tag->lock);
+	pthread_spin_unlock(&tag->lock);
 
 	free(tag);
 }
 
 static struct tag_s *gettag_struct(starpu_tag_t id)
 {
-	take_mutex(&tag_mutex);
+	pthread_spin_lock(&tag_mutex);
 
 	/* search if the tag is already declared or not */
 	struct tag_s *tag;
@@ -121,7 +124,7 @@ static struct tag_s *gettag_struct(starpu_tag_t id)
 		STARPU_ASSERT(old == NULL);
 	}
 
-	release_mutex(&tag_mutex);
+	pthread_spin_unlock(&tag_mutex);
 
 	return tag;
 }
@@ -129,14 +132,14 @@ static struct tag_s *gettag_struct(starpu_tag_t id)
 /* lock should be taken */
 static void tag_set_ready(struct tag_s *tag)
 {
-//	take_mutex(&tag->lock);
+//	pthread_spin_lock(&tag->lock);
 
 	/* mark this tag as ready to run */
 	tag->state = READY;
 	/* declare it to the scheduler ! */
 	struct job_s *j = tag->job;
 
-//	release_mutex(&tag->lock);
+//	pthread_spin_unlock(&tag->lock);
 
 #ifdef NO_DATA_RW_LOCK
 	/* enforce data dependencies */
@@ -165,14 +168,14 @@ static void notify_cg(cg_t *cg)
 		}
 		else
 		{
-//			take_mutex(&cg->tag->lock);
+//			pthread_spin_lock(&cg->tag->lock);
 			struct tag_s *tag = cg->tag;
 			tag->ndeps_completed++;
 
 			if ((tag->state == BLOCKED) 
 				&& (tag->ndeps == tag->ndeps_completed))
 				tag_set_ready(cg->tag);
-//			release_mutex(&cg->tag->lock);
+//			pthread_spin_unlock(&cg->tag->lock);
 
 			free(cg);
 		}
@@ -208,7 +211,7 @@ static void tag_add_succ(struct tag_s *tag, cg_t *cg)
 		tag->succ[index] = cg;
 	}
 
-	release_mutex(&tag->lock);
+	pthread_spin_unlock(&tag->lock);
 }
 
 void notify_dependencies(struct job_s *j)
@@ -223,7 +226,7 @@ void notify_dependencies(struct job_s *j)
 		/* in case there are dependencies, wake up the proper tasks */
 		tag = j->tag;
 
-		take_mutex(&tag->lock);
+		pthread_spin_lock(&tag->lock);
 
 		tag->state = DONE;
 		TRACE_TASK_DONE(tag->id);
@@ -237,15 +240,15 @@ void notify_dependencies(struct job_s *j)
 			struct tag_s *cgtag = cg->tag;
 
 			if (!used_by_apps)
-				take_mutex(&cgtag->lock);
+				pthread_spin_lock(&cgtag->lock);
 
 			notify_cg(cg);
 
 			if (!used_by_apps)
-				release_mutex(&cgtag->lock);
+				pthread_spin_unlock(&cgtag->lock);
 		}
 
-		release_mutex(&tag->lock);
+		pthread_spin_unlock(&tag->lock);
 	}
 }
 
@@ -271,7 +274,7 @@ void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t
 	/* create the associated completion group */
 	struct tag_s *tag_child = gettag_struct(id);
 
-	take_mutex(&tag_child->lock);
+	pthread_spin_lock(&tag_child->lock);
 
 	cg_t *cg = create_cg(ndeps, tag_child, 0);
 
@@ -285,12 +288,12 @@ void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t
 		 * so cg should be among dep_id's successors*/
 		TRACE_CODELET_TAG_DEPS(id, dep_id);
 		struct tag_s *tag_dep = gettag_struct(dep_id);
-		take_mutex(&tag_dep->lock);
+		pthread_spin_lock(&tag_dep->lock);
 		tag_add_succ(tag_dep, cg);
-		release_mutex(&tag_dep->lock);
+		pthread_spin_unlock(&tag_dep->lock);
 	}
 
-	release_mutex(&tag_child->lock);
+	pthread_spin_unlock(&tag_child->lock);
 }
 
 void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
@@ -300,7 +303,7 @@ void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
 	/* create the associated completion group */
 	struct tag_s *tag_child = gettag_struct(id);
 
-	take_mutex(&tag_child->lock);
+	pthread_spin_lock(&tag_child->lock);
 
 	cg_t *cg = create_cg(ndeps, tag_child, 0);
 
@@ -318,13 +321,13 @@ void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
 		 * so cg should be among dep_id's successors*/
 		TRACE_CODELET_TAG_DEPS(id, dep_id);
 		struct tag_s *tag_dep = gettag_struct(dep_id);
-		take_mutex(&tag_dep->lock);
+		pthread_spin_lock(&tag_dep->lock);
 		tag_add_succ(tag_dep, cg);
-		release_mutex(&tag_dep->lock);
+		pthread_spin_unlock(&tag_dep->lock);
 	}
 	va_end(pa);
 
-	release_mutex(&tag_child->lock);
+	pthread_spin_unlock(&tag_child->lock);
 }
 
 /* this function may be called by the application (outside callbacks !) */
@@ -340,12 +343,12 @@ void starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 	{
 		struct tag_s *tag = gettag_struct(id[i]);
 		
-		take_mutex(&tag->lock);
+		pthread_spin_lock(&tag->lock);
 
 		if (tag->state == DONE)
 		{
 			/* that tag is done already */
-			release_mutex(&tag->lock);
+			pthread_spin_unlock(&tag->lock);
 		}
 		else
 		{
@@ -366,7 +369,7 @@ void starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 	for (i = 0; i < current; i++)
 	{
 		tag_add_succ(tag_array[i], cg);
-		release_mutex(&tag_array[i]->lock);
+		pthread_spin_unlock(&tag_array[i]->lock);
 	}
 
 	pthread_mutex_lock(&cg->cg_mutex);

+ 4 - 2
src/core/dependencies/tags.h

@@ -18,7 +18,7 @@
 #define __TAGS_H__
 
 #include <stdint.h>
-#include <starpu-mutex.h>
+#include <pthread.h>
 #include <core/jobs.h>
 
 /* we do not necessarily want to allocate room for 256 dependencies, but we
@@ -53,7 +53,7 @@ typedef enum {
 struct job_s;
 
 struct tag_s {
-	starpu_mutex lock;
+	pthread_spinlock_t lock;
 	starpu_tag_t id; /* an identifier for the task */
 	tag_state state;
 	unsigned nsuccs; /* how many successors ? */
@@ -85,6 +85,8 @@ typedef struct _cg_t {
 	pthread_cond_t cg_cond;
 } cg_t;
 
+void initialize_tag_mutex(void);
+
 void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);
 
 void notify_dependencies(struct job_s *j);

+ 2 - 2
src/core/jobs.c

@@ -165,7 +165,7 @@ static unsigned not_all_task_deps_are_fulfilled(job_t j)
 
 	struct tag_s *tag = j->tag;
 
-	take_mutex(&tag->lock);
+	pthread_spin_lock(&tag->lock);
 
 	if (tag->ndeps != tag->ndeps_completed)
 	{
@@ -178,7 +178,7 @@ static unsigned not_all_task_deps_are_fulfilled(job_t j)
 		ret = 0;
 	}
 
-	release_mutex(&tag->lock);
+	pthread_spin_unlock(&tag->lock);
 	return ret;
 }
 

+ 1 - 1
src/core/perfmodel/perfmodel.h

@@ -22,7 +22,7 @@
 //#include <core/jobs.h>
 #include <common/htable32.h>
 //#include <core/workers.h>
-#include <starpu-mutex.h>
+#include <pthread.h>
 #include <stdio.h>
 
 struct starpu_buffer_descr_t;

+ 9 - 9
src/core/perfmodel/perfmodel_history.c

@@ -22,7 +22,7 @@
 #include <core/perfmodel/perfmodel.h>
 #include <core/jobs.h>
 #include <core/workers.h>
-#include <starpu-mutex.h>
+#include <pthread.h>
 #include <datawizard/datawizard.h>
 #include <core/perfmodel/regression.h>
 #include <common/config.h>
@@ -307,7 +307,7 @@ void load_history_based_model(struct starpu_perfmodel_t *model, unsigned scan_hi
 
 	/* XXX we assume the lock is implicitely initialized (taken = 0) */
 	//init_mutex(&model->model_mutex);
-	take_mutex(&model->model_mutex);
+	pthread_spin_lock(&model->model_mutex);
 
 	/* perhaps some other thread got in before ... */
 	if (!model->is_loaded)
@@ -364,7 +364,7 @@ void load_history_based_model(struct starpu_perfmodel_t *model, unsigned scan_hi
 		model->is_loaded = 1;
 	}
 
-	release_mutex(&model->model_mutex);
+	pthread_spin_unlock(&model->model_mutex);
 }
 
 double regression_based_job_expected_length(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct job_s *j)
@@ -405,9 +405,9 @@ double history_based_job_expected_length(struct starpu_perfmodel_t *model, enum
 	if (!history)
 		return -1.0;
 
-	take_mutex(&model->model_mutex);
+	pthread_spin_lock(&model->model_mutex);
 	entry = htbl_search_32(history, key);
-	release_mutex(&model->model_mutex);
+	pthread_spin_unlock(&model->model_mutex);
 
 	exp = entry?entry->mean:-1.0;
 
@@ -439,7 +439,7 @@ void update_perfmodel_history(job_t j, enum starpu_perf_archtype arch, double me
 			reg_model = &per_arch_model->regression;
 			list = &per_arch_model->list;
 
-			take_mutex(&model->model_mutex);
+			pthread_spin_lock(&model->model_mutex);
 	
 				entry = htbl_search_32(history, key);
 	
@@ -494,13 +494,13 @@ void update_perfmodel_history(job_t j, enum starpu_perf_archtype arch, double me
 			reg_model->beta = num/denom;
 			reg_model->alpha = expl((reg_model->sumlny - reg_model->beta*reg_model->sumlnx)/n);
 			
-			release_mutex(&model->model_mutex);
+			pthread_spin_unlock(&model->model_mutex);
 		}
 
 #ifdef MODEL_DEBUG
 		FILE * debug_file = per_arch_model->debug_file;
 
-		take_mutex(&model->model_mutex);
+		pthread_spin_lock(&model->model_mutex);
 
 		STARPU_ASSERT(j->footprint_is_computed);
 
@@ -519,7 +519,7 @@ void update_perfmodel_history(job_t j, enum starpu_perf_archtype arch, double me
 		fprintf(debug_file, "\n");	
 
 
-		release_mutex(&model->model_mutex);
+		pthread_spin_unlock(&model->model_mutex);
 #endif
 	}
 }

+ 1 - 1
src/core/policies/sched_policy.c

@@ -121,7 +121,7 @@ void init_sched_policy(struct machine_config_s *config, struct starpu_conf *user
 	pthread_cond_init(&policy.sched_activity_cond, NULL);
 	pthread_mutex_init(&policy.sched_activity_mutex, NULL);
 	pthread_key_create(&policy.local_queue_key, NULL);
-	init_mutex(&descr.attached_queues_mutex);
+	pthread_spin_init(&descr.attached_queues_mutex, 0);
 	descr.total_queues_count = 0;
 
 	policy.init_sched(config, &policy);

+ 6 - 4
src/core/workers.c

@@ -369,6 +369,8 @@ void starpu_init(struct starpu_conf *user_conf)
 
 	init_workers_binding(&config);
 
+	initialize_tag_mutex();
+
 	/* initialize the scheduler */
 
 	/* initialize the queue containing the jobs */
@@ -445,7 +447,7 @@ static void operate_on_all_queues_attached_to_node(unsigned nodeid, queue_op op)
 	unsigned q_id;
 	struct jobq_s *q;
 
-	take_mutex(&descr.attached_queues_mutex);
+	pthread_spin_lock(&descr.attached_queues_mutex);
 
 	unsigned nqueues = descr.queues_count[nodeid];
 
@@ -465,7 +467,7 @@ static void operate_on_all_queues_attached_to_node(unsigned nodeid, queue_op op)
 		}
 	}
 
-	release_mutex(&descr.attached_queues_mutex);
+	pthread_spin_unlock(&descr.attached_queues_mutex);
 }
 
 inline void lock_all_queues_attached_to_node(unsigned node)
@@ -488,7 +490,7 @@ static void operate_on_all_queues(queue_op op)
 	unsigned q_id;
 	struct jobq_s *q;
 
-	take_mutex(&descr.attached_queues_mutex);
+	pthread_spin_lock(&descr.attached_queues_mutex);
 
 	unsigned nqueues = descr.total_queues_count;
 
@@ -508,7 +510,7 @@ static void operate_on_all_queues(queue_op op)
 		}
 	}
 
-	release_mutex(&descr.attached_queues_mutex);
+	pthread_spin_unlock(&descr.attached_queues_mutex);
 }
 
 static void kill_all_workers(struct machine_config_s *config)

+ 19 - 19
src/datawizard/coherency.c

@@ -96,7 +96,7 @@ static void update_data_state(data_state *state, uint32_t requesting_node,
 int _fetch_data(data_state *state, uint32_t requesting_node,
 			uint8_t read, uint8_t write)
 {
-	while (take_mutex_try(&state->header_lock)) {
+	while (pthread_spin_trylock(&state->header_lock)) {
 		datawizard_progress(requesting_node);
 	}
 
@@ -107,7 +107,7 @@ int _fetch_data(data_state *state, uint32_t requesting_node,
 	if ((local_state == OWNER) || (local_state == SHARED && !write))
 	{
 		/* the local node already got its data */
-		release_mutex(&state->header_lock);
+		pthread_spin_unlock(&state->header_lock);
 		msi_cache_hit(requesting_node);
 		return 0;
 	}
@@ -126,7 +126,7 @@ int _fetch_data(data_state *state, uint32_t requesting_node,
 
 		}
 		
-		release_mutex(&state->header_lock);
+		pthread_spin_unlock(&state->header_lock);
 		msi_cache_hit(requesting_node);
 		return 0;
 	}
@@ -150,13 +150,13 @@ int _fetch_data(data_state *state, uint32_t requesting_node,
 
 	update_data_state(state, requesting_node, write);
 
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 
 	return 0;
 
 enomem:
 	/* there was not enough local memory to fetch the data */
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 	return -ENOMEM;
 }
 
@@ -181,11 +181,11 @@ static int fetch_data(data_state *state, starpu_access_mode mode)
 	}
 #endif
 
-	while (take_mutex_try(&state->header_lock))
+	while (pthread_spin_trylock(&state->header_lock))
 		datawizard_progress(requesting_node);
 
 	state->per_node[requesting_node].refcnt++;
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 
 	ret = _fetch_data(state, requesting_node, read, write);
 	if (ret != 0)
@@ -194,11 +194,11 @@ static int fetch_data(data_state *state, starpu_access_mode mode)
 	return 0;
 enomem:
 	/* we did not get the data so remove the lock anyway */
-	while (take_mutex_try(&state->header_lock))
+	while (pthread_spin_trylock(&state->header_lock))
 		datawizard_progress(requesting_node);
 
 	state->per_node[requesting_node].refcnt--;
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 
 #ifndef NO_DATA_RW_LOCK
 	release_rw_lock(&state->data_lock);
@@ -229,11 +229,11 @@ static void release_data(data_state *state, uint32_t default_wb_mask)
 		write_through_data(state, requesting_node, wb_mask);
 	}
 
-	while (take_mutex_try(&state->header_lock))
+	while (pthread_spin_trylock(&state->header_lock))
 		datawizard_progress(requesting_node);
 
 	state->per_node[requesting_node].refcnt--;
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 
 #ifndef NO_DATA_RW_LOCK
 	/* this is intended to make data accessible again */
@@ -294,7 +294,7 @@ void push_codelet_output(starpu_buffer_descr *descrs, unsigned nbuffers, uint32_
 
 int request_data_allocation(data_state *state, uint32_t node)
 {
-	take_mutex(&state->header_lock);
+	pthread_spin_lock(&state->header_lock);
 
 	int ret;
 	ret = allocate_per_node_buffer(state, node);
@@ -303,7 +303,7 @@ int request_data_allocation(data_state *state, uint32_t node)
 	/* XXX quick and dirty hack */
 	state->per_node[node].automatically_allocated = 0;	
 
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 
 	return 0;
 }
@@ -382,7 +382,7 @@ void starpu_sync_data_with_mem(data_state *state)
 
 static inline void do_notify_data_modification(data_state *state, uint32_t modifying_node)
 {
-	take_mutex(&state->header_lock);
+	pthread_spin_lock(&state->header_lock);
 
 	unsigned node = 0;
 	for (node = 0; node < MAXNODES; node++)
@@ -391,7 +391,7 @@ static inline void do_notify_data_modification(data_state *state, uint32_t modif
 			(node == modifying_node?OWNER:INVALID);
 	}
 
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 }
 
 #ifdef NO_DATA_RW_LOCK
@@ -453,13 +453,13 @@ unsigned is_data_present_or_requested(data_state *state, uint32_t node)
 	unsigned ret = 0;
 
 // XXX : this is just a hint, so we don't take the lock ...
-//	take_mutex(&state->header_lock);
+//	pthread_spin_lock(&state->header_lock);
 
 	if (state->per_node[node].state != INVALID 
 		|| state->per_node[node].requested)
 		ret = 1;
 
-//	release_mutex(&state->header_lock);
+//	pthread_spin_unlock(&state->header_lock);
 
 	return ret;
 }
@@ -467,10 +467,10 @@ unsigned is_data_present_or_requested(data_state *state, uint32_t node)
 inline void set_data_requested_flag_if_needed(data_state *state, uint32_t node)
 {
 // XXX : this is just a hint, so we don't take the lock ...
-//	take_mutex(&state->header_lock);
+//	pthread_spin_lock(&state->header_lock);
 
 	if (state->per_node[node].state == INVALID) 
 		state->per_node[node].requested = 1;
 
-//	release_mutex(&state->header_lock);
+//	pthread_spin_unlock(&state->header_lock);
 }

+ 2 - 2
src/datawizard/coherency.h

@@ -26,7 +26,7 @@
 
 #include <starpu.h>
 
-#include <starpu-mutex.h>
+#include <pthread.h>
 #include <common/rwlock.h>
 #include <common/timing.h>
 #include <common/fxt.h>
@@ -106,7 +106,7 @@ typedef struct starpu_data_state_t {
 	rw_lock	data_lock;
 #endif
 	/* protect meta data */
-	starpu_mutex header_lock;
+	pthread_spinlock_t header_lock;
 
 	uint32_t nnodes; /* the number of memory nodes that may use it */
 	struct starpu_data_state_t *children;

+ 2 - 2
src/datawizard/copy-driver.c

@@ -30,7 +30,7 @@ void wake_all_blocked_workers_on_node(unsigned nodeid)
 	/* wake up all queues on that node */
 	unsigned q_id;
 
-	take_mutex(&descr.attached_queues_mutex);
+	pthread_spin_lock(&descr.attached_queues_mutex);
 
 	unsigned nqueues = descr.queues_count[nodeid];
 	for (q_id = 0; q_id < nqueues; q_id++)
@@ -44,7 +44,7 @@ void wake_all_blocked_workers_on_node(unsigned nodeid)
 		pthread_mutex_unlock(&q->activity_mutex);
 	}
 
-	release_mutex(&descr.attached_queues_mutex);
+	pthread_spin_unlock(&descr.attached_queues_mutex);
 }
 
 void wake_all_blocked_workers(void)

+ 2 - 2
src/datawizard/data_request.c

@@ -70,7 +70,7 @@ int post_data_request(data_state *state, uint32_t src_node, uint32_t dst_node)
 	 * TODO: handle the situation of a possible invalidation caused by
 	 * memory eviction mechanism. This could be done by the means of a
 	 * specific state (or flag) in the MSI protocol. */
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 #endif
 
 //	/* wait for the request to be performed */
@@ -96,7 +96,7 @@ int post_data_request(data_state *state, uint32_t src_node, uint32_t dst_node)
 	pthread_mutex_unlock(&data_requests_list_mutex[src_node]);
 
 #ifdef NO_DATA_RW_LOCK
-	take_mutex(&state->header_lock);
+	pthread_spin_lock(&state->header_lock);
 #endif
 
 	retvalue = r->retval;

+ 10 - 10
src/datawizard/hierarchy.c

@@ -51,10 +51,10 @@ void register_new_data(data_state *state, uint32_t home_node, uint32_t wb_mask)
 	state->req_list = data_requester_list_new();
 	state->refcnt = 0;
 #endif
-	init_mutex(&state->header_lock);
+	pthread_spin_init(&state->header_lock, 0);
 
 	/* first take care to properly lock the data */
-	take_mutex(&state->header_lock);
+	pthread_spin_lock(&state->header_lock);
 
 	/* we assume that all nodes may use that data */
 	state->nnodes = MAXNODES;
@@ -90,7 +90,7 @@ void register_new_data(data_state *state, uint32_t home_node, uint32_t wb_mask)
 	}
 
 	/* now the data is available ! */
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 }
 
 /*
@@ -167,7 +167,7 @@ void starpu_partition_data(data_state *initial_data, starpu_filter *f)
 	int i;
 
 	/* first take care to properly lock the data header */
-	take_mutex(&initial_data->header_lock);
+	pthread_spin_lock(&initial_data->header_lock);
 
 	/* there should not be mutiple filters applied on the same data */
 	STARPU_ASSERT(initial_data->nchildren == 0);
@@ -202,7 +202,7 @@ void starpu_partition_data(data_state *initial_data, starpu_filter *f)
 		children->req_list = data_requester_list_new();
 		children->refcnt = 0;
 #endif
-		init_mutex(&children->header_lock);
+		pthread_spin_init(&children->header_lock, 0);
 
 		unsigned node;
 		for (node = 0; node < MAXNODES; node++)
@@ -217,7 +217,7 @@ void starpu_partition_data(data_state *initial_data, starpu_filter *f)
 	}
 
 	/* now let the header */
-	release_mutex(&initial_data->header_lock);
+	pthread_spin_unlock(&initial_data->header_lock);
 }
 
 void starpu_unpartition_data(data_state *root_data, uint32_t gathering_node)
@@ -225,7 +225,7 @@ void starpu_unpartition_data(data_state *root_data, uint32_t gathering_node)
 	int child;
 	unsigned node;
 
-	take_mutex(&root_data->header_lock);
+	pthread_spin_lock(&root_data->header_lock);
 
 #ifdef NO_DATA_RW_LOCK
 #warning starpu_unpartition_data is not supported with NO_DATA_RW_LOCK yet ...
@@ -302,13 +302,13 @@ void starpu_unpartition_data(data_state *root_data, uint32_t gathering_node)
 	root_data->nchildren = 0;
 
 	/* now the parent may be used again so we release the lock */
-	release_mutex(&root_data->header_lock);
+	pthread_spin_unlock(&root_data->header_lock);
 }
 
 void starpu_advise_if_data_is_important(data_state *state, unsigned is_important)
 {
 
-	take_mutex(&state->header_lock);
+	pthread_spin_lock(&state->header_lock);
 
 	/* first take all the children lock (in order !) */
 	int child;
@@ -322,6 +322,6 @@ void starpu_advise_if_data_is_important(data_state *state, unsigned is_important
 	state->is_not_important = !is_important;
 
 	/* now the parent may be used again so we release the lock */
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 
 }

+ 15 - 15
src/datawizard/memalloc.c

@@ -18,7 +18,7 @@
 #include <datawizard/footprint.h>
 
 extern mem_node_descr descr;
-static starpu_mutex mc_mutex[MAXNODES]; 
+pthread_spinlock_t mc_mutex[MAXNODES]; 
 static mem_chunk_list_t mc_list[MAXNODES];
 static mem_chunk_list_t mc_list_to_free[MAXNODES];
 
@@ -29,7 +29,7 @@ void init_mem_chunk_lists(void)
 	unsigned i;
 	for (i = 0; i < MAXNODES; i++)
 	{
-		init_mutex(&mc_mutex[i]);
+		pthread_spin_init(&mc_mutex[i], 0);
 		mc_list[i] = mem_chunk_list_new();
 		mc_list_to_free[i] = mem_chunk_list_new();
 	}
@@ -50,7 +50,7 @@ static void lock_all_subtree(data_state *data)
 	if (data->nchildren == 0)
 	{
 		/* this is a leaf */	
-		while (take_mutex_try(&data->header_lock))
+		while (pthread_spin_trylock(&data->header_lock))
 			datawizard_progress(get_local_memory_node());
 	}
 	else {
@@ -68,7 +68,7 @@ static void unlock_all_subtree(data_state *data)
 	if (data->nchildren == 0)
 	{
 		/* this is a leaf */	
-		release_mutex(&data->header_lock);
+		pthread_spin_unlock(&data->header_lock);
 	}
 	else {
 		/* lock all sub-subtrees children */
@@ -291,7 +291,7 @@ static unsigned try_to_reuse_mem_chunk(mem_chunk_t mc, unsigned node, data_state
  * list of mem chunk that need to be liberated */
 static unsigned try_to_find_reusable_mem_chunk(unsigned node, data_state *data, uint32_t footprint)
 {
-	take_mutex(&mc_mutex[node]);
+	pthread_spin_lock(&mc_mutex[node]);
 
 	/* go through all buffers for which there was a removal request */
 	mem_chunk_t mc, next_mc;
@@ -312,7 +312,7 @@ static unsigned try_to_find_reusable_mem_chunk(unsigned node, data_state *data,
 			{
 				reuse_mem_chunk(node, data, mc, 0);
 
-				release_mutex(&mc_mutex[node]);
+				pthread_spin_unlock(&mc_mutex[node]);
 				return 1;
 			}
 		}
@@ -334,13 +334,13 @@ static unsigned try_to_find_reusable_mem_chunk(unsigned node, data_state *data,
 //			fprintf(stderr, "found a candidate ...\n");
 			if (try_to_reuse_mem_chunk(mc, node, data, 1))
 			{
-				release_mutex(&mc_mutex[node]);
+				pthread_spin_unlock(&mc_mutex[node]);
 				return 1;
 			}
 		}
 	}
 
-	release_mutex(&mc_mutex[node]);
+	pthread_spin_unlock(&mc_mutex[node]);
 
 	return 0;
 }
@@ -356,7 +356,7 @@ static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unu
 
 	size_t liberated = 0;
 
-	take_mutex(&mc_mutex[node]);
+	pthread_spin_lock(&mc_mutex[node]);
 
 	/* remove all buffers for which there was a removal request */
 	mem_chunk_t mc, next_mc;
@@ -392,7 +392,7 @@ static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unu
 
 //	fprintf(stderr, "got %d MB back\n", (int)liberated/(1024*1024));
 
-	release_mutex(&mc_mutex[node]);
+	pthread_spin_unlock(&mc_mutex[node]);
 
 	return liberated;
 }
@@ -414,14 +414,14 @@ static void register_mem_chunk(data_state *state, uint32_t dst_node, size_t size
 	/* the interface was already filled by ops->allocate_data_on_node */
 	memcpy(&mc->interface, &state->interface[dst_node], sizeof(starpu_data_interface_t));
 
-	take_mutex(&mc_mutex[dst_node]);
+	pthread_spin_lock(&mc_mutex[dst_node]);
 	mem_chunk_list_push_front(mc_list[dst_node], mc);
-	release_mutex(&mc_mutex[dst_node]);
+	pthread_spin_unlock(&mc_mutex[dst_node]);
 }
 
 void request_mem_chunk_removal(data_state *state, unsigned node)
 {
-	take_mutex(&mc_mutex[node]);
+	pthread_spin_lock(&mc_mutex[node]);
 
 	/* iterate over the list of memory chunks and remove the entry */
 	mem_chunk_t mc, next_mc;
@@ -441,7 +441,7 @@ void request_mem_chunk_removal(data_state *state, unsigned node)
 			/* put it in the list of buffers to be removed */
 			mem_chunk_list_push_front(mc_list_to_free[node], mc);
 
-			release_mutex(&mc_mutex[node]);
+			pthread_spin_unlock(&mc_mutex[node]);
 
 			return;
 		}
@@ -449,7 +449,7 @@ void request_mem_chunk_removal(data_state *state, unsigned node)
 
 	/* there was no corresponding buffer ... */
 
-	release_mutex(&mc_mutex[node]);
+	pthread_spin_unlock(&mc_mutex[node]);
 }
 
 static size_t liberate_memory_on_node(mem_chunk_t mc, uint32_t node)

+ 4 - 4
src/datawizard/memory_nodes.c

@@ -97,7 +97,7 @@ void memory_node_attach_queue(struct jobq_s *q, unsigned nodeid)
 	unsigned queue;
 	unsigned nqueues_total, nqueues;
 	
-	take_mutex(&descr.attached_queues_mutex);
+	pthread_spin_lock(&descr.attached_queues_mutex);
 
 	/* we only insert the queue if it's not already in the list */
 	nqueues = descr.queues_count[nodeid];
@@ -106,7 +106,7 @@ void memory_node_attach_queue(struct jobq_s *q, unsigned nodeid)
 		if (descr.attached_queues_per_node[nodeid][queue] == q)
 		{
 			/* the queue is already in the list */
-			release_mutex(&descr.attached_queues_mutex);
+			pthread_spin_unlock(&descr.attached_queues_mutex);
 			return;
 		}
 	}
@@ -122,7 +122,7 @@ void memory_node_attach_queue(struct jobq_s *q, unsigned nodeid)
 		if (descr.attached_queues_all[queue] == q)
 		{
 			/* the queue is already in the global list */
-			release_mutex(&descr.attached_queues_mutex);
+			pthread_spin_unlock(&descr.attached_queues_mutex);
 			return;
 		}
 	} 
@@ -131,7 +131,7 @@ void memory_node_attach_queue(struct jobq_s *q, unsigned nodeid)
 	descr.attached_queues_all[nqueues_total] = q;
 	descr.total_queues_count++;
 
-	release_mutex(&descr.attached_queues_mutex);
+	pthread_spin_unlock(&descr.attached_queues_mutex);
 }
 
 

+ 1 - 1
src/datawizard/memory_nodes.h

@@ -38,7 +38,7 @@ typedef struct {
 	/* the list of queues that are attached to a given node */
 	// XXX 32 is set randomly !
 	// TODO move this 2 lists outside mem_node_descr
-	struct starpu_mutex_t attached_queues_mutex;
+	pthread_spinlock_t attached_queues_mutex;
 	struct jobq_s *attached_queues_per_node[MAXNODES][32];
 	struct jobq_s *attached_queues_all[MAXNODES*32];
 	/* the number of queues attached to each node */

+ 3 - 3
src/datawizard/progress.c

@@ -23,7 +23,7 @@ extern pthread_key_t local_workers_key;
 
 #ifdef USE_GORDON
 extern void handle_terminated_job_per_worker(struct worker_s *worker);
-extern struct starpu_mutex_t terminated_list_mutexes[32]; 
+extern pthread_spinlock_t terminated_list_mutexes[32]; 
 #endif
 
 void datawizard_progress(uint32_t memory_node)
@@ -40,9 +40,9 @@ void datawizard_progress(uint32_t memory_node)
 		unsigned worker;
 		for (worker = 0; worker < set->nworkers; worker++)
 		{
-			take_mutex(&terminated_list_mutexes[0]);
+			pthread_spin_lock(&terminated_list_mutexes[0]);
 			handle_terminated_job_per_worker(&set->workers[worker]);
-			release_mutex(&terminated_list_mutexes[0]);
+			pthread_spin_unlock(&terminated_list_mutexes[0]);
 		}
 	}
 #endif

+ 2 - 2
src/datawizard/write_back.c

@@ -25,7 +25,7 @@ void write_through_data(data_state *state, uint32_t requesting_node,
 		return;
 	}
 
-	while (take_mutex_try(&state->header_lock))
+	while (pthread_spin_trylock(&state->header_lock))
 		datawizard_progress(requesting_node);
 
 	/* first commit all changes onto the nodes specified by the mask */
@@ -59,7 +59,7 @@ void write_through_data(data_state *state, uint32_t requesting_node,
 		state->per_node[requesting_node].state = SHARED;
 	}
 
-	release_mutex(&state->header_lock);
+	pthread_spin_unlock(&state->header_lock);
 }
 
 void data_set_wb_mask(data_state *data, uint32_t wb_mask)

+ 10 - 10
src/drivers/gordon/driver_gordon.c

@@ -31,7 +31,7 @@ pthread_t progress_thread;
 pthread_cond_t progress_cond;
 pthread_mutex_t progress_mutex;
 
-struct starpu_mutex_t terminated_list_mutexes[32]; 
+pthread_spinlock_t terminated_list_mutexes[32]; 
 
 struct gordon_task_wrapper_s {
 	/* who has executed that ? */
@@ -205,13 +205,13 @@ static void handle_terminated_jobs(struct worker_set_s *arg)
 	unsigned spu;
 	for (spu = 0; spu < arg->nworkers; spu++)
 	{
-		take_mutex(&terminated_list_mutexes[spu]);
+		pthread_spin_lock(&terminated_list_mutexes[spu]);
 		handle_terminated_job_per_worker(&arg->workers[spu]);
-		release_mutex(&terminated_list_mutexes[spu]);
-		//if (!take_mutex_try(&terminated_list_mutexes[spu]))
+		pthread_spin_unlock(&terminated_list_mutexes[spu]);
+		//if (!pthread_spin_trylock(&terminated_list_mutexes[spu]))
 		//{
 		//	handle_terminated_job_per_worker(&arg->workers[spu]);
-		//	release_mutex(&terminated_list_mutexes[spu]);
+		//	pthread_spin_unlock(&terminated_list_mutexes[spu]);
 		//}
 	}
 }
@@ -237,7 +237,7 @@ static void gordon_callback_list_func(void *arg)
 	unsigned task_cnt = 0;
 
 	/* XXX 0 was hardcoded */
-	take_mutex(&terminated_list_mutexes[0]);
+	pthread_spin_lock(&terminated_list_mutexes[0]);
 	while (!job_list_empty(wrapper_list))
 	{
 		job_t j = job_list_pop_back(wrapper_list);
@@ -257,7 +257,7 @@ static void gordon_callback_list_func(void *arg)
 	/* the job list was allocated by the gordon driver itself */
 	job_list_delete(wrapper_list);
 
-	release_mutex(&terminated_list_mutexes[0]);
+	pthread_spin_unlock(&terminated_list_mutexes[0]);
 
 	wake_all_blocked_workers();
 	free(task_wrapper->gordon_job);
@@ -279,9 +279,9 @@ static void gordon_callback_func(void *arg)
 //	fprintf(stderr, "gordon callback : push job j %p\n", task_wrapper->j);
 
 	/* XXX 0 was hardcoded */
-	take_mutex(&terminated_list_mutexes[0]);
+	pthread_spin_lock(&terminated_list_mutexes[0]);
 	job_list_push_back(worker->terminated_jobs, task_wrapper->j);
-	release_mutex(&terminated_list_mutexes[0]);
+	pthread_spin_unlock(&terminated_list_mutexes[0]);
 	wake_all_blocked_workers();
 	free(task_wrapper);
 }
@@ -465,7 +465,7 @@ void *gordon_worker(void *arg)
 	unsigned spu;
 	for (spu = 0; spu < gordon_set_arg->nworkers; spu++)
 	{
-		init_mutex(&terminated_list_mutexes[spu]);
+		pthread_spin_init(&terminated_list_mutexes[spu], 0);
 	}