Browse Source

merge trunk

Corentin Salingue 12 years ago
parent
commit
e75bbec0cf

+ 2 - 0
ChangeLog

@@ -152,6 +152,8 @@ Small features:
   * New function starpu_get_version() to return as 3 integers the
   * New function starpu_get_version() to return as 3 integers the
     release version of StarPU.
     release version of StarPU.
   * Enable by default data allocation cache
   * Enable by default data allocation cache
+  * Explicitly name the non-sleeping-non-running time "Overhead", and use
+    another color in vite traces.
 
 
 Changes:
 Changes:
   * Rename all filter functions to follow the pattern
   * Rename all filter functions to follow the pattern

+ 1 - 1
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -325,7 +325,7 @@ purposes.
 This field has been deprecated. One should instead use the
 This field has been deprecated. One should instead use the
 field starpu_task::handles to specify the data handles accessed
 field starpu_task::handles to specify the data handles accessed
 by the task. The access modes are now defined in the field
 by the task. The access modes are now defined in the field
-starpu_codelet::mode.
+starpu_codelet::modes.
 \var starpu_task::handles
 \var starpu_task::handles
 Is an array of ::starpu_data_handle_t. It specifies the handles to the
 Is an array of ::starpu_data_handle_t. It specifies the handles to the
 different pieces of data accessed by the task. The number of entries
 different pieces of data accessed by the task. The number of entries

+ 14 - 0
doc/doxygen/chapters/tips_and_tricks.doxy

@@ -95,4 +95,18 @@ Or add the following line in the file <c>/etc/sysctl.conf</c>
 security.models.extensions.user_set_cpu_affinity=1
 security.models.extensions.user_set_cpu_affinity=1
 \endverbatim
 \endverbatim
 
 
+\section UsingStarPUWithMKL Using StarPU With MKL 11 (Intel Composer XE 2013)
+
+Some users had issues with MKL 11 and StarPU (versions 1.1rc1 and
+1.0.5) on Linux when using a single thread for MKL and doing all the
+parallelism with StarPU (no multithreaded tasks), i.e. when setting the
+environment variable MKL_NUM_THREADS to 1 and linking against the
+threaded MKL library with iomp5.
+
+Using this configuration, StarPU uses only 1 core, no matter the value of
+\ref STARPU_NCPU. The problem is actually a thread pinning issue with MKL.
+
+The solution is to set the environment variable KMP_AFFINITY to <c>disabled</c>
+(http://software.intel.com/sites/products/documentation/studio/composer/en-us/2011Update/compiler_c/optaps/common/optaps_openmp_thread_affinity.htm).
+
 */
 */

+ 1 - 7
src/core/dependencies/implicit_data_deps.c

@@ -481,12 +481,8 @@ void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
 	struct _starpu_task_wrapper_list *post_sync_tasks = NULL;
 	struct _starpu_task_wrapper_list *post_sync_tasks = NULL;
 	unsigned do_submit_tasks = 0;
 	unsigned do_submit_tasks = 0;
 
 
-	STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
-
-	if (handle->sequential_consistency)
+	if (handle->post_sync_tasks_cnt > 0)
 	{
 	{
-		STARPU_ASSERT(handle->post_sync_tasks_cnt > 0);
-
 		if (--handle->post_sync_tasks_cnt == 0)
 		if (--handle->post_sync_tasks_cnt == 0)
 		{
 		{
 			/* unlock all tasks : we need not hold the lock while unlocking all these tasks */
 			/* unlock all tasks : we need not hold the lock while unlocking all these tasks */
@@ -496,8 +492,6 @@ void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
 		}
 		}
 	}
 	}
 
 
-	STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
-
 	if (do_submit_tasks)
 	if (do_submit_tasks)
 	{
 	{
 		struct _starpu_task_wrapper_list *link = post_sync_tasks;
 		struct _starpu_task_wrapper_list *link = post_sync_tasks;

+ 6 - 1
src/datawizard/memalloc.c

@@ -926,6 +926,7 @@ unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle_t handle, unsi
 	return handle->per_node[memory_node].allocated;
 	return handle->per_node[memory_node].allocated;
 }
 }
 
 
+/* Record that this memchunk has been recently used */
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node)
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node)
 {
 {
 	_starpu_spin_lock(&lru_rwlock[node]);
 	_starpu_spin_lock(&lru_rwlock[node]);
@@ -935,10 +936,11 @@ void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node)
 	_starpu_spin_unlock(&lru_rwlock[node]);
 	_starpu_spin_unlock(&lru_rwlock[node]);
 }
 }
 
 
+/* Push the given memchunk, recently used, at the end of the chunks to be evicted */
 /* The mc_rwlock[node] rw-lock should be taken prior to calling this function.*/
 /* The mc_rwlock[node] rw-lock should be taken prior to calling this function.*/
 static void _starpu_memchunk_recently_used_move(struct _starpu_mem_chunk *mc, unsigned node)
 static void _starpu_memchunk_recently_used_move(struct _starpu_mem_chunk *mc, unsigned node)
 {
 {
-	/* XXX Sometimes the memchunk is not in the list... */
+	/* Note: Sometimes the memchunk is not in the list... */
 	struct _starpu_mem_chunk *mc_iter;
 	struct _starpu_mem_chunk *mc_iter;
 	for (mc_iter = _starpu_mem_chunk_list_begin(mc_list[node]);
 	for (mc_iter = _starpu_mem_chunk_list_begin(mc_list[node]);
 	     mc_iter != _starpu_mem_chunk_list_end(mc_list[node]);
 	     mc_iter != _starpu_mem_chunk_list_end(mc_list[node]);
@@ -954,6 +956,9 @@ static void _starpu_memchunk_recently_used_move(struct _starpu_mem_chunk *mc, un
 	}
 	}
 }
 }
 
 
+/* Put the recently used memchunks at the end of the mc_list, in the same order
+ * as the LRU list, so that the most recently used memchunk eventually comes
+ * last in the mc_list */
 static void starpu_lru(unsigned node)
 static void starpu_lru(unsigned node)
 {
 {
 	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);

+ 4 - 1
src/datawizard/user_interactions.c

@@ -75,7 +75,8 @@ static void _starpu_data_acquire_fetch_data_callback(void *arg)
 	 * We enqueue the "post" sync task in the list associated to the handle
 	 * We enqueue the "post" sync task in the list associated to the handle
 	 * so that it is submitted by the starpu_data_release
 	 * so that it is submitted by the starpu_data_release
 	 * function. */
 	 * function. */
-	_starpu_add_post_sync_tasks(wrapper->post_sync_task, handle);
+	if (wrapper->post_sync_task)
+		_starpu_add_post_sync_tasks(wrapper->post_sync_task, handle);
 
 
 	wrapper->callback(wrapper->callback_arg);
 	wrapper->callback(wrapper->callback_arg);
 
 
@@ -132,6 +133,8 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node,
 	STARPU_PTHREAD_COND_INIT(&wrapper->cond, NULL);
 	STARPU_PTHREAD_COND_INIT(&wrapper->cond, NULL);
 	STARPU_PTHREAD_MUTEX_INIT(&wrapper->lock, NULL);
 	STARPU_PTHREAD_MUTEX_INIT(&wrapper->lock, NULL);
 	wrapper->finished = 0;
 	wrapper->finished = 0;
+	wrapper->pre_sync_task = NULL;
+	wrapper->post_sync_task = NULL;
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	int sequential_consistency = handle->sequential_consistency;
 	int sequential_consistency = handle->sequential_consistency;

+ 4 - 4
src/debug/traces/starpu_paje.c

@@ -160,7 +160,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	poti_DefineEntityValue("Fi", "S", "FetchingInput", "1.0 .1 1.0");
 	poti_DefineEntityValue("Fi", "S", "FetchingInput", "1.0 .1 1.0");
 	poti_DefineEntityValue("Po", "S", "PushingOutput", "0.1 1.0 1.0");
 	poti_DefineEntityValue("Po", "S", "PushingOutput", "0.1 1.0 1.0");
 	poti_DefineEntityValue("C", "S", "Callback", ".0 .3 .8");
 	poti_DefineEntityValue("C", "S", "Callback", ".0 .3 .8");
-	poti_DefineEntityValue("B", "S", "Blocked", ".9 .1 .0");
+	poti_DefineEntityValue("B", "S", "Overhead", ".5 .18 .0");
 	poti_DefineEntityValue("Sl", "S", "Sleeping", ".9 .1 .0");
 	poti_DefineEntityValue("Sl", "S", "Sleeping", ".9 .1 .0");
 	poti_DefineEntityValue("P", "S", "Progressing", ".4 .1 .6");
 	poti_DefineEntityValue("P", "S", "Progressing", ".4 .1 .6");
 
 
@@ -187,7 +187,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 		poti_DefineEntityValue("Fi", ctx, "FetchingInput", "1.0 .1 1.0");
 		poti_DefineEntityValue("Fi", ctx, "FetchingInput", "1.0 .1 1.0");
 		poti_DefineEntityValue("Po", ctx, "PushingOutput", "0.1 1.0 1.0");
 		poti_DefineEntityValue("Po", ctx, "PushingOutput", "0.1 1.0 1.0");
 		poti_DefineEntityValue("C", ctx, "Callback", ".0 .3 .8");
 		poti_DefineEntityValue("C", ctx, "Callback", ".0 .3 .8");
-		poti_DefineEntityValue("B", ctx, "Blocked", ".9 .1 .0");
+		poti_DefineEntityValue("B", ctx, "Overhead", ".5 .18 .0");
 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
 		poti_DefineEntityValue("P", ctx, "Progressing", ".4 .1 .6");
 		poti_DefineEntityValue("P", ctx, "Progressing", ".4 .1 .6");
 	}
 	}
@@ -226,7 +226,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 6       Fi       S      FetchingInput       \"1.0 .1 1.0\"            \n\
 6       Fi       S      FetchingInput       \"1.0 .1 1.0\"            \n\
 6       Po       S      PushingOutput       \"0.1 1.0 1.0\"            \n\
 6       Po       S      PushingOutput       \"0.1 1.0 1.0\"            \n\
 6       C       S       Callback       \".0 .3 .8\"            \n\
 6       C       S       Callback       \".0 .3 .8\"            \n\
-6       B       S       Blocked         \".9 .1 .0\"		\n\
+6       B       S       Overhead         \".5 .18 .0\"		\n\
 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
 6       P       S       Progressing         \".4 .1 .6\"		\n");
 6       P       S       Progressing         \".4 .1 .6\"		\n");
 	fprintf(file, "\
 	fprintf(file, "\
@@ -245,7 +245,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 6       Fi       Ctx%u      FetchingInput       \"1.0 .1 1.0\"            \n\
 6       Fi       Ctx%u      FetchingInput       \"1.0 .1 1.0\"            \n\
 6       Po       Ctx%u      PushingOutput       \"0.1 1.0 1.0\"            \n\
 6       Po       Ctx%u      PushingOutput       \"0.1 1.0 1.0\"            \n\
 6       C       Ctx%u       Callback       \".0 .3 .8\"            \n\
 6       C       Ctx%u       Callback       \".0 .3 .8\"            \n\
-6       B       Ctx%u       Blocked         \".9 .1 .0\"		\n\
+6       B       Ctx%u       Overhead         \".5 .18 .0\"		\n\
 6       Sl       Ctx%u      Sleeping         \".9 .1 .0\"		\n\
 6       Sl       Ctx%u      Sleeping         \".9 .1 .0\"		\n\
 6       P       Ctx%u       Progressing         \".4 .1 .6\"		\n",
 6       P       Ctx%u       Progressing         \".4 .1 .6\"		\n",
 		i, i, i, i, i, i, i, i);
 		i, i, i, i, i, i, i, i);

+ 2 - 2
tests/disk/disk_compute.c

@@ -23,7 +23,7 @@
 #include <stdio.h>
 #include <stdio.h>
 #include <math.h>
 #include <math.h>
 
 
-#define NX (30*1000000/sizeof(int))
+#define NX (30*1000000)
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
@@ -66,7 +66,7 @@ int main(int argc, char **argv)
 
 
 	/* And now, you want to use your data in StarPU */
 	/* And now, you want to use your data in StarPU */
 	/* Open the file ON the disk */
 	/* Open the file ON the disk */
-	void * data = starpu_disk_open(dd, (void *) "STARPU_DISK_COMPUTE_DATA", NX*sizeof(int));
+	void * data = starpu_disk_open(dd, (void *) "STARPU_DISK_COMPUTE_DATA", NX);
 
 
 	starpu_data_handle_t vector_handleA, vector_handleB;
 	starpu_data_handle_t vector_handleA, vector_handleB;