
* Add starpu_task_wait_for_n_submitted() and
STARPU_LIMIT_MAX_NSUBMITTED_TASKS/STARPU_LIMIT_MIN_NSUBMITTED_TASKS to
easily control the number of submitted tasks by making task submission
block.

Samuel Thibault, 10 years ago
commit 51d9fdc477

+ 11 - 0
ChangeLog

@@ -156,6 +156,17 @@ Small changes:
   * Rename function starpu_trace_user_event() as
     starpu_fxt_trace_user_event()
 
+StarPU 1.1.5 (svn revision xxx)
+==============================================
+The scheduling context release
+
+  * Add starpu_memory_pin and starpu_memory_unpin to pin memory allocated
+    another way than starpu_malloc.
+  * Add starpu_task_wait_for_n_submitted() and
+    STARPU_LIMIT_MAX_NSUBMITTED_TASKS/STARPU_LIMIT_MIN_NSUBMITTED_TASKS to
+    easily control the number of submitted tasks by making task submission
+    block.
+
 StarPU 1.1.4 (svn revision 14856)
 ==============================================
 The scheduling context release

+ 24 - 0
doc/doxygen/chapters/05check_list_performance.doxy

@@ -141,6 +141,30 @@ execution. For example, in the Cholesky factorization (dense linear algebra
 application), the GEMM task uses up to 3 buffers, so it is possible to set the
 maximum number of task buffers to 3 to run a Cholesky factorization on StarPU.
 
+\section HowtoReuseMemory How to reuse memory
+
+When your application needs to allocate more data than the amount of memory
+usable by StarPU (given by \ref starpu_memory_get_available() ), the
+allocation cache system can reuse data buffers that were used by previously
+executed tasks. For this system to work with MPI tasks, you need to submit
+tasks progressively instead of as soon as possible, because for MPI receives
+the allocation cache checks for reusable data buffers at submission time, not
+at execution time.
+
+You have two options to control the task submission flow. The first one is to
+control the number of submitted tasks during the whole execution. This can be
+done either by setting the environment variables \ref
+STARPU_LIMIT_MAX_NSUBMITTED_TASKS and \ref STARPU_LIMIT_MIN_NSUBMITTED_TASKS to
+tell StarPU when to stop submitting tasks and when to wake up and submit tasks
+again, or by explicitly calling \ref starpu_task_wait_for_n_submitted() in
+your application code for finer-grain control (for example, between two
+iterations of a submission loop).
+
+The second option is to control the memory size of the allocation cache. This
+can be done in the application by jointly using \ref
+starpu_memory_get_available() and \ref starpu_memory_wait_available() to submit
+tasks only when there is enough memory space to allocate the data needed by the
+task, i.e. when enough buffers are available for reuse in the allocation cache.
+
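The two options above can be sketched as follows. This is an illustrative fragment, not part of this patch: it assumes an initialized StarPU application with a codelet `cl`, registered data `handles`, a hypothetical per-task memory footprint `task_size`, and a hypothetical helper `submit_one_task()`; the thresholds 10000/5000 are arbitrary.

```c
/* Option 1: throttle by number of in-flight tasks. Every 10000
 * submissions, block until only 5000 submitted tasks remain. */
unsigned i;
int ret;
for (i = 0; i < ntasks; i++)
{
	struct starpu_task *task = starpu_task_create();
	task->cl = &cl;
	task->handles[0] = handles[i];
	ret = starpu_task_submit(task);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
	if (i % 10000 == 9999)
		starpu_task_wait_for_n_submitted(5000);
}

/* Option 2: throttle by available memory. Before each submission, wait
 * until task_size bytes can be allocated (or reused from the cache). */
for (i = 0; i < ntasks; i++)
{
	if (starpu_memory_get_available(STARPU_MAIN_RAM) < (starpu_ssize_t) task_size)
		starpu_memory_wait_available(STARPU_MAIN_RAM, task_size);
	submit_one_task(i); /* hypothetical helper wrapping starpu_task_submit() */
}
starpu_task_wait_for_all();
```

Option 1 is simpler when tasks have a uniform footprint; option 2 adapts to tasks of varying sizes at the price of tracking memory explicitly.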
 \section PerformanceModelCalibration Performance Model Calibration
 
 Most schedulers are based on an estimation of codelet duration on each kind

+ 22 - 0
doc/doxygen/chapters/40environment_variables.doxy

@@ -665,6 +665,28 @@ This specifies the size to be used by StarPU to push data when the main
 memory is getting full. The default is unlimited.
 </dd>
 
+<dt>STARPU_LIMIT_MAX_NSUBMITTED_TASKS</dt>
+<dd>
+\anchor STARPU_LIMIT_MAX_NSUBMITTED_TASKS
+\addindex __env__STARPU_LIMIT_MAX_NSUBMITTED_TASKS    
+This variable allows the user to control the task submission flow by specifying
+to StarPU the maximum number of submitted tasks allowed at a given time, i.e.
+when this limit is reached, task submission becomes blocking until enough tasks
+have completed, as specified by \ref STARPU_LIMIT_MIN_NSUBMITTED_TASKS.
+Setting it enables allocation cache buffer reuse in main memory.
+</dd>
+
+<dt>STARPU_LIMIT_MIN_NSUBMITTED_TASKS</dt>
+<dd>
+\anchor STARPU_LIMIT_MIN_NSUBMITTED_TASKS
+\addindex __env__STARPU_LIMIT_MIN_NSUBMITTED_TASKS    
+This variable allows the user to control the task submission flow by specifying
+to StarPU the number of remaining submitted tasks below which task submission is
+unblocked again. This variable has to be used in conjunction with \ref
+STARPU_LIMIT_MAX_NSUBMITTED_TASKS, which puts the task submission thread to
+sleep. Setting it enables allocation cache buffer reuse in main memory.
+</dd>
+
 <dt>STARPU_TRACE_BUFFER_SIZE</dt>
 <dd>
 \anchor STARPU_TRACE_BUFFER_SIZE

+ 11 - 0
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -796,6 +796,17 @@ terminated. It does not destroy these tasks.
 This function waits until all the tasks that were already
 submitted to the context \p sched_ctx_id have been executed
 
+\fn int starpu_task_wait_for_n_submitted(unsigned n)
+\ingroup API_Codelet_And_Tasks
+This function blocks until at most <c>n</c> submitted tasks are left (in the
+current context, or in the global one if there is no current context) to be
+executed. It does not destroy these tasks.
+
+\fn int starpu_task_wait_for_n_submitted_in_ctx(unsigned sched_ctx, unsigned n) 
+\ingroup API_Codelet_And_Tasks
+This function waits until at most <c>n</c> tasks that were submitted to the
+context <c>sched_ctx</c> are left to be executed.
+
 \fn int starpu_task_nready(void)
 \ingroup API_Codelet_And_Tasks
 TODO

+ 2 - 0
include/starpu_task.h

@@ -286,8 +286,10 @@ int starpu_task_finished(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 int starpu_task_wait(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 
 int starpu_task_wait_for_all(void);
+int starpu_task_wait_for_n_submitted(unsigned n);
 
 int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx_id);
+int starpu_task_wait_for_n_submitted_in_ctx(unsigned sched_ctx_id, unsigned n);
 
 int starpu_task_wait_for_no_ready(void);
 

+ 3 - 3
src/common/barrier.h

@@ -21,9 +21,9 @@
 
 struct _starpu_barrier
 {
-	int count;
-	int reached_start;
-	int reached_exit;
+	unsigned count;
+	unsigned reached_start;
+	unsigned reached_exit;
 	double reached_flops;
 	starpu_pthread_mutex_t mutex;
 	starpu_pthread_mutex_t mutex_exit;

+ 13 - 1
src/common/barrier_counter.c

@@ -16,7 +16,7 @@
 
 #include <common/barrier_counter.h>
 
-int _starpu_barrier_counter_init(struct _starpu_barrier_counter *barrier_c, int count)
+int _starpu_barrier_counter_init(struct _starpu_barrier_counter *barrier_c, unsigned count)
 {
 	_starpu_barrier_init(&barrier_c->barrier, count);
 	STARPU_PTHREAD_COND_INIT(&barrier_c->cond2, NULL);
@@ -43,6 +43,18 @@ int _starpu_barrier_counter_wait_for_empty_counter(struct _starpu_barrier_counte
 	return 0;
 }
 
+int _starpu_barrier_counter_wait_until_counter_reaches_n(struct _starpu_barrier_counter *barrier_c, unsigned n)
+{
+	struct _starpu_barrier *barrier = &barrier_c->barrier;
+	STARPU_PTHREAD_MUTEX_LOCK(&barrier->mutex);
+
+	while (barrier->reached_start > n)
+		STARPU_PTHREAD_COND_WAIT(&barrier->cond, &barrier->mutex);
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier->mutex);
+	return 0;
+}
+
 int _starpu_barrier_counter_wait_for_full_counter(struct _starpu_barrier_counter *barrier_c)
 {
 	struct _starpu_barrier *barrier = &barrier_c->barrier;

+ 3 - 1
src/common/barrier_counter.h

@@ -26,12 +26,14 @@ struct _starpu_barrier_counter
 	starpu_pthread_cond_t cond2;
 };
 
-int _starpu_barrier_counter_init(struct _starpu_barrier_counter *barrier_c, int count);
+int _starpu_barrier_counter_init(struct _starpu_barrier_counter *barrier_c, unsigned count);
 
 int _starpu_barrier_counter_destroy(struct _starpu_barrier_counter *barrier_c);
 
 int _starpu_barrier_counter_wait_for_empty_counter(struct _starpu_barrier_counter *barrier_c);
 
+int _starpu_barrier_counter_wait_until_counter_reaches_n(struct _starpu_barrier_counter *barrier_c, unsigned n);
+
 int _starpu_barrier_counter_wait_for_full_counter(struct _starpu_barrier_counter *barrier_c);
 
 int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier_counter *barrier_c, double flops);

+ 9 - 0
src/core/sched_ctx.c

@@ -1176,6 +1176,15 @@ int _starpu_wait_for_all_tasks_of_sched_ctx(unsigned sched_ctx_id)
 	return _starpu_barrier_counter_wait_for_empty_counter(&sched_ctx->tasks_barrier);
 }
 
+int _starpu_wait_for_n_submitted_tasks_of_sched_ctx(unsigned sched_ctx_id, unsigned n)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+
+	STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_task_wait_for_n_submitted must not be called from a task or callback");
+
+	return _starpu_barrier_counter_wait_until_counter_reaches_n(&sched_ctx->tasks_barrier, n);
+}
+
 void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 {
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();

+ 3 - 0
src/core/sched_ctx.h

@@ -182,6 +182,9 @@ void _starpu_delete_all_sched_ctxs();
  * context have been executed. */
 int _starpu_wait_for_all_tasks_of_sched_ctx(unsigned sched_ctx_id);
 
+/* This function waits until at most n tasks are still submitted. */
+int _starpu_wait_for_n_submitted_tasks_of_sched_ctx(unsigned sched_ctx_id, unsigned n);
+
 /* In order to implement starpu_wait_for_all_tasks_of_ctx, we keep track of the number of
  * task currently submitted to the context */
 void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id);

+ 59 - 0
src/core/task.c

@@ -604,6 +604,16 @@ int starpu_task_submit(struct starpu_task *task)
 #endif
 		;
 
+	if (!j->internal)
+	{
+		int limit_min_submitted_tasks = starpu_get_env_number("STARPU_LIMIT_MIN_NSUBMITTED_TASKS");
+		int limit_max_submitted_tasks = starpu_get_env_number("STARPU_LIMIT_MAX_NSUBMITTED_TASKS");
+		int nsubmitted_tasks = starpu_task_nsubmitted();
+		/* Block when more than the maximum number of tasks are in flight,
+		 * and resume submission once only the minimum number remain. */
+		if (limit_max_submitted_tasks >= 0 && limit_max_submitted_tasks < nsubmitted_tasks
+			&& limit_min_submitted_tasks >= 0 && limit_min_submitted_tasks < nsubmitted_tasks)
+			starpu_task_wait_for_n_submitted(limit_min_submitted_tasks);
+	}
+
 
 	ret = _starpu_task_submit_head(task);
 	if (ret)
@@ -834,6 +844,55 @@ int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx)
 #endif
 	return 0;
 }
+
+/*
+ * We wait until at most a certain number of the tasks that have already been
+ * submitted are left. Note that a regenerable task is not considered finished
+ * until it is explicitly set as non-regenerable again (e.g. from a callback).
+ */
+int starpu_task_wait_for_n_submitted(unsigned n)
+{
+	unsigned nsched_ctxs = _starpu_get_nsched_ctxs();
+	unsigned sched_ctx_id = nsched_ctxs == 1 ? 0 : starpu_sched_ctx_get_context();
+
+	/* if there is no indication about which context to wait,
+	   we wait for all tasks submitted to starpu */
+	if (sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
+	{
+		_STARPU_DEBUG("Waiting for all tasks\n");
+		STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_task_wait_for_n_submitted must not be called from a task or callback");
+
+		struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+		if(config->topology.nsched_ctxs == 1)
+			_starpu_wait_for_n_submitted_tasks_of_sched_ctx(0, n);
+		else
+		{
+			int s;
+			for(s = 0; s < STARPU_NMAX_SCHED_CTXS; s++)
+			{
+				if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+				{
+					_starpu_wait_for_n_submitted_tasks_of_sched_ctx(config->sched_ctxs[s].id, n);
+				}
+			}
+		}
+
+		return 0;
+	}
+	else
+	{
+		_STARPU_DEBUG("Waiting for tasks submitted to context %u\n", sched_ctx_id);
+		_starpu_wait_for_n_submitted_tasks_of_sched_ctx(sched_ctx_id, n);
+	}
+	return 0;
+}
+
+int starpu_task_wait_for_n_submitted_in_ctx(unsigned sched_ctx, unsigned n)
+{
+	_starpu_wait_for_n_submitted_tasks_of_sched_ctx(sched_ctx, n);
+
+	return 0;
+}
 /*
  * We wait until there is no ready task any more (i.e. StarPU will not be able
  * to progress any more).