@@ -1207,82 +1207,58 @@ void starpu_omp_taskgroup(void (*f)(void *arg), void *arg)
 	task->task_group = p_previous_task_group;
 }
 
-void starpu_omp_for(void (*f)(unsigned long _first_i, unsigned long _nb_i, void *arg), void *arg, unsigned long nb_iterations, unsigned long chunk, int schedule, int ordered, int nowait)
+static inline void _starpu_omp_for_loop(struct starpu_omp_region *parallel_region, struct starpu_omp_task *task,
+		struct starpu_omp_loop *loop, int first_call,
+		unsigned long nb_iterations, unsigned long chunk, int schedule, unsigned long *_first_i, unsigned long *_nb_i)
 {
-	struct starpu_omp_task *task = STARPU_PTHREAD_GETSPECIFIC(omp_task_key);
-	struct starpu_omp_region *parallel_region = task->owner_region;
-	struct starpu_omp_loop *loop;
-	if (ordered)
-		/* TODO: implement ordered statement */
-		_STARPU_ERROR("omp for / ordered not implemented\n");
-	_starpu_spin_lock(&parallel_region->lock);
-	loop = parallel_region->loop_list;
-	while (loop && loop->id != task->loop_id)
-	{
-		loop = loop->next_loop;
-	}
-	if (!loop)
-	{
-		loop = malloc(sizeof(*loop));
-		if (loop == NULL)
-			_STARPU_ERROR("memory allocation failed\n");
-		loop->id = task->loop_id;
-		loop->next_iteration = 0;
-		loop->nb_completed_threads = 0;
-		loop->next_loop = parallel_region->loop_list;
-		parallel_region->loop_list = loop;
-	}
-	_starpu_spin_unlock(&parallel_region->lock);
-
+	*_nb_i = 0;
 	if (schedule == starpu_omp_schedule_static || schedule == starpu_omp_schedule_auto)
 	{
 		if (chunk > 0)
 		{
-			const unsigned long stride = parallel_region->nb_threads * chunk;
-			unsigned long nb_strides = 0;
-			unsigned long first_i = task->rank * chunk;
-			unsigned long nb_i;
-			while (first_i < nb_iterations)
+			if (first_call)
 			{
-				if (first_i + chunk <= nb_iterations)
+				*_first_i = task->rank * chunk;
+			}
+			else
+			{
+				*_first_i += parallel_region->nb_threads * chunk;
+			}
+
+			if (*_first_i < nb_iterations)
+			{
+				if (*_first_i + chunk > nb_iterations)
 				{
-					nb_i = chunk;
+					*_nb_i = nb_iterations - *_first_i;
 				}
 				else
 				{
-					nb_i = nb_iterations - first_i;
+					*_nb_i = chunk;
 				}
-				f(first_i, nb_i, arg);
-				nb_strides++;
-				first_i = stride * nb_strides;
 			}
 		}
 		else
 		{
-			unsigned long nb_i = nb_iterations / parallel_region->nb_threads;
-			unsigned long first_i = (unsigned)task->rank * nb_i;
-			unsigned long remainder = nb_iterations % parallel_region->nb_threads;
-
-			if (nb_iterations % parallel_region->nb_threads > 0)
+			if (first_call)
 			{
-				if ((unsigned)task->rank < remainder)
-				{
-					nb_i++;
-					first_i += (unsigned)task->rank;
-				}
-				else
+				*_nb_i = nb_iterations / parallel_region->nb_threads;
+				*_first_i = (unsigned)task->rank * (*_nb_i);
+				unsigned long remainder = nb_iterations % parallel_region->nb_threads;
+
+				if (remainder > 0)
 				{
-					first_i += remainder;
+					if ((unsigned)task->rank < remainder)
+					{
+						(*_nb_i)++;
+						*_first_i += (unsigned)task->rank;
+					}
+					else
+					{
+						*_first_i += remainder;
+					}
 				}
 			}
-			if (nb_i > 0)
-			{
-				f(first_i, nb_i, arg);
-			}
 		}
-
-		/* re-acquire lock before performing loop completion */
-		_starpu_spin_lock(&parallel_region->lock);
 	}
 	else if (schedule == starpu_omp_schedule_dynamic)
 	{
@@ -1290,29 +1266,25 @@ void starpu_omp_for(void (*f)(unsigned long _first_i, unsigned long _nb_i, void
 		{
 			chunk = 1;
 		}
-		for (;;)
+		if (first_call)
+		{
+			*_first_i = 0;
+		}
+		_starpu_spin_lock(&parallel_region->lock);
+		if (loop->next_iteration < nb_iterations)
 		{
-			unsigned long first_i;
-			unsigned long nb_i;
-
-			_starpu_spin_lock(&parallel_region->lock);
-			/* upon exiting the loop, the parallel_region-lock will already be held
-			 * for performing loop completion */
-			if (loop->next_iteration >= nb_iterations)
-				break;
-			first_i = loop->next_iteration;
-			if (first_i + chunk > nb_iterations)
+			*_first_i = loop->next_iteration;
+			if (*_first_i + chunk > nb_iterations)
 			{
-				nb_i = nb_iterations - first_i;
+				*_nb_i = nb_iterations - *_first_i;
 			}
 			else
 			{
-				nb_i = chunk;
+				*_nb_i = chunk;
 			}
-			loop->next_iteration += nb_i;
-			_starpu_spin_unlock(&parallel_region->lock);
-			f(first_i, nb_i, arg);
+			loop->next_iteration += *_nb_i;
 		}
+		_starpu_spin_unlock(&parallel_region->lock);
 	}
 	else if (schedule == starpu_omp_schedule_guided)
 	{
@@ -1320,35 +1292,63 @@ void starpu_omp_for(void (*f)(unsigned long _first_i, unsigned long _nb_i, void
 		{
 			chunk = 1;
 		}
-		for (;;)
+		if (first_call)
 		{
-			unsigned long first_i;
-			unsigned long nb_i;
-
-			_starpu_spin_lock(&parallel_region->lock);
-			/* upon exiting the loop, the parallel_region-lock will already be held
-			 * for performing loop completion */
-			if (loop->next_iteration >= nb_iterations)
-				break;
-			first_i = loop->next_iteration;
-			nb_i = (nb_iterations - first_i)/parallel_region->nb_threads;
-			if (nb_i < chunk)
+			*_first_i = 0;
+		}
+		_starpu_spin_lock(&parallel_region->lock);
+		if (loop->next_iteration < nb_iterations)
+		{
+			*_first_i = loop->next_iteration;
+			*_nb_i = (nb_iterations - *_first_i)/parallel_region->nb_threads;
+			if (*_nb_i < chunk)
 			{
-				if (first_i+chunk > nb_iterations)
+				if (*_first_i+chunk > nb_iterations)
 				{
-					nb_i = nb_iterations - first_i;
+					*_nb_i = nb_iterations - *_first_i;
 				}
 				else
 				{
-					nb_i = chunk;
+					*_nb_i = chunk;
 				}
 			}
-			loop->next_iteration += nb_i;
-			_starpu_spin_unlock(&parallel_region->lock);
-			f(first_i, nb_i, arg);
+			loop->next_iteration += *_nb_i;
 		}
+		_starpu_spin_unlock(&parallel_region->lock);
 	}
+}
 
+static inline struct starpu_omp_loop *_starpu_omp_for_loop_begin(struct starpu_omp_region *parallel_region, struct starpu_omp_task *task,
+		int ordered)
+{
+	struct starpu_omp_loop *loop;
+	if (ordered)
+		/* TODO: implement ordered statement */
+		_STARPU_ERROR("omp for / ordered not implemented\n");
+	_starpu_spin_lock(&parallel_region->lock);
+	loop = parallel_region->loop_list;
+	while (loop && loop->id != task->loop_id)
+	{
+		loop = loop->next_loop;
+	}
+	if (!loop)
+	{
+		loop = malloc(sizeof(*loop));
+		if (loop == NULL)
+			_STARPU_ERROR("memory allocation failed\n");
+		loop->id = task->loop_id;
+		loop->next_iteration = 0;
+		loop->nb_completed_threads = 0;
+		loop->next_loop = parallel_region->loop_list;
+		parallel_region->loop_list = loop;
+	}
+	_starpu_spin_unlock(&parallel_region->lock);
+	return loop;
+}
+static inline void _starpu_omp_for_loop_end(struct starpu_omp_region *parallel_region, struct starpu_omp_task *task,
+		struct starpu_omp_loop *loop)
+{
+	_starpu_spin_lock(&parallel_region->lock);
 	loop->nb_completed_threads++;
 	if (loop->nb_completed_threads == parallel_region->nb_threads)
 	{
@@ -1363,12 +1363,53 @@ void starpu_omp_for(void (*f)(unsigned long _first_i, unsigned long _nb_i, void
 		free(loop);
 	}
 	_starpu_spin_unlock(&parallel_region->lock);
+	task->loop_id++;
+}
 
+int starpu_omp_for_inline_first(unsigned long nb_iterations, unsigned long chunk, int schedule, int ordered, unsigned long *_first_i, unsigned long *_nb_i)
+{
+	struct starpu_omp_task *task = STARPU_PTHREAD_GETSPECIFIC(omp_task_key);
+	struct starpu_omp_region *parallel_region = task->owner_region;
+	struct starpu_omp_loop *loop = _starpu_omp_for_loop_begin(parallel_region, task, ordered);
+
+	_starpu_omp_for_loop(parallel_region, task, loop, 1, nb_iterations, chunk, schedule, _first_i, _nb_i);
+	if (*_nb_i == 0)
+	{
+		_starpu_omp_for_loop_end(parallel_region, task, loop);
+	}
+	return (*_nb_i != 0);
+}
+
+int starpu_omp_for_inline_next(unsigned long nb_iterations, unsigned long chunk, int schedule, int ordered, unsigned long *_first_i, unsigned long *_nb_i)
+{
+	struct starpu_omp_task *task = STARPU_PTHREAD_GETSPECIFIC(omp_task_key);
+	struct starpu_omp_region *parallel_region = task->owner_region;
+	struct starpu_omp_loop *loop = _starpu_omp_for_loop_begin(parallel_region, task, ordered);
+
+	_starpu_omp_for_loop(parallel_region, task, loop, 0, nb_iterations, chunk, schedule, _first_i, _nb_i);
+	if (*_nb_i == 0)
+	{
+		_starpu_omp_for_loop_end(parallel_region, task, loop);
+	}
+	return (*_nb_i != 0);
+}
+
+void starpu_omp_for(void (*f)(unsigned long _first_i, unsigned long _nb_i, void *arg), void *arg, unsigned long nb_iterations, unsigned long chunk, int schedule, int ordered, int nowait)
+{
+	unsigned long _first_i = 0;
+	unsigned long _nb_i = 0;
+	if (starpu_omp_for_inline_first(nb_iterations, chunk, schedule, ordered, &_first_i, &_nb_i))
+	{
+		do
+		{
+			f(_first_i, _nb_i, arg);
+		}
+		while (starpu_omp_for_inline_next(nb_iterations, chunk, schedule, ordered, &_first_i, &_nb_i));
+	}
 	if (!nowait)
 	{
 		starpu_omp_barrier();
 	}
-	task->loop_id++;
 }
 
 /*
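
Usage note (not part of the patch): the refactoring splits the old monolithic starpu_omp_for() into _starpu_omp_for_loop_begin() / _starpu_omp_for_loop() / _starpu_omp_for_loop_end() and exposes the pair starpu_omp_for_inline_first() / starpu_omp_for_inline_next(), so a caller (typically compiler-generated code) can run the loop body inline instead of going through the f() callback; starpu_omp_for() itself is now just a driver around that pair. Below is a minimal sketch of the intended calling pattern, mirroring the new starpu_omp_for() body. It assumes it runs on a thread of an active parallel region (the inline variants fetch the implicit task through omp_task_key), and do_iteration() is a hypothetical placeholder for per-iteration work, not part of the patch:

	unsigned long first_i = 0, nb_i = 0;
	/* claim a first chunk [first_i, first_i + nb_i) of a 1000-iteration
	 * loop: dynamic schedule, chunk size 4, ordered == 0 */
	if (starpu_omp_for_inline_first(1000, 4, starpu_omp_schedule_dynamic, 0, &first_i, &nb_i))
	{
		do
		{
			unsigned long i;
			for (i = first_i; i < first_i + nb_i; i++)
			{
				do_iteration(i); /* hypothetical loop body */
			}
			/* keep claiming chunks until the iteration space is exhausted */
		}
		while (starpu_omp_for_inline_next(1000, 4, starpu_omp_schedule_dynamic, 0, &first_i, &nb_i));
	}
	/* the inline variants have nowait semantics; an explicit barrier
	 * reproduces what starpu_omp_for() does when nowait == 0 */
	starpu_omp_barrier();

One consequence of the design: a thread only runs loop completion (_starpu_omp_for_loop_end) when its call returns 0, so every thread of the region must keep calling starpu_omp_for_inline_next() until it returns 0; otherwise loop->nb_completed_threads never reaches nb_threads, the loop record is never freed, and that thread's task->loop_id is never advanced.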