
Fix the case where the number of subcontexts does not divide the number of CPU workers: some CPU workers were not assigned to any subcontext and would end up executing parallel tasks without the prologue being executed first.
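For reference, a minimal, self-contained sketch (plain C, no StarPU calls; the worker count of 5 is an assumed example) of the partitioning scheme this commit adopts: each subcontext receives ncpus/N_NESTED_CTXS CPU workers, and the last subcontext additionally absorbs the remainder, so every CPU worker lands in some subcontext and therefore has the prologue run for it.

#include <stdio.h>

#define N_NESTED_CTXS 2

int main(void)
{
	int ncpus = 5;                                 /* assumed worker count, not divisible by N_NESTED_CTXS */
	int cpus_per_context = ncpus / N_NESTED_CTXS;  /* 2 */
	int i;
	for (i = 0; i < N_NESTED_CTXS; i++)
	{
		int n = cpus_per_context;
		if (i == N_NESTED_CTXS - 1)
			n += ncpus % N_NESTED_CTXS;    /* last context takes the leftover worker(s) */
		printf("context %d: workers [%d..%d]\n",
		       i, i * cpus_per_context, i * cpus_per_context + n - 1);
	}
	return 0;
}

With 5 workers and 2 contexts this prints workers [0..1] for context 0 and [2..4] for context 1, i.e. no worker is left outside a subcontext.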

Olivier Aumage, 8 years ago
parent
commit
a110078dab
1 changed file with 14 additions and 10 deletions:
  1. examples/sched_ctx/parallel_tasks_reuse_handle.c

examples/sched_ctx/parallel_tasks_reuse_handle.c  +14 −10

@@ -17,6 +17,7 @@
 
 #include <starpu.h>
 #include <omp.h>
+#include <pthread.h>
 
 #ifdef STARPU_QUICK_CHECK
 #define NTASKS 64
@@ -28,6 +29,8 @@
 #define LOOPS  10
 #endif
 
+#define N_NESTED_CTXS 2
+
 struct context
 {
 	int ncpus;
@@ -38,6 +41,7 @@ struct context
 /* Helper for the task that will initiate everything */
 void parallel_task_prologue_init_once_and_for_all(void * sched_ctx_)
 {
+	fprintf(stderr, "%p: %s -->\n", (void*)pthread_self(), __func__);
 	int sched_ctx = *(int *)sched_ctx_;
 	int *cpuids = NULL;
 	int ncpuids = 0;
@@ -50,6 +54,7 @@ void parallel_task_prologue_init_once_and_for_all(void * sched_ctx_)
 
 	omp_set_num_threads(ncpuids);
 	free(cpuids);
+	fprintf(stderr, "%p: %s <--\n", (void*)pthread_self(), __func__);
 	return;
 }
 
@@ -101,25 +106,24 @@ void parallel_task_init()
 						  0);
 
 	/* Initialize nested contexts */
-	/* WARNING : the number of contexts must be a divisor of the number of available cpus*/
-
-	contexts = malloc(sizeof(struct context)*2);
-	int cpus_per_context = main_context.ncpus/2;
+	contexts = malloc(sizeof(struct context)*N_NESTED_CTXS);
+	int cpus_per_context = main_context.ncpus/N_NESTED_CTXS;
 	int i;
-	for(i = 0; i < 2; i++)
+	for(i = 0; i < N_NESTED_CTXS; i++)
 	{
-		fprintf(stderr, "ncpus %d for context %d \n",cpus_per_context, i);
 		contexts[i].ncpus = cpus_per_context;
+		if (i == N_NESTED_CTXS-1)
+			contexts[i].ncpus += main_context.ncpus%N_NESTED_CTXS;
 		contexts[i].cpus = main_context.cpus+i*cpus_per_context;
 	}
 
-	for(i = 0; i < 2; i++)
+	for(i = 0; i < N_NESTED_CTXS; i++)
 		contexts[i].id = starpu_sched_ctx_create(contexts[i].cpus,
							 contexts[i].ncpus,"nested_ctx",
							 STARPU_SCHED_CTX_NESTED,main_context.id,
							 0);
 
-	for (i = 0; i < 2; i++)
+	for (i = 0; i < N_NESTED_CTXS; i++)
 	{
 		parallel_task_init_one_context(&contexts[i].id);
 	}
@@ -131,7 +135,7 @@ void parallel_task_init()
 void parallel_task_deinit()
 {
 	int i;
-	for (i=0; i<2;i++)
+	for (i=0; i<N_NESTED_CTXS;i++)
 		starpu_sched_ctx_delete(contexts[i].id);
 	free(contexts);
 	free(main_context.cpus);
@@ -174,7 +178,7 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	if (starpu_cpu_worker_get_count() < 2)
+	if (starpu_cpu_worker_get_count() < N_NESTED_CTXS)
 	{
 		starpu_shutdown();
 		return 77;