瀏覽代碼

Display hostname in CPU binding errors

This makes it possible to identify which node is faulty when running with MPI.
Philippe SWARTVAGHER 5 年之前
父節點
當前提交
00dd382d25
共有 3 個文件被更改,包括 13 次插入和 6 次删除
  1. 3 1
      mpi/src/mpi/starpu_mpi_mpi.c
  2. 3 1
      mpi/src/nmad/starpu_mpi_nmad.c
  3. 7 4
      src/core/topology.c

+ 3 - 1
mpi/src/mpi/starpu_mpi_mpi.c

@@ -1133,7 +1133,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
 	{
-		_STARPU_DISP("No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n");
+		char hostname[65];
+		_starpu_gethostname(hostname, sizeof(hostname));
+		_STARPU_DISP("[%s] No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n", hostname);
 	}
 	_starpu_mpi_do_initialize(argc_argv);
 	if (_starpu_mpi_thread_cpuid >= 0)

+ 3 - 1
mpi/src/nmad/starpu_mpi_nmad.c

@@ -414,7 +414,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, 0, "MPI") < 0)
 	{
-		_STARPU_DISP("No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n");
+		char hostname[65];
+		_starpu_gethostname(hostname, sizeof(hostname));
+		_STARPU_DISP("[%s] No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n", hostname);
 	}
 	_starpu_mpi_do_initialize(argc_argv);
 	if (_starpu_mpi_thread_cpuid >= 0)

+ 7 - 4
src/core/topology.c

@@ -1924,12 +1924,15 @@ int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid S
 			 (previous >= 0 && previous == workerid) ||
 			 (name && cpu_name[cpuid] && !strcmp(name, cpu_name[cpuid])) ) )
 		{
+			char hostname[65];
+			_starpu_gethostname(hostname, sizeof(hostname));
+
 			if (previous == STARPU_ACTIVETHREAD)
-				_STARPU_DISP("Warning: active thread %s was already bound to PU %d\n", cpu_name[cpuid], cpuid);
+				_STARPU_DISP("[%s] Warning: active thread %s was already bound to PU %d\n", hostname, cpu_name[cpuid], cpuid);
 			else if (previous == STARPU_NONACTIVETHREAD)
-				_STARPU_DISP("Warning: non-active thread %s was already bound to PU %d\n", cpu_name[cpuid], cpuid);
+				_STARPU_DISP("[%s] Warning: non-active thread %s was already bound to PU %d\n", hostname, cpu_name[cpuid], cpuid);
 			else
-				_STARPU_DISP("Warning: worker %d was already bound to PU %d\n", previous, cpuid);
+				_STARPU_DISP("[%s] Warning: worker %d was already bound to PU %d\n", hostname, previous, cpuid);
 
 			if (workerid == STARPU_ACTIVETHREAD)
 				_STARPU_DISP("and we were told to also bind active thread %s to it.\n", name);
@@ -1942,7 +1945,7 @@ int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid S
 
 			if (workerid >= 0)
 				/* This shouldn't happen for workers */
-				_STARPU_DISP("Maybe check starpu_machine_display's output to determine what wrong binding happened. Hwloc reported %d cores and %d threads, perhaps there is misdetection between hwloc, the kernel and the BIOS, or an administrative allocation issue from e.g. the job scheduler?\n", config->topology.nhwcpus, config->topology.nhwpus);
+				_STARPU_DISP("[%s] Maybe check starpu_machine_display's output to determine what wrong binding happened. Hwloc reported %d cores and %d threads, perhaps there is misdetection between hwloc, the kernel and the BIOS, or an administrative allocation issue from e.g. the job scheduler?\n", hostname, config->topology.nhwcpus, config->topology.nhwpus);
 			ret = -1;
 		}
 		else