Преглед изворни кода

Enable multiple perfmodels in mpi/simgrid-smpi

Lucas Leandro Nesi пре 5 година
родитељ
комит
c27d4285de

+ 10 - 4
doc/doxygen/chapters/470_simgrid.doxy

@@ -132,7 +132,10 @@ machine (the <c>$STARPU_HOME/.starpu</c> directory). One can then perform the
 Simulation step on the desktop machine, by setting the environment
 Simulation step on the desktop machine, by setting the environment
 variable \ref STARPU_HOSTNAME to the name of the actual machine, to
 variable \ref STARPU_HOSTNAME to the name of the actual machine, to
 make StarPU use the performance models of the simulated machine even
 make StarPU use the performance models of the simulated machine even
-on the desktop machine.
+on the desktop machine. To use different performance models on different ranks,
+as in SMPI executions on a heterogeneous platform, pass the option
+<c>-hostfile-platform</c> to <c>starpu_smpirun</c>, which will set
+\ref STARPU_MPI_HOSTNAMES to the hostnames listed in your hostfile.
 
 
 If the desktop machine does not have CUDA or OpenCL, StarPU is still able to
 If the desktop machine does not have CUDA or OpenCL, StarPU is still able to
 use SimGrid to simulate execution with CUDA/OpenCL devices, but the application
 use SimGrid to simulate execution with CUDA/OpenCL devices, but the application
@@ -171,9 +174,12 @@ $ STARPU_SCHED=dmda starpu_smpirun -platform cluster.xml -hostfile hostfile ./mp
 \endverbatim
 \endverbatim
 
 
 Where \c cluster.xml is a SimGrid-MPI platform description, and \c hostfile the
 Where \c cluster.xml is a SimGrid-MPI platform description, and \c hostfile the
-list of MPI nodes to be used. StarPU currently only supports homogeneous MPI
-clusters: for each MPI node it will just replicate the architecture referred by
-\ref STARPU_HOSTNAME.
+list of MPI nodes to be used. For a homogeneous MPI cluster, StarPU just
+replicates, for each MPI node, the architecture referred to by
+\ref STARPU_HOSTNAME. To use different performance models on different ranks,
+as on a heterogeneous platform, pass the option
+<c>-hostfile-platform</c> to <c>starpu_smpirun</c>, which will set
+\ref STARPU_MPI_HOSTNAMES to the hostnames listed in your hostfile.
 
 
 \section SimulationDebuggingApplications Debugging Applications
 \section SimulationDebuggingApplications Debugging Applications
 
 

+ 14 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -866,6 +866,20 @@ a homogenenous cluster, it is possible to share the models between
 machines by setting <c>export STARPU_HOSTNAME=some_global_name</c>.
 machines by setting <c>export STARPU_HOSTNAME=some_global_name</c>.
 </dd>
 </dd>
 
 
+<dt>STARPU_MPI_HOSTNAMES</dt>
+<dd>
+\anchor STARPU_MPI_HOSTNAMES
+\addindex __env__STARPU_MPI_HOSTNAMES
+Similar to \ref STARPU_HOSTNAME, but defines the hostnames of multiple nodes on a
+heterogeneous cluster. The variable is a space-separated list of hostnames; each
+StarPU-MPI rank is assigned the hostname whose position in the list matches the
+value returned by \ref starpu_mpi_world_rank on that rank. When running, for example, on a
+heterogeneous cluster, it is possible to set individual models for each machine
+by setting <c>export STARPU_MPI_HOSTNAMES="name0 name1 name2"</c>: rank 0
+will receive name0, rank 1 will receive name1, and so on.
+This variable takes precedence over \ref STARPU_HOSTNAME.
+</dd>
+
 <dt>STARPU_OPENCL_PROGRAM_DIR</dt>
 <dt>STARPU_OPENCL_PROGRAM_DIR</dt>
 <dd>
 <dd>
 \anchor STARPU_OPENCL_PROGRAM_DIR
 \anchor STARPU_OPENCL_PROGRAM_DIR

+ 30 - 1
src/common/utils.c

@@ -516,10 +516,39 @@ char *_starpu_get_home_path(void)
 	return path;
 	return path;
 }
 }
 
 
+#pragma weak starpu_mpi_world_rank
+int starpu_mpi_world_rank()
+{
+	_STARPU_DISP("StarPU-MPI unavailable, the rank of this process is 0");
+	return 0;
+}
+
 void _starpu_gethostname(char *hostname, size_t size)
 void _starpu_gethostname(char *hostname, size_t size)
 {
 {
+	char *force_mpi_hostnames = starpu_getenv("STARPU_MPI_HOSTNAMES");
 	char *forced_hostname = starpu_getenv("STARPU_HOSTNAME");
 	char *forced_hostname = starpu_getenv("STARPU_HOSTNAME");
-	if (forced_hostname && forced_hostname[0])
+
+	if (force_mpi_hostnames && force_mpi_hostnames[0])
+	{
+		char *host, *srv_hosts, *srv_hosts_free, *rsrv;
+		srv_hosts = srv_hosts_free = (char*)malloc(strlen(force_mpi_hostnames)+1);
+		snprintf(srv_hosts, strlen(force_mpi_hostnames)+1, "%s", force_mpi_hostnames);
+		int rank = starpu_mpi_world_rank();
+		if (force_mpi_hostnames != NULL)
+		{
+			host = strtok_r(srv_hosts, " ", &rsrv);
+			while (rank-->0 && (host = strtok_r(NULL, " ", &rsrv)));
+			if(rank>=0)
+			{
+				_STARPU_MSG("Missing hostnames in STARPU_MPI_HOSTNAMES\n");
+				STARPU_ABORT();
+			}
+		}
+		snprintf(hostname, size-1, "%s", host);
+		free(srv_hosts_free);
+		hostname[size-1] = 0;
+	}
+	else if (forced_hostname && forced_hostname[0])
 	{
 	{
 		snprintf(hostname, size-1, "%s", forced_hostname);
 		snprintf(hostname, size-1, "%s", forced_hostname);
 		hostname[size-1] = 0;
 		hostname[size-1] = 0;

+ 28 - 3
tools/starpu_smpirun.in

@@ -62,6 +62,7 @@ MPI_PLATFORM=""
 MPI_HOSTFILE=""
 MPI_HOSTFILE=""
 NP=""
 NP=""
 GDB=""
 GDB=""
+HOSTFILE_PLATFORM_DETECT=""
 while true; do
 while true; do
 	case "$1" in
 	case "$1" in
 		"-platform")
 		"-platform")
@@ -84,6 +85,10 @@ while true; do
 			NP=$2
 			NP=$2
 			shift 2
 			shift 2
 			;;
 			;;
+		"-hostfile-platform")
+			HOSTFILE_PLATFORM_DETECT=1
+			shift 1
+			;;
 		"-gdb")
 		"-gdb")
 			GDB="-gdb"
 			GDB="-gdb"
 			shift 1
 			shift 1
@@ -114,6 +119,12 @@ then
 	exit 1
 	exit 1
 fi
 fi
 
 
+if [ -n "$HOSTFILE_PLATFORM_DETECT" ]
+then
+	HOSTS=$(grep -v "^$" $MPI_HOSTFILE)
+	export STARPU_MPI_HOSTNAMES=$(echo $HOSTS | tr -d '\011\012\015')
+fi
+
 (
 (
 	cat << EOF
 	cat << EOF
 <?xml version='1.0'?>
 <?xml version='1.0'?>
@@ -122,9 +133,23 @@ fi
 <AS id="ASroot" routing="None">
 <AS id="ASroot" routing="None">
 EOF
 EOF
 	tail -n +3 $MPI_PLATFORM | grep -v '<platform' | grep -v '</platform'
 	tail -n +3 $MPI_PLATFORM | grep -v '<platform' | grep -v '</platform'
-	for i in $(seq 0 $((NP - 1))) ; do
-		xsltproc --novalid --stringparam ASname StarPU-MPI$i $STARPU_XSLTDIR/starpu_smpi.xslt $NODE_PLATFORM | grep -v network/ | tail -n +4 | head -n -1
-	done
+	if [ -n "$HOSTFILE_PLATFORM_DETECT" ]
+	then
+		i=0
+		for h in $HOSTS ; do
+			NODE_PLATFORM=$STARPU_PERF_MODEL_DIR/bus/${h}.platform$VF.xml
+			if [ ! -f "$NODE_PLATFORM" ]; then
+				echo File $NODE_PLATFORM do not exist, but ${h} is on hostfile.
+				exit 1
+			fi
+			xsltproc --novalid --stringparam ASname StarPU-MPI$i $STARPU_XSLTDIR/starpu_smpi.xslt $NODE_PLATFORM | grep -v network/ | tail -n +4 | head -n -1
+			i=$(expr $i + 1)
+		done
+	else
+		for i in $(seq 0 $((NP - 1))) ; do
+			xsltproc --novalid --stringparam ASname StarPU-MPI$i $STARPU_XSLTDIR/starpu_smpi.xslt $NODE_PLATFORM | grep -v network/ | tail -n +4 | head -n -1
+		done
+	fi
 	cat << \EOF
 	cat << \EOF
 </AS>
 </AS>
 </platform>
 </platform>