
Enable multiple perfmodels in mpi/simgrid-smpi

Lucas Leandro Nesi 5 years ago
parent
commit
c27d4285de

+ 10 - 4
doc/doxygen/chapters/470_simgrid.doxy

@@ -132,7 +132,10 @@ machine (the <c>$STARPU_HOME/.starpu</c> directory). One can then perform the
 Simulation step on the desktop machine, by setting the environment
 variable \ref STARPU_HOSTNAME to the name of the actual machine, to
 make StarPU use the performance models of the simulated machine even
-on the desktop machine.
+on the desktop machine. To use a different performance model on each rank,
+as needed for SMPI executions on a heterogeneous platform, one can pass the
+<c>-hostfile-platform</c> option to <c>starpu_smpirun</c>, which sets
+\ref STARPU_MPI_HOSTNAMES to the hostnames listed in the hostfile, as in
+the example below.
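+
+For instance (hostnames illustrative), given a hostfile containing
+
+\verbatim
+name0
+name1
+\endverbatim
+
+<c>starpu_smpirun -hostfile-platform</c> will set
+<c>STARPU_MPI_HOSTNAMES="name0 name1"</c>, so that rank 0 uses the
+performance models of name0 and rank 1 those of name1.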
 
 If the desktop machine does not have CUDA or OpenCL, StarPU is still able to
 use SimGrid to simulate execution with CUDA/OpenCL devices, but the application
@@ -171,9 +174,12 @@ $ STARPU_SCHED=dmda starpu_smpirun -platform cluster.xml -hostfile hostfile ./mp
 \endverbatim
 
 Where \c cluster.xml is a SimGrid-MPI platform description, and \c hostfile the
-list of MPI nodes to be used. StarPU currently only supports homogeneous MPI
-clusters: for each MPI node it will just replicate the architecture referred by
-\ref STARPU_HOSTNAME.
+list of MPI nodes to be used. On a homogeneous MPI cluster, StarPU simply
+replicates, for each MPI node, the architecture referred to by
+\ref STARPU_HOSTNAME. To use a different performance model on each rank, as
+needed on a heterogeneous platform, one can pass the
+<c>-hostfile-platform</c> option to <c>starpu_smpirun</c>, which sets
+\ref STARPU_MPI_HOSTNAMES to the hostnames listed in the hostfile:
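+
+\verbatim
+$ STARPU_SCHED=dmda starpu_smpirun -platform cluster.xml -hostfile hostfile -hostfile-platform ./mpi/tests/pingpong
+\endverbatim
+
+This assumes that calibrated performance models and bus files are available
+in <c>$STARPU_PERF_MODEL_DIR</c> for every host listed in \c hostfile.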
 
 \section SimulationDebuggingApplications Debugging Applications
 

+ 14 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -866,6 +866,20 @@ a homogeneous cluster, it is possible to share the models between
 machines by setting <c>export STARPU_HOSTNAME=some_global_name</c>.
 </dd>
 
+<dt>STARPU_MPI_HOSTNAMES</dt>
+<dd>
+\anchor STARPU_MPI_HOSTNAMES
+\addindex __env__STARPU_MPI_HOSTNAMES
+Similar to \ref STARPU_HOSTNAME, but used to define multiple nodes of a
+heterogeneous cluster. The variable is a space-separated list of hostnames
+that are assigned to the StarPU-MPI ranks by position, according to the
+value of \ref starpu_mpi_world_rank on each rank. For example, on a
+heterogeneous cluster, it is possible to select an individual model for each
+machine by setting <c>export STARPU_MPI_HOSTNAMES="name0 name1 name2"</c>:
+rank 0 will use name0, rank 1 will use name1, and so on.
+This variable takes precedence over \ref STARPU_HOSTNAME.
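+For instance, a two-rank run could look as follows (the MPI launcher and
+application name are illustrative):
+\verbatim
+$ export STARPU_MPI_HOSTNAMES="name0 name1"
+$ mpirun -np 2 ./mpi_app
+\endverbatim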
+</dd>
+
 <dt>STARPU_OPENCL_PROGRAM_DIR</dt>
 <dd>
 \anchor STARPU_OPENCL_PROGRAM_DIR

+ 30 - 1
src/common/utils.c

@@ -516,10 +516,39 @@ char *_starpu_get_home_path(void)
 	return path;
 }
 
+/* Weak fallback used when StarPU-MPI is not linked in: such a process can
+ * only be rank 0. The real definition in StarPU-MPI overrides this one. */
+#pragma weak starpu_mpi_world_rank
+int starpu_mpi_world_rank(void)
+{
+	_STARPU_DISP("StarPU-MPI unavailable, the rank of this process is 0");
+	return 0;
+}
+
 void _starpu_gethostname(char *hostname, size_t size)
 {
+	char *force_mpi_hostnames = starpu_getenv("STARPU_MPI_HOSTNAMES");
 	char *forced_hostname = starpu_getenv("STARPU_HOSTNAME");
-	if (forced_hostname && forced_hostname[0])
+
+	if (force_mpi_hostnames && force_mpi_hostnames[0])
+	{
+		char *host, *srv_hosts, *srv_hosts_free, *rsrv;
+		/* Work on a copy: strtok_r modifies the string it parses */
+		srv_hosts = srv_hosts_free = strdup(force_mpi_hostnames);
+		int rank = starpu_mpi_world_rank();
+		/* Advance to the hostname entry for this rank */
+		host = strtok_r(srv_hosts, " ", &rsrv);
+		while (rank-- > 0 && (host = strtok_r(NULL, " ", &rsrv)));
+		if (rank >= 0 || host == NULL)
+		{
+			_STARPU_MSG("Missing hostnames in STARPU_MPI_HOSTNAMES\n");
+			STARPU_ABORT();
+		}
+		snprintf(hostname, size-1, "%s", host);
+		hostname[size-1] = 0;
+		free(srv_hosts_free);
+	}
+	else if (forced_hostname && forced_hostname[0])
 	{
 		snprintf(hostname, size-1, "%s", forced_hostname);
 		hostname[size-1] = 0;

+ 28 - 3
tools/starpu_smpirun.in

@@ -62,6 +62,7 @@ MPI_PLATFORM=""
 MPI_HOSTFILE=""
 NP=""
 GDB=""
+HOSTFILE_PLATFORM_DETECT=""
 while true; do
 	case "$1" in
 		"-platform")
@@ -84,6 +85,10 @@ while true; do
 			NP=$2
 			shift 2
 			;;
+		"-hostfile-platform")
+			HOSTFILE_PLATFORM_DETECT=1
+			shift 1
+			;;
 		"-gdb")
 			GDB="-gdb"
 			shift 1
@@ -114,6 +119,12 @@ then
 	exit 1
 fi
 
+if [ -n "$HOSTFILE_PLATFORM_DETECT" ]
+then
+	# Pass the hostfile entries on to StarPU so each rank uses its own model
+	HOSTS=$(grep -v "^$" "$MPI_HOSTFILE")
+	export STARPU_MPI_HOSTNAMES=$(echo $HOSTS | tr -d '\011\012\015')
+fi
+
 (
 	cat << EOF
 <?xml version='1.0'?>
@@ -122,9 +133,23 @@ fi
 <AS id="ASroot" routing="None">
 EOF
 	tail -n +3 $MPI_PLATFORM | grep -v '<platform' | grep -v '</platform'
-	for i in $(seq 0 $((NP - 1))) ; do
-		xsltproc --novalid --stringparam ASname StarPU-MPI$i $STARPU_XSLTDIR/starpu_smpi.xslt $NODE_PLATFORM | grep -v network/ | tail -n +4 | head -n -1
-	done
+	if [ -n "$HOSTFILE_PLATFORM_DETECT" ]
+	then
+		i=0
+		for h in $HOSTS ; do
+			NODE_PLATFORM=$STARPU_PERF_MODEL_DIR/bus/${h}.platform$VF.xml
+			if [ ! -f "$NODE_PLATFORM" ]; then
+				echo "File $NODE_PLATFORM does not exist, but ${h} is in the hostfile."
+				exit 1
+			fi
+			xsltproc --novalid --stringparam ASname StarPU-MPI$i $STARPU_XSLTDIR/starpu_smpi.xslt $NODE_PLATFORM | grep -v network/ | tail -n +4 | head -n -1
+			i=$((i + 1))
+		done
+	else
+		for i in $(seq 0 $((NP - 1))) ; do
+			xsltproc --novalid --stringparam ASname StarPU-MPI$i $STARPU_XSLTDIR/starpu_smpi.xslt $NODE_PLATFORM | grep -v network/ | tail -n +4 | head -n -1
+		done
+	fi
 	cat << \EOF
 </AS>
 </platform>