Bläddra i källkod

Detect MIC program termination while trying to connect to it

Samuel Thibault 9 år sedan
förälder
incheckning
eb81ce18d9

+ 3 - 3
src/core/topology.c

@@ -795,7 +795,7 @@ _starpu_init_mic_config (struct _starpu_machine_config *config,
 
 #ifdef STARPU_USE_MIC
 static COIENGINE mic_handles[STARPU_MAXMICDEVS];
-static COIPROCESS mic_process[STARPU_MAXMICDEVS];
+COIPROCESS _starpu_mic_process[STARPU_MAXMICDEVS];
 #endif
 
 static void
@@ -845,7 +845,7 @@ _starpu_init_mp_config (struct _starpu_machine_config *config,
 	topology->nmicdevices = 0;
 	unsigned i;
 	for (i = 0; i < (unsigned) reqmicdevices; i++)
-		if (0 == _starpu_init_mic_node (config, i, &mic_handles[i], &mic_process[i]))
+		if (0 == _starpu_init_mic_node (config, i, &mic_handles[i], &_starpu_mic_process[i]))
 			topology->nmicdevices++;
 
 
@@ -859,7 +859,7 @@ _starpu_deinit_mic_node (unsigned mic_idx)
 {
 	_starpu_mp_common_send_command(mic_nodes[mic_idx], STARPU_EXIT, NULL, 0);
 
-	COIProcessDestroy(mic_process[mic_idx], -1, 0, NULL, NULL);
+	COIProcessDestroy(_starpu_mic_process[mic_idx], -1, 0, NULL, NULL);
 
 	_starpu_mp_common_node_destroy(mic_nodes[mic_idx]);
 }

+ 11 - 1
src/drivers/mic/driver_mic_common.c

@@ -79,7 +79,7 @@ void _starpu_mic_common_dt_recv(const struct _starpu_mp_node *mp_node, void *msg
 		STARPU_MP_COMMON_REPORT_ERROR(mp_node, errno);
 }
 
-void _starpu_mic_common_connect(scif_epd_t *endpoint, uint16_t remote_node,
+void _starpu_mic_common_connect(scif_epd_t *endpoint, uint16_t remote_node, COIPROCESS process,
 				uint16_t local_port_number, uint16_t remote_port_number)
 {
 	/* Endpoint only useful for the initialization of the connection */
@@ -97,6 +97,16 @@ void _starpu_mic_common_connect(scif_epd_t *endpoint, uint16_t remote_node,
 	_STARPU_DEBUG("Connecting to MIC %d on %d:%d...\n", remote_node, local_port_number, remote_port_number);
 	while (scif_connect(*endpoint, &portID) == -1)
 	{
+		if (process)
+		{
+			const char *main_name = "main";
+			COIFUNCTION func;
+			COIRESULT res;
+			/* Check whether it's still alive */
+			res = COIProcessGetFunctionHandles(process, 1, &main_name, &func);
+			STARPU_ASSERT_MSG(res != COI_PROCESS_DIED, "process died on MIC %d", remote_node-1);
+			STARPU_ASSERT_MSG(res == COI_SUCCESS, "MIC process died? (error %d)", res);
+		}
 		if (errno != ECONNREFUSED)
 			STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
 	}

+ 2 - 1
src/drivers/mic/driver_mic_common.h

@@ -19,6 +19,7 @@
 #define __DRIVER_MIC_COMMON_H__
 
 
+#include <source/COIProcess_source.h>
 #include <common/config.h>
 
 
@@ -64,7 +65,7 @@ void _starpu_mic_common_dt_send(const struct _starpu_mp_node *node, void *msg, i
 
 void _starpu_mic_common_dt_recv(const struct _starpu_mp_node *node, void *msg, int len);
 
-void _starpu_mic_common_connect(scif_epd_t *endpoint, uint16_t remote_node, 
+void _starpu_mic_common_connect(scif_epd_t *endpoint, uint16_t remote_node, COIPROCESS process,
 				uint16_t local_port_number, uint16_t remote_port_number);
 void _starpu_mic_common_accept(scif_epd_t *endpoint, uint16_t port_number);
 

+ 1 - 0
src/drivers/mic/driver_mic_sink.c

@@ -58,6 +58,7 @@ void _starpu_mic_sink_init(struct _starpu_mp_node *node)
 	//for (i = 0; i < (unsigned int)node->devid; ++i)
 	//	_starpu_mic_common_connect(&node->sink_sink_dt_connections[i].mic_endpoint,
 	//								STARPU_TO_MIC_ID(i),
+	//								NULL,
 	//								STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(node->devid, i),	
 	//								STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(i, node->devid));
 

+ 4 - 0
src/drivers/mic/driver_mic_source.c

@@ -318,14 +318,18 @@ void(* _starpu_mic_src_get_kernel_from_job(const struct _starpu_mp_node *node ST
  */
 void _starpu_mic_src_init(struct _starpu_mp_node *node)
 {
+	extern COIPROCESS _starpu_mic_process[STARPU_MAXMICDEVS];
+
 	/* Let's initialize the connection with the peered sink device */
 	_starpu_mic_common_connect(&node->mp_connection.mic_endpoint,
 					    STARPU_TO_MIC_ID(node->peer_id),
+					    _starpu_mic_process[node->peer_id],
 					    STARPU_MIC_SINK_PORT_NUMBER(node->peer_id),
 					    STARPU_MIC_SOURCE_PORT_NUMBER);
 
 	_starpu_mic_common_connect(&node->host_sink_dt_connection.mic_endpoint,
 				   STARPU_TO_MIC_ID(node->peer_id),
+				   _starpu_mic_process[node->peer_id],
 				   STARPU_MIC_SINK_DT_PORT_NUMBER(node->peer_id),
 				   STARPU_MIC_SOURCE_DT_PORT_NUMBER);
 }