|
@@ -21,6 +21,10 @@
|
|
|
#include <starpu_mpi_cache.h>
|
|
|
#include <starpu_mpi_cache_stats.h>
|
|
|
#include <starpu_mpi_private.h>
|
|
|
+#include <mpi_failure_tolerance/starpu_mpi_ft_stats.h>
|
|
|
+
|
|
|
+#define STARPU_CACHE_IN_CACHE (1U<<0U) /* Flag mask, bit 0. NOTE(review): the visible hunks use the .in_cache field rather than this mask -- confirm it is still needed. */
|
|
|
+#define STARPU_CACHE_FT_INDUCED_IN_CACHE (1U<<1U) /* Flag mask, bit 1; unsigned literals so both masks share the same type. NOTE(review): the visible hunks use the .ft_induced_cache field instead -- confirm. */
|
|
|
|
|
|
/* Whether we are allowed to keep copies of remote data. */
|
|
|
struct _starpu_data_entry
|
|
@@ -129,11 +133,13 @@ void _starpu_mpi_cache_data_init(starpu_data_handle_t data_handle)
|
|
|
return;
|
|
|
|
|
|
STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
|
|
|
- mpi_data->cache_received = 0;
|
|
|
+ mpi_data->cache_received.in_cache = 0;
|
|
|
+ mpi_data->cache_received.ft_induced_cache = 0;
|
|
|
_STARPU_MALLOC(mpi_data->cache_sent, _starpu_cache_comm_size*sizeof(mpi_data->cache_sent[0]));
|
|
|
for(i=0 ; i<_starpu_cache_comm_size ; i++)
|
|
|
{
|
|
|
- mpi_data->cache_sent[i] = 0;
|
|
|
+ mpi_data->cache_sent[i].in_cache = 0;
|
|
|
+ mpi_data->cache_sent[i].ft_induced_cache = 0;
|
|
|
}
|
|
|
STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
|
|
|
}
|
|
@@ -184,13 +190,14 @@ void starpu_mpi_cached_receive_clear(starpu_data_handle_t data_handle)
|
|
|
STARPU_ASSERT(mpi_data->magic == 42);
|
|
|
STARPU_MPI_ASSERT_MSG(mpi_rank < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", mpi_rank, _starpu_cache_comm_size);
|
|
|
|
|
|
- if (mpi_data->cache_received == 1)
|
|
|
+ if (mpi_data->cache_received.in_cache == 1)
|
|
|
{
|
|
|
#ifdef STARPU_DEVEL
|
|
|
# warning TODO: Somebody else will write to the data, so discard our cached copy if any. starpu_mpi could just remember itself.
|
|
|
#endif
|
|
|
_STARPU_MPI_DEBUG(2, "Clearing receive cache for data %p\n", data_handle);
|
|
|
- mpi_data->cache_received = 0;
|
|
|
+ mpi_data->cache_received.in_cache = 0;
|
|
|
+ mpi_data->cache_received.ft_induced_cache = 0;
|
|
|
starpu_data_invalidate_submit(data_handle);
|
|
|
_starpu_mpi_cache_data_remove_nolock(data_handle);
|
|
|
_starpu_mpi_cache_stats_dec(mpi_rank, data_handle);
|
|
@@ -210,16 +217,22 @@ int starpu_mpi_cached_receive_set(starpu_data_handle_t data_handle)
|
|
|
STARPU_ASSERT(mpi_data->magic == 42);
|
|
|
STARPU_MPI_ASSERT_MSG(mpi_rank < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", mpi_rank, _starpu_cache_comm_size);
|
|
|
|
|
|
- int already_received = mpi_data->cache_received;
|
|
|
+ int already_received = mpi_data->cache_received.in_cache;
|
|
|
if (already_received == 0)
|
|
|
{
|
|
|
_STARPU_MPI_DEBUG(2, "Noting that data %p has already been received by %d\n", data_handle, mpi_rank);
|
|
|
- mpi_data->cache_received = 1;
|
|
|
+ mpi_data->cache_received.in_cache = 1;
|
|
|
_starpu_mpi_cache_data_add_nolock(data_handle);
|
|
|
_starpu_mpi_cache_stats_inc(mpi_rank, data_handle);
|
|
|
}
|
|
|
else
|
|
|
{
|
|
|
+ if (mpi_data->cache_received.ft_induced_cache == 1)
|
|
|
+ {
|
|
|
+ _STARPU_MPI_FT_STATS_RECV_CACHED_CP_DATA(starpu_data_get_size(data_handle));
|
|
|
+ _STARPU_MPI_FT_STATS_CANCEL_RECV_CP_DATA(starpu_data_get_size(data_handle));
|
|
|
+ mpi_data->cache_received.ft_induced_cache = 0;
|
|
|
+ }
|
|
|
_STARPU_MPI_DEBUG(2, "Do not receive data %p from node %d as it is already available\n", data_handle, mpi_rank);
|
|
|
}
|
|
|
STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
|
|
@@ -236,7 +249,7 @@ int starpu_mpi_cached_receive(starpu_data_handle_t data_handle)
|
|
|
|
|
|
STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
|
|
|
STARPU_ASSERT(mpi_data->magic == 42);
|
|
|
- already_received = mpi_data->cache_received;
|
|
|
+ already_received = mpi_data->cache_received.in_cache;
|
|
|
STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
|
|
|
return already_received;
|
|
|
}
|
|
@@ -256,10 +269,11 @@ void starpu_mpi_cached_send_clear(starpu_data_handle_t data_handle)
|
|
|
starpu_mpi_comm_size(mpi_data->node_tag.node.comm, &size);
|
|
|
for(n=0 ; n<size ; n++)
|
|
|
{
|
|
|
- if (mpi_data->cache_sent[n] == 1)
|
|
|
+ if (mpi_data->cache_sent[n].in_cache == 1)
|
|
|
{
|
|
|
_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data_handle);
|
|
|
- mpi_data->cache_sent[n] = 0;
|
|
|
+ mpi_data->cache_sent[n].in_cache = 0;
|
|
|
+ mpi_data->cache_sent[n].ft_induced_cache = 0;
|
|
|
_starpu_mpi_cache_data_remove_nolock(data_handle);
|
|
|
}
|
|
|
}
|
|
@@ -276,15 +290,21 @@ int starpu_mpi_cached_send_set(starpu_data_handle_t data_handle, int dest)
|
|
|
STARPU_MPI_ASSERT_MSG(dest < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", dest, _starpu_cache_comm_size);
|
|
|
|
|
|
STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
|
|
|
- int already_sent = mpi_data->cache_sent[dest];
|
|
|
- if (mpi_data->cache_sent[dest] == 0)
|
|
|
+ int already_sent = mpi_data->cache_sent[dest].in_cache;
|
|
|
+ if (mpi_data->cache_sent[dest].in_cache == 0)
|
|
|
{
|
|
|
- mpi_data->cache_sent[dest] = 1;
|
|
|
+ mpi_data->cache_sent[dest].in_cache = 1;
|
|
|
_starpu_mpi_cache_data_add_nolock(data_handle);
|
|
|
_STARPU_MPI_DEBUG(2, "Noting that data %p has already been sent to %d\n", data_handle, dest);
|
|
|
}
|
|
|
else
|
|
|
{
|
|
|
+ if (mpi_data->cache_sent[dest].ft_induced_cache == 1)
|
|
|
+ {
|
|
|
+ _STARPU_MPI_FT_STATS_SEND_CACHED_CP_DATA(starpu_data_get_size(data_handle));
|
|
|
+ _STARPU_MPI_FT_STATS_CANCEL_SEND_CP_DATA(starpu_data_get_size(data_handle));
|
|
|
+ mpi_data->cache_sent[dest].ft_induced_cache = 0;
|
|
|
+ }
|
|
|
_STARPU_MPI_DEBUG(2, "Do not send data %p to node %d as it has already been sent\n", data_handle, dest);
|
|
|
}
|
|
|
STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
|
|
@@ -301,7 +321,7 @@ int starpu_mpi_cached_send(starpu_data_handle_t data_handle, int dest)
|
|
|
|
|
|
STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
|
|
|
STARPU_MPI_ASSERT_MSG(dest < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", dest, _starpu_cache_comm_size);
|
|
|
- already_sent = mpi_data->cache_sent[dest];
|
|
|
+ already_sent = mpi_data->cache_sent[dest].in_cache;
|
|
|
STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
|
|
|
return already_sent;
|
|
|
}
|
|
@@ -317,19 +337,21 @@ static void _starpu_mpi_cache_flush_nolock(starpu_data_handle_t data_handle)
|
|
|
starpu_mpi_comm_size(mpi_data->node_tag.node.comm, &nb_nodes);
|
|
|
for(i=0 ; i<nb_nodes ; i++)
|
|
|
{
|
|
|
- if (mpi_data->cache_sent[i] == 1)
|
|
|
+ if (mpi_data->cache_sent[i].in_cache == 1)
|
|
|
{
|
|
|
_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data_handle);
|
|
|
- mpi_data->cache_sent[i] = 0;
|
|
|
+ mpi_data->cache_sent[i].in_cache = 0;
|
|
|
+ mpi_data->cache_sent[i].ft_induced_cache = 0;
|
|
|
_starpu_mpi_cache_stats_dec(i, data_handle);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- if (mpi_data->cache_received == 1)
|
|
|
+ if (mpi_data->cache_received.in_cache == 1)
|
|
|
{
|
|
|
int mpi_rank = starpu_mpi_data_get_rank(data_handle);
|
|
|
_STARPU_MPI_DEBUG(2, "Clearing received cache for data %p\n", data_handle);
|
|
|
- mpi_data->cache_received = 0;
|
|
|
+ mpi_data->cache_received.in_cache = 0;
|
|
|
+ mpi_data->cache_received.ft_induced_cache = 0;
|
|
|
_starpu_mpi_cache_stats_dec(mpi_rank, data_handle);
|
|
|
}
|
|
|
}
|