浏览代码

Doc & more dev

Romain LION 5 年之前
父节点
当前提交
cbd84349e8
共有 4 个文件被更改,包括 121 次插入27 次删除
  1. 18 2
      mpi/include/starpu_mpi_ft.h
  2. 57 19
      mpi/src/starpu_mpi_checkpoint.c
  3. 20 1
      mpi/src/starpu_mpi_checkpoint.h
  4. 26 5
      mpi/tests/checkpoints.c

+ 18 - 2
mpi/include/starpu_mpi_ft.h

@@ -20,7 +20,23 @@
 struct _starpu_mpi_checkpoint_template;
 typedef struct _starpu_mpi_checkpoint_template* starpu_mpi_checkpoint_template;
 
-int starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template* cp_template, ...);
-int starpu_mpi_checkpoint_template_print(starpu_mpi_checkpoint_template cp_template);
+
+/**
+ * Registers a checkpoint template \p cp_template with the given arguments.
+ * It is then ready to use with ::starpu_mpi_checkpoint_template_submit during the program execution.
+ * The caller must provide a unique checkpoint id \p cp_id so that the template
+ * can be matched with the corresponding ::starpu_mpi_init_from_checkpoint.
+ *
+ * The arguments following \p cp_template and \p cp_id can be of the following types:
+ * <ul>
+ * <li> ::STARPU_R followed by a pointer to a data handle and a backup rank;
+ * <li> ::STARPU_DATA_ARRAY followed by an array of data handles,
+ * its number of elements and a backup rank;
+ * <li> ::STARPU_VALUE followed by a pointer to the unregistered value,
+ * its size in bytes and a backup rank.
+ * <li> The argument list must be terminated by the value 0.
+ * </ul>
+ */
+int starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template* cp_template, int cp_id, ...);
 
 #endif //FT_STARPU_STARPU_MPI_FT_H

+ 57 - 19
mpi/src/starpu_mpi_checkpoint.c

@@ -16,18 +16,24 @@
 
 
 #include <stdarg.h>
+#include <stdlib.h>
 #include <common/utils.h>
 
 #include <starpu_mpi_checkpoint.h>
+#include <sys/param.h>
 
-int _starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template*
-cp_template, va_list varg_list)
+#define MAX_CP_TEMPLATE_NUMBER 32 /* capacity of cp_template_array */
+
+starpu_pthread_mutex_t cp_template_mutex; /* initialized in _starpu_mpi_checkpoint_turn_on() */
+starpu_mpi_checkpoint_template cp_template_array[MAX_CP_TEMPLATE_NUMBER]; /* NOTE(review): not read or written anywhere in the hunks shown here */
+int cp_template_number = 0; /* presumably the number of used slots in cp_template_array -- TODO confirm */
+
+int _starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template* cp_template, int cp_id, va_list varg_list)
 {
 	int i = 0;
 	int arg_type;
 
-	starpu_mpi_checkpoint_template _cp_template;
-	_STARPU_MALLOC(_cp_template, sizeof(struct _starpu_mpi_checkpoint_template));
+	starpu_mpi_checkpoint_template _cp_template = _starpu_mpi_checkpoint_create();
 
 	va_list varg_list_copy;
 	va_copy(varg_list_copy, varg_list);
@@ -46,20 +52,23 @@ cp_template, va_list varg_list)
 		}
 		else if (arg_type==STARPU_R)
 		{
-			_cp_template->items[i].type = STARPU_R;
-			_cp_template->items[i].ptr = va_arg(varg_list_copy, void*);
+			_cp_template->items[i].type        = STARPU_R;
+			_cp_template->items[i].ptr         = va_arg(varg_list_copy, void*);
 			_cp_template->items[i].backup_rank = va_arg(varg_list_copy, int);
 		}
 		else if (arg_type==STARPU_VALUE)
 		{
-			_cp_template->items[i].type = STARPU_VALUE;
-			_cp_template->items[i].ptr = va_arg(varg_list_copy,void*);
-			_cp_template->items[i].count = va_arg(varg_list_copy, int);
+			_cp_template->items[i].type        = STARPU_VALUE;
+			_cp_template->items[i].ptr         = va_arg(varg_list_copy,void*);
+			_cp_template->items[i].count       = va_arg(varg_list_copy, int);
 			_cp_template->items[i].backup_rank = va_arg(varg_list_copy, int);
 		}
 		else if (arg_type==STARPU_DATA_ARRAY)
 		{
-
+			_cp_template->items[i].type        = STARPU_DATA_ARRAY;
+			_cp_template->items[i].ptr         = va_arg(varg_list_copy,void*);
+			_cp_template->items[i].count       = va_arg(varg_list_copy, int);
+			_cp_template->items[i].backup_rank = va_arg(varg_list_copy, int);
 		}
 		else
 		{
@@ -71,33 +80,62 @@ cp_template, va_list varg_list)
 	va_end(varg_list_copy);
 
 	_cp_template->size = i;
-	_cp_template->checkpoint_id = 50909;
+	starpu_sem_init(&_cp_template->completion_sem, 0, _cp_template->size-1);
+	_cp_template->cp_template_id = cp_id;
+
+
 
 	*cp_template = _cp_template;
 
 	return 0;
 }
 
-int starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template* cp_template, ...) {
+int starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template* cp_template, int cp_id, ...) /* Public variadic entry point: forwards cp_template, cp_id and the variadic arguments to _starpu_mpi_checkpoint_template_register() and returns its result. */
+{
 	va_list varg_list;
-	va_start(varg_list, cp_template);
-	int ret = _starpu_mpi_checkpoint_template_register(cp_template, varg_list);
+	va_start(varg_list, cp_id); /* cp_id is the last named parameter, so the variadic list starts right after it */
+	int ret = _starpu_mpi_checkpoint_template_register(cp_template, cp_id, varg_list);
 	va_end(varg_list);
 	return ret;
 }
 
-int starpu_mpi_checkpoint_template_print(starpu_mpi_checkpoint_template cp_template) {
+// For testing purposes only: dumps each item of cp_template (item index on stderr, values on stdout or stderr depending on the item type) and always returns 0.
+int _starpu_mpi_checkpoint_template_print(starpu_mpi_checkpoint_template cp_template)
+{
 	int val;
-	for (int i=0 ; i< cp_template->size ; i++) {
+	for (int i=0 ; i< cp_template->size ; i++)
+	{
 		fprintf(stderr,"Item %2d: ", i); /* NOTE(review): prefix goes to stderr while the STARPU_VALUE / STARPU_R values below go to stdout via printf */
-		if (cp_template->items[i].type == STARPU_VALUE) {
+		if (cp_template->items[i].type == STARPU_VALUE)
+		{
 			printf("STARPU_VALUE - Value=%d\n", (*(int *)(cp_template->items[i].ptr))); /* ptr is read as an int, whatever size was registered in count */
-		} else if (cp_template->items[i].type == STARPU_R) {
+		}
+		else if (cp_template->items[i].type == STARPU_R)
+		{
 			val = *(int*)starpu_data_handle_to_pointer(*(starpu_data_handle_t*)(cp_template->items[i].ptr), 0); /* ptr is treated as a pointer to a data handle; the returned buffer is read as an int */
 			printf("STARPU_R - Value=%d\n", val);
-		} else {
+
+		}
+		else if (cp_template->items[i].type == STARPU_DATA_ARRAY)
+		{
+			fprintf(stderr, "STARPU_DATA_ARRAY - Multiple values: %d", *(int*)starpu_data_handle_to_pointer(*((starpu_data_handle_t*)cp_template->items[i].ptr), 0)); /* NOTE(review): element 0 is read unconditionally, even if count is 0 */
+
+			for (int j=1 ; j<MIN(cp_template->items[i].count, 5) ; j++) /* together with element 0 above, at most the first 5 handles are shown */
+			{
+				fprintf(stderr, ", %d", *(int*)starpu_data_handle_to_pointer(((starpu_data_handle_t*)cp_template->items[i].ptr)[j], 0)); //j*sizeof(starpu_data_handle_t)
+			}
+			fprintf(stderr, "...\n"); /* the ellipsis is printed even when every element was shown */
+		}
+		else
+		{
 			printf("Unrecognized type.\n");
 		}
 	}
 	return 0;
+}
+
+int _starpu_mpi_checkpoint_turn_on(void) /* Initializes the file-scope cp_template_mutex; always returns 0. */
+{
+	starpu_pthread_mutex_init(&cp_template_mutex, NULL); /* NULL attribute argument; the return value of the init call is not checked */
+	return 0;
 }

+ 20 - 1
mpi/src/starpu_mpi_checkpoint.h

@@ -21,6 +21,8 @@
 
 #define CHECKPOINT_STRUCTURE_MAX_SIZE 32
 
+
+// TODO: turn the template into an unbounded linked list
 struct _starpu_mpi_checkpoint_template_item{
     int type;
     void* ptr;
@@ -31,8 +33,25 @@ struct _starpu_mpi_checkpoint_template_item{
 struct _starpu_mpi_checkpoint_template{
     struct _starpu_mpi_checkpoint_template_item items[CHECKPOINT_STRUCTURE_MAX_SIZE]; /* fixed-capacity item storage */
     int size; /* number of items, set at the end of _starpu_mpi_checkpoint_template_register() */
-    int checkpoint_id;
+    int cp_template_id; /* the cp_id value passed at registration */
+    int pending; /* set to 0 by _starpu_mpi_checkpoint_create() */
+    starpu_sem_t completion_sem; /* initialized at registration via starpu_sem_init(..., 0, size-1) */
 };
 
+starpu_mpi_checkpoint_template _starpu_mpi_checkpoint_create(void) /* Allocates a checkpoint template and returns it with pending set to 0; no other field is initialized here. NOTE(review): this is a non-static function definition inside a header, which may cause duplicate-symbol link errors if the header is included by several translation units -- confirm. */
+{
+	starpu_mpi_checkpoint_template _cp_template;
+	_STARPU_MALLOC(_cp_template, sizeof(struct _starpu_mpi_checkpoint_template));
+	assert(_cp_template!=NULL);
+	_cp_template->pending = 0;
+	//starpu_sem_init(&_cp_template->completion_sem, 0, 0);
+	return _cp_template; /* ownership passes to the caller; no matching free is visible in these hunks */
+}
+
+int _starpu_mpi_checkpoint_turn_on(void);
+
+// For testing purposes only
+int _starpu_mpi_checkpoint_template_print(starpu_mpi_checkpoint_template cp_template);
+
 
 #endif //FT_STARPU_STARPU_MPI_CHECKPOINT_H

+ 26 - 5
mpi/tests/checkpoints.c

@@ -17,20 +17,36 @@
 #include <starpu_mpi.h>
 #include "helper.h"
 
+#include <starpu_mpi_checkpoint.h>
+
+#define ARRAY_SIZE 12
+
 int main(int argc, char* argv[])
 {
-    starpu_data_handle_t h;
+	starpu_data_handle_t h;
+	starpu_data_handle_t h_array[ARRAY_SIZE];
     starpu_mpi_checkpoint_template cp_template;
     int val = 42;
     int val2 = 1234;
+    int array[ARRAY_SIZE];
     int ret;
     struct starpu_conf conf;
 
+    //init array
+    for (int i=0 ; i<ARRAY_SIZE ; i++)
+    {
+    	array[i] = i*1111+42;
+    }
+
+	for (int i=0 ; i<ARRAY_SIZE ; i++)
+	{
+		h_array[i] = NULL;
+	}
+
     starpu_conf_init(&conf);
     conf.nmic = 0;
     conf.nmpi_ms = 0;
 
-    FPRINTF(stderr, "Go\n");
     ret = starpu_init(&conf);
     if (STARPU_UNLIKELY(ret == -ENODEV))
     {
@@ -45,14 +61,19 @@ int main(int argc, char* argv[])
         return 77;
     }
 
-    FPRINTF(stderr, "init\n");
     starpu_variable_data_register(&h, STARPU_MAIN_RAM, (uintptr_t)&val2, sizeof(int));
-    FPRINTF(stderr, "registered\n");
+
+    starpu_vector_data_register(h_array, STARPU_MAIN_RAM, (uintptr_t)array, ARRAY_SIZE, sizeof(int));
+    for (int i=0 ; i<ARRAY_SIZE ; i++) {
+    	starpu_variable_data_register(&h_array[i], STARPU_MAIN_RAM, (uintptr_t)&array[i], sizeof(int));
+    }
+
     starpu_mpi_checkpoint_template_register(&cp_template,
            STARPU_VALUE, &val, sizeof(int), 1,
+           STARPU_DATA_ARRAY, h_array, ARRAY_SIZE, 1,
            STARPU_R, &h, 1,
            0);
-    starpu_mpi_checkpoint_template_print(cp_template);
+    _starpu_mpi_checkpoint_template_print(cp_template);
     return 0;
 }