瀏覽代碼

perfmodel: pre-allocating 2**ndevices arch_combs may be too big, so we
now allocate only 2 * ndevices and reallocate when necessary.
This means a perfmodel has to store the number of combinations it has
currently allocated.

Nathalie Furmento 11 年之前
父節點
當前提交
8a472c0c88
共有 3 個文件被更改,包括 48 次插入16 次删除
  1. 5 1
      include/starpu_perfmodel.h
  2. 2 0
      src/core/perfmodel/perfmodel.h
  3. 41 15
      src/core/perfmodel/perfmodel_history.c

+ 5 - 1
include/starpu_perfmodel.h

@@ -136,12 +136,16 @@ struct starpu_perfmodel
 
 	const char *symbol;
 
+#ifdef STARPU_DEVEL
+#warning move all the fields in a private structure
+#endif
 	unsigned is_init;
 	unsigned is_loaded;
 	unsigned benchmarking;
 	starpu_pthread_rwlock_t model_rwlock;
 	int *nimpls;
-	int ncombs;
+	int ncombs;  /* The number of combinations currently used by the model */
+	int ncombs_set; /* The number of combinations allocated in the arrays per_arch, nimpls and combs */
 	int *combs;
 };
 

+ 2 - 0
src/core/perfmodel/perfmodel.h

@@ -94,6 +94,8 @@ void _starpu_simgrid_get_platform_path(char *path, size_t maxlen);
 
 struct starpu_perfmodel_arch * _starpu_arch_comb_get(int comb);
 
+void _starpu_perfmodel_realloc(struct starpu_perfmodel *model, int nb);
+
 #ifdef __cplusplus
 }
 #endif

+ 41 - 15
src/core/perfmodel/perfmodel_history.c

@@ -40,7 +40,7 @@
 
 struct starpu_perfmodel_arch **arch_combs;
 int current_arch_comb;
-unsigned nb_arch_combs;
+int nb_arch_combs;
 
 struct starpu_perfmodel_history_table
 {
@@ -58,6 +58,12 @@ static struct _starpu_perfmodel_list *registered_models = NULL;
 
 int starpu_add_arch_comb(int ndevices, struct starpu_perfmodel_device* devices)
 {
+	if (current_arch_comb >= nb_arch_combs)
+	{
+		// We need to allocate more arch_combs
+		nb_arch_combs += 10;
+		arch_combs = (struct starpu_perfmodel_arch**) realloc(arch_combs, nb_arch_combs*sizeof(struct starpu_perfmodel_arch*));
+	}
 	arch_combs[current_arch_comb] = (struct starpu_perfmodel_arch*)malloc(sizeof(struct starpu_perfmodel_arch));
 	arch_combs[current_arch_comb]->devices = (struct starpu_perfmodel_device*)malloc(ndevices*sizeof(struct starpu_perfmodel_device));
 	arch_combs[current_arch_comb]->ndevices = ndevices;
@@ -401,6 +407,7 @@ static enum starpu_worker_archtype _get_enum_type(int type)
 	}
 
 }
+
 static void parse_comb(FILE *f, struct starpu_perfmodel *model, unsigned scan_history, int comb)
 {
 	int ndevices = 0;
@@ -459,19 +466,11 @@ static void parse_model_file(FILE *f, struct starpu_perfmodel *model, unsigned s
 
 	if (ncombs > nb_arch_combs)
 	{
-		int i;
-
-		arch_combs = (struct starpu_perfmodel_arch**) realloc(arch_combs, ncombs*sizeof(struct starpu_perfmodel_arch*));
-		model->per_arch = (struct starpu_perfmodel_per_arch**) realloc(model->per_arch, ncombs*sizeof(struct starpu_perfmodel_per_arch*));
-		model->nimpls = (int *)realloc(model->nimpls, ncombs*sizeof(int));
-		model->combs = (int*)realloc(model->combs, ncombs*sizeof(int));
-
-		for(i = ncombs; i < nb_arch_combs; i++)
-		{
-			model->per_arch[i] = NULL;
-			model->nimpls[i] = 0;
-		}
+		// The model has more combs than the original number of arch_combs, we need to reallocate
 		nb_arch_combs = ncombs;
+		arch_combs = (struct starpu_perfmodel_arch**) realloc(arch_combs, nb_arch_combs*sizeof(struct starpu_perfmodel_arch*));
+
+		_starpu_perfmodel_realloc(model, nb_arch_combs);
 	}
 
 	int comb;
@@ -574,6 +573,22 @@ static void dump_model_file(FILE *f, struct starpu_perfmodel *model)
 	}
 }
 
+/* Grow the per-combination arrays of a performance model.
+ *
+ * model: the model whose per_arch, nimpls and combs arrays are resized.
+ * nb:    the new number of entries; it must be strictly greater than
+ *        model->ncombs_set.
+ *
+ * The newly added per_arch entries are set to NULL and the newly added
+ * nimpls entries to 0; the newly added combs entries are left
+ * uninitialised. On return model->ncombs_set is equal to nb.
+ */
+void _starpu_perfmodel_realloc(struct starpu_perfmodel *model, int nb)
+{
+	struct starpu_perfmodel_per_arch **new_per_arch;
+	int *new_nimpls;
+	int *new_combs;
+	int i;
+
+	STARPU_ASSERT(nb > model->ncombs_set);
+
+	/* realloc() returns NULL on failure: check each result before storing
+	 * it in the model, otherwise the initialisation loop below would
+	 * dereference a NULL pointer. */
+	new_per_arch = (struct starpu_perfmodel_per_arch**) realloc(model->per_arch, nb*sizeof(struct starpu_perfmodel_per_arch*));
+	STARPU_ASSERT(new_per_arch);
+	model->per_arch = new_per_arch;
+
+	new_nimpls = (int *)realloc(model->nimpls, nb*sizeof(int));
+	STARPU_ASSERT(new_nimpls);
+	model->nimpls = new_nimpls;
+
+	new_combs = (int*)realloc(model->combs, nb*sizeof(int));
+	STARPU_ASSERT(new_combs);
+	model->combs = new_combs;
+
+	/* Only the entries beyond the previously allocated size are new */
+	for(i = model->ncombs_set; i < nb; i++)
+	{
+		model->per_arch[i] = NULL;
+		model->nimpls[i] = 0;
+	}
+	model->ncombs_set = nb;
+}
+
 void starpu_perfmodel_init(FILE *f, struct starpu_perfmodel *model)
 {
 	STARPU_ASSERT(model && model->symbol);
@@ -601,12 +616,13 @@ void starpu_perfmodel_init(FILE *f, struct starpu_perfmodel *model)
 	STARPU_PTHREAD_RWLOCK_INIT(&model->model_rwlock, NULL);
 	if(model->type != STARPU_COMMON)
 	{
-		unsigned i;
+		int i;
 
 		model->per_arch = (struct starpu_perfmodel_per_arch**) malloc(nb_arch_combs*sizeof(struct starpu_perfmodel_per_arch*));
 		model->nimpls = (int *)malloc(nb_arch_combs*sizeof(int));
 		model->combs = (int*)malloc(nb_arch_combs*sizeof(int));
 		model->ncombs = 0;
+		model->ncombs_set = nb_arch_combs;
 
 		for(i = 0; i < nb_arch_combs; i++)
 		{
@@ -748,7 +764,10 @@ void _starpu_initialize_registered_performance_models(void)
 	for(i = 0; i < conf->topology.nhwmicdevices; i++)
 		nmic += conf->topology.nhwmiccores[i];
 	unsigned nscc = conf->topology.nhwscc;
-	nb_arch_combs = pow(2, (ncores + ncuda + nopencl + nmic + nscc));
+
+	// We used to allocate 2**(ncores + ncuda + nopencl + nmic + nscc), this is too big
+	// We now allocate only 2*(ncores + ncuda + nopencl + nmic + nscc), and reallocate when necessary in starpu_add_arch_comb
+	nb_arch_combs = 2 * (ncores + ncuda + nopencl + nmic + nscc);
 	arch_combs = (struct starpu_perfmodel_arch**) malloc(nb_arch_combs*sizeof(struct starpu_perfmodel_arch*));
 	current_arch_comb = 0;
 }
@@ -1194,7 +1213,14 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			}
 		}
 		if(!found)
+		{
+			if (model->ncombs + 1 >= model->ncombs_set)
+			{
+				// The number of combinations is bigger than the one which was initially allocated, we need to reallocate
+				_starpu_perfmodel_realloc(model, nb_arch_combs);
+			}
 			model->combs[model->ncombs++] = comb;
+		}
 
 		STARPU_PTHREAD_RWLOCK_WRLOCK(&model->model_rwlock);