Bläddra i källkod

- Provide an LU decomposition with implicit data dependencies
- Factorize the code to define LU's codelets and performance models

Cédric Augonnet 15 år sedan
förälder
incheckning
33cfda1300

+ 20 - 0
examples/Makefile.am

@@ -285,6 +285,26 @@ lu_lu_example_double_SOURCES =			\
 	lu/dlu_kernels.c			\
 	common/blas.c
 
+examplebin_PROGRAMS += 				\
+	lu/lu_implicit_example_float		\
+	lu/lu_implicit_example_double
+
+lu_lu_implicit_example_float_SOURCES =		\
+	lu/lu_example_float.c			\
+	lu/slu_implicit.c			\
+	lu/slu_implicit_pivot.c			\
+	lu/slu_kernels.c			\
+	common/blas.c
+
+lu_lu_implicit_example_double_SOURCES =		\
+	lu/lu_example_double.c			\
+	lu/dlu_implicit.c			\
+	lu/dlu_implicit_pivot.c			\
+	lu/dlu_kernels.c			\
+	common/blas.c
+
+
+
 endif
 
 

+ 18 - 0
examples/lu/dlu_implicit.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "xlu_implicit.c"

+ 18 - 0
examples/lu/dlu_implicit_pivot.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "xlu_implicit_pivot.c"

+ 18 - 0
examples/lu/slu_implicit.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "xlu_implicit.c"

+ 18 - 0
examples/lu/slu_implicit_pivot.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "xlu_implicit_pivot.c"

+ 3 - 84
examples/lu/xlu.c

@@ -46,27 +46,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
 	return task;
 }
 
-static struct starpu_perfmodel_t STARPU_LU(model_11) = {
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = STARPU_LU_STR(lu_model_11_atlas)
-#elif defined(STARPU_GOTO)
-	.symbol = STARPU_LU_STR(lu_model_11_goto)
-#else
-	.symbol = STARPU_LU_STR(lu_model_11)
-#endif
-};
-
-static starpu_codelet cl11 = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = STARPU_LU(cpu_u11),
-#ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_LU(cublas_u11),
-#endif
-	.nbuffers = 1,
-	.model = &STARPU_LU(model_11)
-};
-
 static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k)
 {
 //	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
@@ -91,27 +70,6 @@ static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k)
 	return task;
 }
 
-static struct starpu_perfmodel_t STARPU_LU(model_12) = {
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = STARPU_LU_STR(lu_model_12_atlas)
-#elif defined(STARPU_GOTO)
-	.symbol = STARPU_LU_STR(lu_model_12_goto)
-#else
-	.symbol = STARPU_LU_STR(lu_model_12)
-#endif
-};
-
-static starpu_codelet cl12 = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = STARPU_LU(cpu_u12),
-#ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_LU(cublas_u12),
-#endif
-	.nbuffers = 2,
-	.model = &STARPU_LU(model_12)
-};
-
 static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned j)
 {
 //	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i));
@@ -141,27 +99,6 @@ static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned j)
 	starpu_submit_task(task);
 }
 
-static struct starpu_perfmodel_t STARPU_LU(model_21) = {
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = STARPU_LU_STR(lu_model_21_atlas)
-#elif defined(STARPU_GOTO)
-	.symbol = STARPU_LU_STR(lu_model_21_goto)
-#else
-	.symbol = STARPU_LU_STR(lu_model_21)
-#endif
-};
-
-static starpu_codelet cl21 = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = STARPU_LU(cpu_u21),
-#ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_LU(cublas_u21),
-#endif
-	.nbuffers = 2,
-	.model = &STARPU_LU(model_21)
-};
-
 static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned i)
 {
 	struct starpu_task *task = create_task(TAG21(k, i));
@@ -189,27 +126,6 @@ static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned i)
 	starpu_submit_task(task);
 }
 
-static struct starpu_perfmodel_t STARPU_LU(model_22) = {
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = STARPU_LU_STR(lu_model_22_atlas)
-#elif defined(STARPU_GOTO)
-	.symbol = STARPU_LU_STR(lu_model_22_goto)
-#else
-	.symbol = STARPU_LU_STR(lu_model_22)
-#endif
-};
-
-static starpu_codelet cl22 = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = STARPU_LU(cpu_u22),
-#ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_LU(cublas_u22),
-#endif
-	.nbuffers = 3,
-	.model = &STARPU_LU(model_22)
-};
-
 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
 {
 //	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
@@ -315,6 +231,9 @@ void STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigne
 	 * one block is now determined by 2 unsigned (i,j) */
 	starpu_register_matrix_data(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
 
+	/* We already enforce deps by hand */
+	starpu_data_set_sequential_consistency_flag(dataA, 0);
+
 	starpu_filter f;
 		f.filter_func = starpu_vertical_block_filter_func;
 		f.filter_arg = nblocks;

+ 164 - 0
examples/lu/xlu_implicit.c

@@ -0,0 +1,164 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "xlu.h"
+#include "xlu_kernels.h"
+
+/* When non-zero, tasks keep their default priority instead of being raised to
+ * STARPU_MAX_PRIO. Nothing visible in this file changes it from 0. */
+static unsigned int no_prio = 0;
+
+/*
+ * Submit the cl11 task of step k: it works in place (read-write) on the
+ * diagonal block (k,k) of dataA. No tag or explicit dependency is attached
+ * to the task here.
+ */
+static void create_task_11(starpu_data_handle dataA, unsigned k)
+{
+	struct starpu_task *t = starpu_task_create();
+
+	t->cl = &cl11;
+
+	/* the only buffer is the diagonal block */
+	t->buffers[0].mode = STARPU_RW;
+	t->buffers[0].handle = starpu_get_sub_data(dataA, 2, k, k);
+
+	/* always raised to the top priority, unless priorities are disabled */
+	if (no_prio == 0) {
+		t->priority = STARPU_MAX_PRIO;
+	}
+
+	starpu_submit_task(t);
+}
+
+/*
+ * Submit the cl12 task of step k for index j: block (k,k) is read and block
+ * (j,k) is updated in place.
+ */
+static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned j)
+{
+	const int top_priority = !no_prio && (j == k+1);
+	struct starpu_task *t = starpu_task_create();
+
+	t->cl = &cl12;
+
+	/* read-only input first, then the block that gets modified */
+	t->buffers[0].handle = starpu_get_sub_data(dataA, 2, k, k);
+	t->buffers[0].mode = STARPU_R;
+	t->buffers[1].handle = starpu_get_sub_data(dataA, 2, j, k);
+	t->buffers[1].mode = STARPU_RW;
+
+	/* only the task with j == k+1 is raised to the top priority */
+	if (top_priority)
+		t->priority = STARPU_MAX_PRIO;
+
+	starpu_submit_task(t);
+}
+
+/*
+ * Submit the cl21 task of step k for index i: block (k,k) is read and block
+ * (k,i) is updated in place.
+ */
+static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned i)
+{
+	struct starpu_task *t = starpu_task_create();
+
+	t->cl = &cl21;
+
+	/* read-only input first, then the block that gets modified */
+	t->buffers[0].handle = starpu_get_sub_data(dataA, 2, k, k);
+	t->buffers[0].mode = STARPU_R;
+	t->buffers[1].handle = starpu_get_sub_data(dataA, 2, k, i);
+	t->buffers[1].mode = STARPU_RW;
+
+	/* only the task with i == k+1 is raised to the top priority */
+	if (no_prio == 0 && i == k+1) {
+		t->priority = STARPU_MAX_PRIO;
+	}
+
+	starpu_submit_task(t);
+}
+
+/*
+ * Submit the cl22 task of step k for indices (i,j): blocks (k,i) and (j,k)
+ * are read, block (j,i) is updated in place.
+ */
+static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
+{
+	const unsigned next = k + 1;
+	struct starpu_task *t = starpu_task_create();
+
+	t->cl = &cl22;
+
+	/* the two read-only inputs ... */
+	t->buffers[0].handle = starpu_get_sub_data(dataA, 2, k, i);
+	t->buffers[0].mode = STARPU_R;
+	t->buffers[1].handle = starpu_get_sub_data(dataA, 2, j, k);
+	t->buffers[1].mode = STARPU_R;
+	/* ... and the block that gets modified */
+	t->buffers[2].handle = starpu_get_sub_data(dataA, 2, j, i);
+	t->buffers[2].mode = STARPU_RW;
+
+	/* only the (k+1,k+1) update is raised to the top priority */
+	if (!no_prio && i == next && j == next)
+		t->priority = STARPU_MAX_PRIO;
+
+	starpu_submit_task(t);
+}
+
+/*
+ *	code to bootstrap the factorization
+ *
+ * Submits, step by step, the 11 task, then the 12/21 tasks, then the 22
+ * tasks of an nblocks x nblocks blocked matrix, waits until all of them have
+ * completed, and reports the elapsed time: the label goes to stderr, the
+ * value in ms to stdout, followed by a GFlops estimate on stderr.
+ *
+ * No dependency is declared between the tasks in this file; they are only
+ * submitted in this order.
+ */
+static void dw_codelet_facto_v3(starpu_data_handle dataA, unsigned nblocks)
+{
+	struct timeval start;
+	struct timeval end;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	gettimeofday(&start, NULL);
+
+	for (k = 0; k < nblocks; k++)
+	{
+		create_task_11(dataA, k);
+		
+		for (i = k+1; i<nblocks; i++)
+		{
+			create_task_12(dataA, k, i);
+			create_task_21(dataA, k, i);
+		}
+
+		for (i = k+1; i<nblocks; i++)
+		{
+			for (j = k+1; j<nblocks; j++)
+			{
+				create_task_22(dataA, k, i, j);
+			}
+		}
+	}
+
+	/* stall the application until the end of computations */
+	starpu_wait_all_tasks();
+
+	gettimeofday(&end, NULL);
+
+	/* Elapsed time in microseconds. Each difference is converted to double
+	 * before scaling so that the seconds-to-microseconds product is never
+	 * computed in integer arithmetic. */
+	double timing = (double)(end.tv_sec - start.tv_sec)*1000000.0
+			+ (double)(end.tv_usec - start.tv_usec);
+	fprintf(stderr, "Computation took (in ms)\n");
+	printf("%2.2f\n", timing/1000);
+
+	/* 2/3 n^3 floating point operations, timing being in microseconds */
+	unsigned n = starpu_get_matrix_nx(dataA);
+	double flop = (2.0f*n*n*n)/3.0f;
+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+}
+
+/*
+ * Entry point of the LU decomposition without pivoting.
+ *
+ * matA is registered as a size x size matrix of TYPE elements with leading
+ * dimension ld, split by two filters that both take nblocks as argument (so
+ * that a block is designated by two indices), processed by
+ * dw_codelet_facto_v3 and finally unpartitioned.
+ */
+void STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigned nblocks)
+{
+	starpu_data_handle dataA;
+	starpu_filter first_filter;
+	starpu_filter second_filter;
+
+	/* let StarPU monitor the whole matrix */
+	starpu_register_matrix_data(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
+
+	/* two levels of filtering: one block is determined by 2 unsigned (i,j) */
+	first_filter.filter_func = starpu_vertical_block_filter_func;
+	first_filter.filter_arg = nblocks;
+
+	second_filter.filter_func = starpu_block_filter_func;
+	second_filter.filter_arg = nblocks;
+
+	starpu_map_filters(dataA, 2, &first_filter, &second_filter);
+
+	/* submit all the tasks and wait for them */
+	dw_codelet_facto_v3(dataA, nblocks);
+
+	/* gather all the data */
+	starpu_unpartition_data(dataA, 0);
+}

+ 292 - 0
examples/lu/xlu_implicit_pivot.c

@@ -0,0 +1,292 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "xlu.h"
+#include "xlu_kernels.h"
+
+/* When non-zero, tasks keep their default priority instead of being raised to
+ * STARPU_MAX_PRIO. Nothing visible in this file changes it from 0. */
+static unsigned int no_prio = 0;
+
+/*
+ *	Construct the DAG
+ */
+
+/*
+ * Submit the cl_pivot task of step k for index i. It works in place
+ * (read-write) on the block returned by get_block(dataAp, nblocks, k, i) and
+ * receives &piv_description[k] as codelet argument.
+ */
+static void create_task_pivot(starpu_data_handle *dataAp, unsigned nblocks,
+					struct piv_s *piv_description,
+					unsigned k, unsigned i,
+					starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
+{
+	struct starpu_task *t = starpu_task_create();
+
+	t->cl = &cl_pivot;
+	t->cl_arg = piv_description + k;
+
+	/* the only buffer is the block being handled */
+	t->buffers[0].handle = get_block(dataAp, nblocks, k, i);
+	t->buffers[0].mode = STARPU_RW;
+
+	/* only the task with i == k+1 is raised to the top priority */
+	if (no_prio == 0 && i == k+1) {
+		t->priority = STARPU_MAX_PRIO;
+	}
+
+	starpu_submit_task(t);
+}
+
+/*
+ * Submit the cl11_pivot task of step k. It works in place (read-write) on
+ * the diagonal block returned by get_block(dataAp, nblocks, k, k) and
+ * receives &piv_description[k] as codelet argument.
+ */
+static void create_task_11_pivot(starpu_data_handle *dataAp, unsigned nblocks,
+					unsigned k, struct piv_s *piv_description,
+					starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
+{
+	struct starpu_task *t = starpu_task_create();
+
+	t->cl = &cl11_pivot;
+	t->cl_arg = piv_description + k;
+
+	/* the only buffer is the diagonal block */
+	t->buffers[0].handle = get_block(dataAp, nblocks, k, k);
+	t->buffers[0].mode = STARPU_RW;
+
+	/* always raised to the top priority, unless priorities are disabled */
+	if (no_prio == 0) {
+		t->priority = STARPU_MAX_PRIO;
+	}
+
+	starpu_submit_task(t);
+}
+
+/*
+ * Submit the cl12 task of step k for index j: the block returned by
+ * get_block(dataAp, nblocks, k, k) is read and the one returned by
+ * get_block(dataAp, nblocks, j, k) is updated in place.
+ */
+static void create_task_12(starpu_data_handle *dataAp, unsigned nblocks, unsigned k, unsigned j,
+		starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
+{
+	struct starpu_task *t = starpu_task_create();
+
+	t->cl = &cl12;
+
+	/* NOTE(review): no tag is assigned to the task in this file, so this
+	 * forwards whatever tag_id starpu_task_create() left there -- confirm
+	 * the kernel does not rely on its argument. */
+	t->cl_arg = (void *)(t->tag_id);
+
+	/* read-only input first, then the block that gets modified */
+	t->buffers[0].handle = get_block(dataAp, nblocks, k, k);
+	t->buffers[0].mode = STARPU_R;
+	t->buffers[1].handle = get_block(dataAp, nblocks, j, k);
+	t->buffers[1].mode = STARPU_RW;
+
+	/* only the task with j == k+1 is raised to the top priority */
+	if (no_prio == 0 && j == k+1)
+		t->priority = STARPU_MAX_PRIO;
+
+	starpu_submit_task(t);
+}
+
+/*
+ * Submit the cl21 task of step k for index i: the block returned by
+ * get_block(dataAp, nblocks, k, k) is read and the one returned by
+ * get_block(dataAp, nblocks, k, i) is updated in place.
+ */
+static void create_task_21(starpu_data_handle *dataAp, unsigned nblocks, unsigned k, unsigned i,
+				starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
+{
+	struct starpu_task *t = starpu_task_create();
+
+	t->cl = &cl21;
+
+	/* NOTE(review): no tag is assigned to the task in this file, so this
+	 * forwards whatever tag_id starpu_task_create() left there -- confirm
+	 * the kernel does not rely on its argument. */
+	t->cl_arg = (void *)(t->tag_id);
+
+	/* read-only input first, then the block that gets modified */
+	t->buffers[0].handle = get_block(dataAp, nblocks, k, k);
+	t->buffers[0].mode = STARPU_R;
+	t->buffers[1].handle = get_block(dataAp, nblocks, k, i);
+	t->buffers[1].mode = STARPU_RW;
+
+	/* only the task with i == k+1 is raised to the top priority */
+	if (no_prio == 0 && i == k+1)
+		t->priority = STARPU_MAX_PRIO;
+
+	starpu_submit_task(t);
+}
+
+/*
+ * Submit the cl22 task of step k for indices (i,j): the blocks returned by
+ * get_block for (k,i) and (j,k) are read, the one for (j,i) is updated in
+ * place.
+ */
+static void create_task_22(starpu_data_handle *dataAp, unsigned nblocks, unsigned k, unsigned i, unsigned j,
+				starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
+{
+	const unsigned next = k + 1;
+	struct starpu_task *t = starpu_task_create();
+
+	t->cl = &cl22;
+
+	/* NOTE(review): no tag is assigned to the task in this file, so this
+	 * forwards whatever tag_id starpu_task_create() left there -- confirm
+	 * the kernel does not rely on its argument. */
+	t->cl_arg = (void *)(t->tag_id);
+
+	/* the two read-only inputs ... */
+	t->buffers[0].handle = get_block(dataAp, nblocks, k, i);
+	t->buffers[0].mode = STARPU_R;
+	t->buffers[1].handle = get_block(dataAp, nblocks, j, k);
+	t->buffers[1].mode = STARPU_R;
+	/* ... and the block that gets modified */
+	t->buffers[2].handle = get_block(dataAp, nblocks, j, i);
+	t->buffers[2].mode = STARPU_RW;
+
+	/* only the (k+1,k+1) update is raised to the top priority */
+	if (!no_prio && i == next && j == next)
+		t->priority = STARPU_MAX_PRIO;
+
+	starpu_submit_task(t);
+}
+
+/*
+ *	code to bootstrap the factorization
+ *
+ * For every step k this submits the 11_pivot task, one pivot task for each
+ * index i != k, then the 12/21 tasks and finally the 22 tasks. Blocks are
+ * looked up through get_block, so the caller chooses how dataAp is laid out.
+ *
+ * The function only returns once all the tasks have completed; the value
+ * returned is the elapsed time in microseconds.
+ */
+static double dw_codelet_facto_pivot(starpu_data_handle *dataAp,
+					struct piv_s *piv_description,
+					unsigned nblocks,
+					starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
+{
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+	for (k = 0; k < nblocks; k++)
+	{
+		create_task_11_pivot(dataAp, nblocks, k, piv_description, get_block);
+
+		for (i = 0; i < nblocks; i++)
+		{
+			if (i != k)
+				create_task_pivot(dataAp, nblocks, piv_description, k, i, get_block);
+		}
+	
+		for (i = k+1; i<nblocks; i++)
+		{
+			create_task_12(dataAp, nblocks, k, i, get_block);
+			create_task_21(dataAp, nblocks, k, i, get_block);
+		}
+
+		for (i = k+1; i<nblocks; i++)
+		{
+			for (j = k+1; j<nblocks; j++)
+			{
+				create_task_22(dataAp, nblocks, k, i, j, get_block);
+			}
+		}
+	}
+
+	/* stall the application until the end of computations */
+	starpu_wait_all_tasks();
+
+	gettimeofday(&end, NULL);
+
+	/* Each difference is converted to double before scaling so that the
+	 * seconds-to-microseconds product is never computed in integer
+	 * arithmetic. */
+	double timing = (double)(end.tv_sec - start.tv_sec)*1000000.0
+			+ (double)(end.tv_usec - start.tv_usec);
+	return timing;
+}
+
+/*
+ * Block accessor for a single matrix handle split with filters: *dataAp is
+ * the handle and the block is its sub-data at indices (j, i). nblocks is
+ * not needed for this lookup.
+ */
+starpu_data_handle get_block_with_striding(starpu_data_handle *dataAp,
+			unsigned nblocks __attribute__((unused)), unsigned j, unsigned i)
+{
+	starpu_data_handle whole_matrix = *dataAp;
+
+	/* we use filters */
+	return starpu_get_sub_data(whole_matrix, 2, j, i);
+}
+
+
+/*
+ * Entry point of the LU decomposition with pivoting on a single matrix.
+ *
+ * matA is registered as a size x size matrix of TYPE elements with leading
+ * dimension ld and split by two filters that both take nblocks as argument.
+ * ipiv must hold size entries; it is reset to 0, 1, ..., size-1 before any
+ * task is submitted and is handed to the tasks through the pivot
+ * descriptors. Timing information is printed on stderr.
+ *
+ * The process is aborted if the pivot descriptors cannot be allocated.
+ */
+void STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
+{
+	starpu_data_handle dataA;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_register_matrix_data(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
+
+	starpu_filter f;
+		f.filter_func = starpu_vertical_block_filter_func;
+		f.filter_arg = nblocks;
+
+	starpu_filter f2;
+		f2.filter_func = starpu_block_filter_func;
+		f2.filter_arg = nblocks;
+
+	starpu_map_filters(dataA, 2, &f, &f2);
+
+	/* start from the identity permutation */
+	unsigned i;
+	for (i = 0; i < size; i++)
+		ipiv[i] = i;
+
+	/* one descriptor per step: they all share ipiv and carry the bounds
+	 * block * (size / nblocks) and (block + 1) * (size / nblocks) */
+	struct piv_s *piv_description = malloc(nblocks*sizeof(struct piv_s));
+	if (piv_description == NULL && nblocks != 0)
+	{
+		fprintf(stderr, "lu_decomposition_pivot: could not allocate the pivot descriptors\n");
+		abort();
+	}
+
+	unsigned block;
+	for (block = 0; block < nblocks; block++)
+	{
+		piv_description[block].piv = ipiv;
+		piv_description[block].first = block * (size / nblocks);
+		piv_description[block].last = (block + 1) * (size / nblocks);
+	}
+
+	double timing;
+	timing = dw_codelet_facto_pivot(&dataA, piv_description, nblocks, get_block_with_striding);
+
+	/* dw_codelet_facto_pivot only returns once every task has completed,
+	 * so no task can still be using the descriptors */
+	free(piv_description);
+
+	fprintf(stderr, "Computation took (in ms)\n");
+	fprintf(stderr, "%2.2f\n", timing/1000);
+
+	/* 2/3 n^3 floating point operations, timing being in microseconds */
+	unsigned n = starpu_get_matrix_nx(dataA);
+	double flop = (2.0f*n*n*n)/3.0f;
+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+
+	/* gather all the data */
+	starpu_unpartition_data(dataA, 0);
+}
+
+
+/*
+ * Block accessor for separately registered blocks: dataAp is an array of
+ * nblocks*nblocks handles and block (j, i) is stored at index i + j*nblocks.
+ */
+starpu_data_handle get_block_with_no_striding(starpu_data_handle *dataAp, unsigned nblocks, unsigned j, unsigned i)
+{
+	/* dataAp is an array of data handle */
+	const unsigned position = i+j*nblocks;
+
+	return dataAp[position];
+}
+
+/*
+ * Entry point of the LU decomposition with pivoting on a matrix stored as
+ * separate blocks.
+ *
+ * matA is an array of nblocks*nblocks block pointers; each block is
+ * registered on its own as a (size/nblocks) x (size/nblocks) matrix of TYPE
+ * elements whose leading dimension is size/nblocks. ld is not used here.
+ * ipiv must hold size entries; it is reset to 0, 1, ..., size-1 before any
+ * task is submitted. Timing information is printed on stderr.
+ *
+ * The process is aborted if the temporary arrays cannot be allocated.
+ */
+void STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
+{
+	starpu_data_handle *dataAp = malloc(nblocks*nblocks*sizeof(starpu_data_handle));
+	struct piv_s *piv_description = malloc(nblocks*sizeof(struct piv_s));
+	if ((dataAp == NULL || piv_description == NULL) && nblocks != 0)
+	{
+		fprintf(stderr, "lu_decomposition_pivot_no_stride: could not allocate the block handles and pivot descriptors\n");
+		abort();
+	}
+
+	/* monitor every block of the A matrix with its own handle :
+	 * one block is determined by 2 unsigned (i,j) */
+	unsigned bi, bj;
+	for (bj = 0; bj < nblocks; bj++)
+	for (bi = 0; bi < nblocks; bi++)
+	{
+		starpu_register_matrix_data(&dataAp[bi+nblocks*bj], 0,
+			(uintptr_t)matA[bi+nblocks*bj], size/nblocks,
+			size/nblocks, size/nblocks, sizeof(TYPE));
+	}
+
+	/* start from the identity permutation */
+	unsigned i;
+	for (i = 0; i < size; i++)
+		ipiv[i] = i;
+
+	/* one descriptor per step: they all share ipiv and carry the bounds
+	 * block * (size / nblocks) and (block + 1) * (size / nblocks) */
+	unsigned block;
+	for (block = 0; block < nblocks; block++)
+	{
+		piv_description[block].piv = ipiv;
+		piv_description[block].first = block * (size / nblocks);
+		piv_description[block].last = (block + 1) * (size / nblocks);
+	}
+
+	double timing;
+	timing = dw_codelet_facto_pivot(dataAp, piv_description, nblocks, get_block_with_no_striding);
+
+	/* dw_codelet_facto_pivot only returns once every task has completed,
+	 * so no task can still be using the descriptors */
+	free(piv_description);
+
+	fprintf(stderr, "Computation took (in ms)\n");
+	fprintf(stderr, "%2.2f\n", timing/1000);
+
+	/* 2/3 n^3 floating point operations, timing being in microseconds */
+	unsigned n = starpu_get_matrix_nx(dataAp[0])*nblocks;
+	double flop = (2.0f*n*n*n)/3.0f;
+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+
+	for (bj = 0; bj < nblocks; bj++)
+	for (bi = 0; bi < nblocks; bi++)
+	{
+		starpu_delete_data(dataAp[bi+nblocks*bj]);
+	}
+
+	/* the handles are gone, the array that held them can go too */
+	free(dataAp);
+}

+ 131 - 2
examples/lu/xlu_kernels.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,6 +17,10 @@
 #include "xlu.h"
 #include <math.h>
 
+#define str(s) #s
+#define xstr(s)        str(s)
+#define STARPU_LU_STR(name)  xstr(STARPU_LU(name))
+
 /*
  *   U22 
  */
@@ -81,6 +85,27 @@ void STARPU_LU(cublas_u22)(void *descr[], void *_args)
 }
 #endif// STARPU_USE_CUDA
 
+/* History-based performance model of the U22 kernel. The symbol it is
+ * recorded under depends on whether STARPU_ATLAS or STARPU_GOTO is defined. */
+static struct starpu_perfmodel_t STARPU_LU(model_22) = {
+#if defined(STARPU_ATLAS)
+	.symbol = STARPU_LU_STR(lu_model_22_atlas),
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_LU_STR(lu_model_22_goto),
+#else
+	.symbol = STARPU_LU_STR(lu_model_22),
+#endif
+	.type = STARPU_HISTORY_BASED
+};
+
+/* U22 codelet, shared by all the LU drivers: three buffers, a CPU
+ * implementation and, when built with STARPU_USE_CUDA, a CUDA one. */
+starpu_codelet cl22 = {
+	.nbuffers = 3,
+	.model = &STARPU_LU(model_22),
+#ifdef STARPU_USE_CUDA
+	.cuda_func = STARPU_LU(cublas_u22),
+#endif
+	.cpu_func = STARPU_LU(cpu_u22),
+	.where = STARPU_CPU|STARPU_CUDA
+};
+
 /*
  * U12
  */
@@ -143,6 +168,27 @@ void STARPU_LU(cublas_u12)(void *descr[], void *_args)
 }
 #endif // STARPU_USE_CUDA
 
+/* History-based performance model of the U12 kernel. The symbol it is
+ * recorded under depends on whether STARPU_ATLAS or STARPU_GOTO is defined. */
+static struct starpu_perfmodel_t STARPU_LU(model_12) = {
+#if defined(STARPU_ATLAS)
+	.symbol = STARPU_LU_STR(lu_model_12_atlas),
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_LU_STR(lu_model_12_goto),
+#else
+	.symbol = STARPU_LU_STR(lu_model_12),
+#endif
+	.type = STARPU_HISTORY_BASED
+};
+
+/* U12 codelet, shared by all the LU drivers: two buffers, a CPU
+ * implementation and, when built with STARPU_USE_CUDA, a CUDA one. */
+starpu_codelet cl12 = {
+	.nbuffers = 2,
+	.model = &STARPU_LU(model_12),
+#ifdef STARPU_USE_CUDA
+	.cuda_func = STARPU_LU(cublas_u12),
+#endif
+	.cpu_func = STARPU_LU(cpu_u12),
+	.where = STARPU_CPU|STARPU_CUDA
+};
+
 /* 
  * U21
  */
@@ -203,6 +249,27 @@ void STARPU_LU(cublas_u21)(void *descr[], void *_args)
 }
 #endif 
 
+/* History-based performance model of the U21 kernel. The symbol it is
+ * recorded under depends on whether STARPU_ATLAS or STARPU_GOTO is defined. */
+static struct starpu_perfmodel_t STARPU_LU(model_21) = {
+#if defined(STARPU_ATLAS)
+	.symbol = STARPU_LU_STR(lu_model_21_atlas),
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_LU_STR(lu_model_21_goto),
+#else
+	.symbol = STARPU_LU_STR(lu_model_21),
+#endif
+	.type = STARPU_HISTORY_BASED
+};
+
+/* U21 codelet, shared by all the LU drivers: two buffers, a CPU
+ * implementation and, when built with STARPU_USE_CUDA, a CUDA one. */
+starpu_codelet cl21 = {
+	.nbuffers = 2,
+	.model = &STARPU_LU(model_21),
+#ifdef STARPU_USE_CUDA
+	.cuda_func = STARPU_LU(cublas_u21),
+#endif
+	.cpu_func = STARPU_LU(cpu_u21),
+	.where = STARPU_CPU|STARPU_CUDA
+};
+
 /*
  *	U11
  */
@@ -275,6 +342,27 @@ void STARPU_LU(cublas_u11)(void *descr[], void *_args)
 }
 #endif// STARPU_USE_CUDA
 
+/* History-based performance model of the U11 kernel. The symbol it is
+ * recorded under depends on whether STARPU_ATLAS or STARPU_GOTO is defined. */
+static struct starpu_perfmodel_t STARPU_LU(model_11) = {
+#if defined(STARPU_ATLAS)
+	.symbol = STARPU_LU_STR(lu_model_11_atlas),
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_LU_STR(lu_model_11_goto),
+#else
+	.symbol = STARPU_LU_STR(lu_model_11),
+#endif
+	.type = STARPU_HISTORY_BASED
+};
+
+/* U11 codelet, shared by all the LU drivers: one buffer, a CPU
+ * implementation and, when built with STARPU_USE_CUDA, a CUDA one. */
+starpu_codelet cl11 = {
+	.nbuffers = 1,
+	.model = &STARPU_LU(model_11),
+#ifdef STARPU_USE_CUDA
+	.cuda_func = STARPU_LU(cublas_u11),
+#endif
+	.cpu_func = STARPU_LU(cpu_u11),
+	.where = STARPU_CPU|STARPU_CUDA
+};
+
 /*
  *	U11 with pivoting
  */
@@ -390,6 +478,27 @@ void STARPU_LU(cublas_u11_pivot)(void *descr[], void *_args)
 }
 #endif// STARPU_USE_CUDA
 
+/* History-based performance model of the U11 kernel with pivoting. The
+ * symbol it is recorded under depends on whether STARPU_ATLAS or STARPU_GOTO
+ * is defined. */
+static struct starpu_perfmodel_t STARPU_LU(model_11_pivot) = {
+#if defined(STARPU_ATLAS)
+	.symbol = STARPU_LU_STR(lu_model_11_pivot_atlas),
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_LU_STR(lu_model_11_pivot_goto),
+#else
+	.symbol = STARPU_LU_STR(lu_model_11_pivot),
+#endif
+	.type = STARPU_HISTORY_BASED
+};
+
+/* U11-with-pivoting codelet, shared by the pivoting LU drivers: one buffer,
+ * a CPU implementation and, when built with STARPU_USE_CUDA, a CUDA one. */
+starpu_codelet cl11_pivot = {
+	.nbuffers = 1,
+	.model = &STARPU_LU(model_11_pivot),
+#ifdef STARPU_USE_CUDA
+	.cuda_func = STARPU_LU(cublas_u11_pivot),
+#endif
+	.cpu_func = STARPU_LU(cpu_u11_pivot),
+	.where = STARPU_CPU|STARPU_CUDA
+};
+
 /*
  *	Pivoting
  */
@@ -452,6 +561,26 @@ void STARPU_LU(cublas_pivot)(void *descr[], void *_args)
 {
 	STARPU_LU(common_pivot)(descr, 1, _args);
 }
-#endif// STARPU_USE_CUDA
 
+static struct starpu_perfmodel_t STARPU_LU(model_pivot) = {
+	.type = STARPU_HISTORY_BASED,
+#ifdef STARPU_ATLAS
+	.symbol = STARPU_LU_STR(lu_model_pivot_atlas)
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_LU_STR(lu_model_pivot_goto)
+#else
+	.symbol = STARPU_LU_STR(lu_model_pivot)
+#endif
+};
+
+starpu_codelet cl_pivot = {
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_func = STARPU_LU(cpu_pivot),
+#ifdef STARPU_USE_CUDA
+	.cuda_func = STARPU_LU(cublas_pivot),
+#endif
+	.nbuffers = 1,
+	.model = &STARPU_LU(model_pivot)
+};
 
+#endif// STARPU_USE_CUDA

+ 7 - 4
examples/lu/xlu_kernels.h

@@ -19,10 +19,6 @@
 
 #include <starpu.h>
 
-#define str(s) #s
-#define xstr(s)        str(s)
-#define STARPU_LU_STR(name)  xstr(STARPU_LU(name))
-
 void STARPU_LU(cpu_pivot)(void *descr[], void *_args);
 void STARPU_LU(cpu_u11_pivot)(void *descr[], void *_args);
 void STARPU_LU(cpu_u11)(void *descr[], void *_args);
@@ -39,4 +35,11 @@ void STARPU_LU(cublas_u21)(void *descr[], void *_args);
 void STARPU_LU(cublas_u22)(void *descr[], void *_args);
 #endif
 
+extern starpu_codelet cl11;
+extern starpu_codelet cl11_pivot;
+extern starpu_codelet cl12;
+extern starpu_codelet cl21;
+extern starpu_codelet cl22;
+extern starpu_codelet cl_pivot;
+
 #endif // __XLU_KERNELS_H__

+ 6 - 117
examples/lu/xlu_pivot.c

@@ -30,9 +30,6 @@
 
 static unsigned no_prio = 0;
 
-
-
-
 /*
  *	Construct the DAG
  */
@@ -48,28 +45,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
 	return task;
 }
 
-
-static struct starpu_perfmodel_t STARPU_LU(model_pivot) = {
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = STARPU_LU_STR(lu_model_pivot_atlas)
-#elif defined(STARPU_GOTO)
-	.symbol = STARPU_LU_STR(lu_model_pivot_goto)
-#else
-	.symbol = STARPU_LU_STR(lu_model_pivot)
-#endif
-};
-
-static starpu_codelet cl_pivot = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = STARPU_LU(cpu_pivot),
-#ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_LU(cublas_pivot),
-#endif
-	.nbuffers = 1,
-	.model = &STARPU_LU(model_pivot)
-};
-
 static void create_task_pivot(starpu_data_handle *dataAp, unsigned nblocks,
 					struct piv_s *piv_description,
 					unsigned k, unsigned i,
@@ -116,27 +91,6 @@ static void create_task_pivot(starpu_data_handle *dataAp, unsigned nblocks,
 	starpu_submit_task(task);
 }
 
-static struct starpu_perfmodel_t STARPU_LU(model_11_pivot) = {
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = STARPU_LU_STR(lu_model_11_pivot_atlas)
-#elif defined(STARPU_GOTO)
-	.symbol = STARPU_LU_STR(lu_model_11_pivot_goto)
-#else
-	.symbol = STARPU_LU_STR(lu_model_11_pivot)
-#endif
-};
-
-static starpu_codelet cl11_pivot = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = STARPU_LU(cpu_u11_pivot),
-#ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_LU(cublas_u11_pivot),
-#endif
-	.nbuffers = 1,
-	.model = &STARPU_LU(model_11_pivot)
-};
-
 static struct starpu_task *create_task_11_pivot(starpu_data_handle *dataAp, unsigned nblocks,
 					unsigned k, struct piv_s *piv_description,
 					starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
@@ -163,27 +117,6 @@ static struct starpu_task *create_task_11_pivot(starpu_data_handle *dataAp, unsi
 	return task;
 }
 
-static struct starpu_perfmodel_t STARPU_LU(model_12) = {
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = STARPU_LU_STR(lu_model_12_atlas)
-#elif defined(STARPU_GOTO)
-	.symbol = STARPU_LU_STR(lu_model_12_goto)
-#else
-	.symbol = STARPU_LU_STR(lu_model_12)
-#endif
-};
-
-static starpu_codelet cl12 = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = STARPU_LU(cpu_u12),
-#ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_LU(cublas_u12),
-#endif
-	.nbuffers = 2,
-	.model = &STARPU_LU(model_12)
-};
-
 static void create_task_12(starpu_data_handle *dataAp, unsigned nblocks, unsigned k, unsigned j,
 		starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
 {
@@ -219,27 +152,6 @@ static void create_task_12(starpu_data_handle *dataAp, unsigned nblocks, unsigne
 	starpu_submit_task(task);
 }
 
-static struct starpu_perfmodel_t STARPU_LU(model_21) = {
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = STARPU_LU_STR(lu_model_21_atlas)
-#elif defined(STARPU_GOTO)
-	.symbol = STARPU_LU_STR(lu_model_21_goto)
-#else
-	.symbol = STARPU_LU_STR(lu_model_21)
-#endif
-};
-
-static starpu_codelet cl21 = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = STARPU_LU(cpu_u21),
-#ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_LU(cublas_u21),
-#endif
-	.nbuffers = 2,
-	.model = &STARPU_LU(model_21)
-};
-
 static void create_task_21(starpu_data_handle *dataAp, unsigned nblocks, unsigned k, unsigned i,
 				starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
 {
@@ -261,39 +173,10 @@ static void create_task_21(starpu_data_handle *dataAp, unsigned nblocks, unsigne
 
 	/* enforce dependencies ... */
 	starpu_tag_declare_deps(TAG21(k, i), 1, PIVOT(k, i));
-#if 0
-	if (k > 0) {
-		starpu_tag_declare_deps(TAG21(k, i), 3, TAG11(k), TAG22(k-1, k, i), PIVOT(k, i));
-	}
-	else {
-		starpu_tag_declare_deps(TAG21(k, i), 2, TAG11(k), PIVOT(k, i));
-	}
-#endif
 
 	starpu_submit_task(task);
 }
 
-static struct starpu_perfmodel_t STARPU_LU(model_22) = {
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = STARPU_LU_STR(lu_model_22_atlas)
-#elif defined(STARPU_GOTO)
-	.symbol = STARPU_LU_STR(lu_model_22_goto)
-#else
-	.symbol = STARPU_LU_STR(lu_model_22)
-#endif
-};
-
-static starpu_codelet cl22 = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = STARPU_LU(cpu_u22),
-#ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_LU(cublas_u22),
-#endif
-	.nbuffers = 3,
-	.model = &STARPU_LU(model_22)
-};
-
 static void create_task_22(starpu_data_handle *dataAp, unsigned nblocks, unsigned k, unsigned i, unsigned j,
 				starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
 {
@@ -427,6 +310,9 @@ void STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size
 	 * one block is now determined by 2 unsigned (i,j) */
 	starpu_register_matrix_data(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
 
+	/* We already enforce deps by hand */
+	starpu_data_set_sequential_consistency_flag(dataA, 0);
+
 	starpu_filter f;
 		f.filter_func = starpu_vertical_block_filter_func;
 		f.filter_arg = nblocks;
@@ -493,6 +379,9 @@ void STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, un
 		starpu_register_matrix_data(&dataAp[bi+nblocks*bj], 0,
 			(uintptr_t)matA[bi+nblocks*bj], size/nblocks,
 			size/nblocks, size/nblocks, sizeof(TYPE));
+
+		/* We already enforce deps by hand */
+		starpu_data_set_sequential_consistency_flag(dataAp[bi+nblocks*bj], 0);
 	}
 
 	unsigned i;