Andra Hugo 14 years ago
parent
commit
6e0a58abff

+ 204 - 20
examples/cholesky_and_lu/cholesky_and_lu.c

@@ -7,14 +7,12 @@ typedef struct {
   char **argv;
 } params;
 
-#define NSAMPLES 10
+#define NSAMPLES 20
+
 struct starpu_sched_ctx sched_ctx;
 struct starpu_sched_ctx sched_ctx2;
-
-/*conf bonne
-  int procs[] = {0, 2, 3, 4, 11};
-  int procs[] = {1, 5, 6, 7, 8, 9, 10};
-*/
+struct starpu_sched_ctx sched_ctx3;
+struct starpu_sched_ctx sched_ctx4;
 
 void* func_cholesky(void *val){
   params *p = (params*)val;
@@ -71,39 +69,66 @@ void* func_cholesky3(void *val){
 void* func_lu(void *val){
   params *p = (params*)val;
 
-  int procs2[] = {0, 4, 5, 6, 7, 8, 9, 10, 11};
-  starpu_create_sched_ctx(&sched_ctx2, "heft", procs2, 9, "lu");
+  int procs2[] = {0, 7, 8, 9, 10, 11};
+  starpu_create_sched_ctx(&sched_ctx3, "heft", procs2, 6, "lu");
 
   int i;
   double *flops = (double*)malloc(sizeof(double));
   (*flops) = 0;
   for(i = 0; i < NSAMPLES; i++)
     {
-      printf("%d ", i);
-      (*flops) += run_lu(&sched_ctx2, p->argc, p->argv);
+      (*flops) += run_lu(&sched_ctx3, p->argc, p->argv);
     }
 
   (*flops) /= NSAMPLES;
   return (void*)flops;
 }
 
-int main(int argc, char **argv)
-{
-  params p;
-  p.argc = argc;
-  p.argv = argv;
+void* func_lu2(void *val){
+  params *p = (params*)val;
+
+  int procs2[] = {1, 2, 3, 4, 5, 6};
+  starpu_create_sched_ctx(&sched_ctx4, "heft", procs2, 6, "lu2");
+
+  int i;
+  double *flops = (double*)malloc(sizeof(double));
+  (*flops) = 0;
+  for(i = 0; i < NSAMPLES; i++)
+    {
+      (*flops) += run_lu(&sched_ctx4, p->argc, p->argv);
+    }
 
+  (*flops) /= NSAMPLES;
+  return (void*)flops;
+}
+
+void* func_lu3(void *val){
+  params *p = (params*)val;
+
+  int i;
+  double *flops = (double*)malloc(sizeof(double));
+  (*flops) = 0;
+  for(i = 0; i < NSAMPLES; i++)
+    {
+      (*flops) += run_lu(NULL, p->argc, p->argv);
+    }
+
+  (*flops) /= NSAMPLES;
+  return (void*)flops;
+}
+
+void cholesky_vs_cholesky(params *p){
+  /* 2 cholesky in different ctxs */
   starpu_init(NULL);
   starpu_helper_cublas_init();
 
   pthread_t tid[2];
 
-  pthread_create(&tid[0], NULL, (void*)func_cholesky, (void*)&p);
-  pthread_create(&tid[1], NULL, (void*)func_cholesky2, (void*)&p);
+  pthread_create(&tid[0], NULL, (void*)func_cholesky, (void*)p);
+  pthread_create(&tid[1], NULL, (void*)func_cholesky2, (void*)p);
 
   void *gflops_cholesky1;
   void *gflops_cholesky2;
-  //  void *gflops_lu = func_lu(&p);
  
   pthread_join(tid[0], &gflops_cholesky1);
   pthread_join(tid[1], &gflops_cholesky2);
@@ -111,15 +136,174 @@ int main(int argc, char **argv)
   starpu_helper_cublas_shutdown();
   starpu_shutdown();
 
+  /* 1 cholesky all alone on the whole machine */
+  starpu_init(NULL);
+  starpu_helper_cublas_init();
+
+  void *gflops_cholesky3 = func_cholesky3(p);
+
+  starpu_helper_cublas_shutdown();
+  starpu_shutdown();
+
+
+  /* 2 cholesky in a single ctx */
+  starpu_init(NULL);
+  starpu_helper_cublas_init();
+
+  pthread_t tid2[2];
+
+  pthread_create(&tid2[0], NULL, (void*)func_cholesky3, (void*)p);
+  pthread_create(&tid2[1], NULL, (void*)func_cholesky3, (void*)p);
+
+  void *gflops_cholesky4;
+  void *gflops_cholesky5;
+ 
+  pthread_join(tid2[0], &gflops_cholesky4);
+  pthread_join(tid2[1], &gflops_cholesky5);
+
+  starpu_helper_cublas_shutdown();
+  starpu_shutdown();
+
+  printf("%2.2f %2.2f %2.2f %2.2f %2.2f\n", *((double*)gflops_cholesky1), *((double*)gflops_cholesky2), *((double*)gflops_cholesky3), *((double*)gflops_cholesky4), *((double*)gflops_cholesky5));
+
+  free(gflops_cholesky1);
+  free(gflops_cholesky2);
+  free(gflops_cholesky3);
+  free(gflops_cholesky4);
+  free(gflops_cholesky5);
+}
+
+void cholesky_vs_lu(params *p){
+  /* one cholesky and one lu each one in its own context */
   starpu_init(NULL);
   starpu_helper_cublas_init();
 
-  void *gflops_cholesky3 = func_cholesky3(&p);
+  pthread_t tid[2];
+
+  pthread_create(&tid[0], NULL, (void*)func_cholesky, (void*)p);
+  pthread_create(&tid[1], NULL, (void*)func_lu, (void*)p);
+
+  void *gflops_cholesky;
+  void *gflops_lu;
+ 
+  pthread_join(tid[0], &gflops_cholesky);
+  pthread_join(tid[1], &gflops_lu);
 
   starpu_helper_cublas_shutdown();
   starpu_shutdown();
 
-  printf("%2.2f %2.2f %2.2f\n", *((double*)gflops_cholesky1), *((double*)gflops_cholesky2), *((double*)gflops_cholesky3));
+  /*one cholesky and one lu mixed in a single context*/
+  starpu_init(NULL);
+  starpu_helper_cublas_init();
+
+  pthread_t tid2[2];
+
+  pthread_create(&tid2[0], NULL, (void*)func_cholesky3, (void*)p);
+  pthread_create(&tid2[1], NULL, (void*)func_lu3, (void*)p);
+
+  void *gflops_cholesky2;
+  void *gflops_lu2;
+ 
+  pthread_join(tid2[0], &gflops_cholesky2);
+  pthread_join(tid2[1], &gflops_lu2);
+
+  starpu_helper_cublas_shutdown();
+  starpu_shutdown();
+
+
+  /* 1 lu all alone on the whole machine */
+  starpu_init(NULL);
+  starpu_helper_cublas_init();
+
+  void *gflops_lu3 = func_lu3(p);
+
+  starpu_helper_cublas_shutdown();
+  starpu_shutdown();
+
+  /* 1 cholesky all alone on the whole machine */
+  starpu_init(NULL);
+  starpu_helper_cublas_init();
+
+  void *gflops_cholesky3 = func_cholesky3(p);
+
+  starpu_helper_cublas_shutdown();
+  starpu_shutdown();
+
+  printf("%2.2f %2.2f %2.2f %2.2f %2.2f %2.2f \n", *((double*)gflops_cholesky), *((double*)gflops_lu), *((double*)gflops_cholesky2), *((double*)gflops_lu2), *((double*)gflops_cholesky3), *((double*)gflops_lu3));
+
+  free(gflops_cholesky);
+  free(gflops_cholesky2);
+  free(gflops_cholesky3);
+  free(gflops_lu);
+  free(gflops_lu2);
+  free(gflops_lu3);
+}
+
+void lu_vs_lu(params *p){
+  /* 2 lu in different ctxs */
+  starpu_init(NULL);
+  starpu_helper_cublas_init();
+
+  pthread_t tid[2];
+
+  pthread_create(&tid[0], NULL, (void*)func_lu, (void*)p);
+  pthread_create(&tid[1], NULL, (void*)func_lu2, (void*)p);
+
+  void *gflops_lu1;
+  void *gflops_lu2;
+ 
+  pthread_join(tid[0], &gflops_lu1);
+  pthread_join(tid[1], &gflops_lu2);
+
+  starpu_helper_cublas_shutdown();
+  starpu_shutdown();
+
+  /* 1 lu all alone on the whole machine */
+  starpu_init(NULL);
+  starpu_helper_cublas_init();
+
+  void *gflops_lu3 = func_lu3(p);
+
+  starpu_helper_cublas_shutdown();
+  starpu_shutdown();
+
+  //   printf("%2.2f\n", *((double*)gflops_lu3));
+  printf("%2.2f %2.2f %2.2f\n", *((double*)gflops_lu1), *((double*)gflops_lu2), *((double*)gflops_lu3));
+
+  /* /\* 2 lu in a single ctx *\/ */
+  /* starpu_init(NULL); */
+  /* starpu_helper_cublas_init(); */
+
+  /* pthread_t tid2[2]; */
+
+  /* pthread_create(&tid2[0], NULL, (void*)func_lu3, (void*)p); */
+  /* pthread_create(&tid2[1], NULL, (void*)func_lu3, (void*)p); */
+
+  /* void *gflops_lu4; */
+  /* void *gflops_lu5; */
+ 
+  /* pthread_join(tid2[0], &gflops_lu4); */
+  /* pthread_join(tid2[1], &gflops_lu5); */
+
+  /* starpu_helper_cublas_shutdown(); */
+  /* starpu_shutdown(); */
+
+  /* printf("%2.2f %2.2f %2.2f %2.2f %2.2f\n", *((double*)gflops_lu1), *((double*)gflops_lu2), *((double*)gflops_lu3), *((double*)gflops_lu4), *((double*)gflops_lu5)); */
+
+  /* free(gflops_lu1); */
+  /* free(gflops_lu2); */
+  /* free(gflops_lu3); */
+  /* free(gflops_lu4); */
+  /* free(gflops_lu5); */
+}
+
+int main(int argc, char **argv)
+{
+  params p;
+  p.argc = argc;
+  p.argv = argv;
+
+  lu_vs_lu(&p);
 
   return 0;
 }

+ 24 - 23
examples/cholesky_and_lu/lu/lu_example.c

@@ -36,11 +36,6 @@ static unsigned bound = 0;
 static unsigned bounddeps = 0;
 static unsigned boundprio = 0;
 
-TYPE *A;
-TYPE *A_saved;
-
-/* in case we use non-strided blocks */
-TYPE **A_blocks;
 
 static void lu_parse_args(int argc, char **argv)
 {
@@ -105,7 +100,7 @@ static void display_matrix(TYPE *m, unsigned n, unsigned ld, char *str)
 #endif
 }
 
-void copy_blocks_into_matrix()
+void copy_blocks_into_matrix(TYPE *A, TYPE **A_blocks)
 {
 	unsigned blocklu_size = (lu_size/lu_nblocks);
 
@@ -127,7 +122,7 @@ void copy_blocks_into_matrix()
 
 
 
-void copy_matrix_into_blocks()
+void copy_matrix_into_blocks(TYPE *A, TYPE **A_blocks)
 {
 	unsigned blocklu_size = (lu_size/lu_nblocks);
 
@@ -147,11 +142,11 @@ void copy_matrix_into_blocks()
 	}
 }
 
-static void init_matrix()
+static void init_matrix(TYPE **A)
 {
 	/* allocate matrix */
-	starpu_data_malloc_pinned_if_possible((void **)&A, (size_t)lu_size*lu_size*sizeof(TYPE));
-	STARPU_ASSERT(A);
+	starpu_data_malloc_pinned_if_possible((void **)A, (size_t)lu_size*lu_size*sizeof(TYPE));
+	STARPU_ASSERT(*A);
 
 	starpu_srand48((long int)time(NULL));
 	//	starpu_srand48(0);
@@ -162,13 +157,13 @@ static void init_matrix()
 	{
 		for (i = 0; i < lu_size; i++)
 		{
-		  A[i + j*lu_size] = (TYPE)starpu_drand48();
+		  (*A)[i + j*lu_size] = (TYPE)starpu_drand48();
 		}
 	}
 
 }
 
-static void save_matrix()
+static void save_matrix(TYPE *A, TYPE *A_saved)
 {
 	A_saved = malloc((size_t)lu_size*lu_size*sizeof(TYPE));
 	STARPU_ASSERT(A_saved);
@@ -193,7 +188,7 @@ static double frobenius_norm(TYPE *v, unsigned n)
 	return sqrt(sum2);
 }
 
-static void pivot_saved_matrix(unsigned *ipiv)
+static void pivot_saved_matrix(unsigned *ipiv, TYPE *A_saved)
 {
 	unsigned k;
 	for (k = 0; k < lu_size; k++)
@@ -206,7 +201,7 @@ static void pivot_saved_matrix(unsigned *ipiv)
 	}
 }
 
-static void lu_check_result()
+static void lu_check_result(TYPE *A, TYPE *A_saved)
 {
 	unsigned i,j;
 	TYPE *L, *U;
@@ -265,18 +260,23 @@ static void lu_check_result()
 
 double run_lu(struct starpu_sched_ctx *sched_ctx, int argc, char **argv)
 {
-  printf("enter lu\n");
+	TYPE *A;
+	TYPE *A_saved;
+
+	/* in case we use non-strided blocks */
+	TYPE **A_blocks;
+
 	lu_parse_args(argc, argv);
 
 	//	starpu_init(NULL);
 
 	//	starpu_helper_cublas_init();
 
-	init_matrix();
+	init_matrix(&A);
 
 	unsigned *ipiv;
 	if (lu_check)
-	  save_matrix();
+	  save_matrix(A, A_saved);
 
 	display_matrix(A, lu_size, lu_size, "A");
 
@@ -295,11 +295,11 @@ double run_lu(struct starpu_sched_ctx *sched_ctx, int argc, char **argv)
 		{
 			/* in case the LU decomposition uses non-strided blocks, we _copy_ the matrix into smaller blocks */
 			A_blocks = malloc(lu_nblocks*lu_nblocks*sizeof(TYPE **));
-			copy_matrix_into_blocks();
+			copy_matrix_into_blocks(A, A_blocks);
 
 			gflops = STARPU_LU(lu_decomposition_pivot_no_stride)(A_blocks, ipiv, lu_size, lu_size, lu_nblocks, sched_ctx);
 
-			copy_blocks_into_matrix();
+			copy_blocks_into_matrix(A, A_blocks);
 			free(A_blocks);
 		}
 		else 
@@ -322,8 +322,7 @@ double run_lu(struct starpu_sched_ctx *sched_ctx, int argc, char **argv)
 	}
 	else
 	{
-	  gflops = STARPU_LU(lu_decomposition)(A, lu_size, lu_size, lu_nblocks, sched_ctx);
-			printf("no pivot \n");
+		gflops = STARPU_LU(lu_decomposition)(A, lu_size, lu_size, lu_nblocks, sched_ctx);
 	}
 
 	if (profile)
@@ -349,9 +348,9 @@ double run_lu(struct starpu_sched_ctx *sched_ctx, int argc, char **argv)
 	if (lu_check)
 	{
 		if (pivot)
-		  pivot_saved_matrix(ipiv);
+		  pivot_saved_matrix(ipiv, A_saved);
 
-		lu_check_result();
+		lu_check_result(A, A_saved);
 	}
 
 	//	starpu_helper_cublas_shutdown();
@@ -360,3 +359,5 @@ double run_lu(struct starpu_sched_ctx *sched_ctx, int argc, char **argv)
 
 	return gflops;
 }
+
+

+ 1 - 1
examples/cholesky_and_lu/lu/xlu.c

@@ -54,7 +54,7 @@ static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k)
 
 	struct starpu_task *task = create_task(TAG11(k));
 
-	task->cl = &cl11;
+	task->cl = &cl11;
 
 	/* which sub-data is manipulated ? */
 	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k);

+ 4 - 0
examples/cholesky_and_lu/lu/xlu.h

@@ -114,4 +114,8 @@ double STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsig
 double STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks, struct starpu_sched_ctx *sched_ctx);
 double STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks, struct starpu_sched_ctx *sched_ctx);
 
+/* double STARPU_LU(lu_decomposition_all_machine)(TYPE *matA, unsigned size, unsigned ld, unsigned nblocks); */
+/* double STARPU_LU(lu_decomposition_pivot_no_stride_all_machine)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks); */
+/* double STARPU_LU(lu_decomposition_pivot_all_machine)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks); */
+
 #endif // __XLU_H__

+ 21 - 7
examples/cholesky_and_lu/lu/xlu_implicit.c

@@ -34,7 +34,10 @@ static void create_task_11(starpu_data_handle dataA, unsigned k, struct starpu_s
 	if (!no_prio)
 		task->priority = STARPU_MAX_PRIO;
 
-	starpu_task_submit_to_ctx(task, sched_ctx);
+	if(sched_ctx == NULL)
+	  starpu_task_submit(task);
+	else
+	  starpu_task_submit_to_ctx(task, sched_ctx);
 }
 
 static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned j, struct starpu_sched_ctx *sched_ctx)
@@ -51,7 +54,10 @@ static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned j, str
 	if (!no_prio && (j == k+1))
 		task->priority = STARPU_MAX_PRIO;
 
-	starpu_task_submit_to_ctx(task, sched_ctx);
+	if(sched_ctx == NULL)
+	  starpu_task_submit(task);
+	else
+	  starpu_task_submit_to_ctx(task, sched_ctx);
 }
 
 static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned i, struct starpu_sched_ctx *sched_ctx)
@@ -69,7 +75,10 @@ static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned i, str
 	if (!no_prio && (i == k+1))
 		task->priority = STARPU_MAX_PRIO;
 
-	starpu_task_submit_to_ctx(task, sched_ctx);
+	if(sched_ctx == NULL)
+	  starpu_task_submit(task);
+	else
+	  starpu_task_submit_to_ctx(task, sched_ctx);
 }
 
 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, struct starpu_sched_ctx *sched_ctx)
@@ -89,7 +98,10 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
 	if (!no_prio &&  (i == k + 1) && (j == k +1) )
 		task->priority = STARPU_MAX_PRIO;
 
-	starpu_task_submit_to_ctx(task, sched_ctx);
+	if(sched_ctx == NULL)
+	  starpu_task_submit(task);
+	else
+	  starpu_task_submit_to_ctx(task, sched_ctx);
 }
 
 /*
@@ -108,7 +120,7 @@ static double dw_codelet_facto_v3(starpu_data_handle dataA, unsigned lu_nblocks,
 
 	for (k = 0; k < lu_nblocks; k++)
 	{
-	  create_task_11(dataA, k, sched_ctx);
+		create_task_11(dataA, k, sched_ctx);
 		
 		for (i = k+1; i<lu_nblocks; i++)
 		{
@@ -123,8 +135,10 @@ static double dw_codelet_facto_v3(starpu_data_handle dataA, unsigned lu_nblocks,
 	}
 
 	/* stall the application until the end of computations */
-	//starpu_task_wait_for_all();
-	starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
+	if(sched_ctx == NULL)
+		starpu_task_wait_for_all();
+	else
+		starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
 
 	gettimeofday(&end, NULL);
 

+ 1 - 0
examples/cholesky_and_lu/lu/xlu_kernels.c

@@ -61,6 +61,7 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 
 			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
+			printf("dx = %d dy = %d dz = %d ld21 = %d ld12= %d ld22 = %d\n");
 
 
 			CUBLAS_GEMM('n', 'n', dx, dy, dz,