Procházet zdrojové kódy

some magma tests with ctxs
--This line, and those below, will be ignored--

A magma_tests
A magma_tests/time_zpotrf_tile.c
A magma_tests/timing.c

Andra Hugo před 13 roky
rodič
revize
7b41dd31e0
2 změnil soubory, kde provedl 961 přidání a 0 odebrání
  1. 214 0
      magma_tests/time_zpotrf_tile.c
  2. 747 0
      magma_tests/timing.c

+ 214 - 0
magma_tests/time_zpotrf_tile.c

@@ -0,0 +1,214 @@
+/**
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+/*
+ * Precision-specific settings consumed by timing.c, which is included below
+ * as source so that it is compiled with these macros in effect.
+ *   _TYPE  : matrix element type
+ *   _PREC  : real type used for the dparam arrays and for the machine epsilon
+ *   _LAMCH : routine called as _LAMCH('e') in Test() to obtain that epsilon
+ */
+#define _TYPE  PLASMA_Complex64_t
+#define _PREC  double
+#define _LAMCH LAPACKE_dlamch_work
+
+/* Kernel name; only printed in the (currently disabled) gnuplot header. */
+#define _NAME  "PLASMA_zpotrf_Tile"
+/*
+ * Multiplication and addition counts of the factorization, see LAPACK Working
+ * Note (LAWN) 41, page 120.  Both macros read a variable named `n` from the
+ * scope in which they are expanded.
+ */
+#define _FMULS (n * (1.0 / 6.0 * n + 0.5) * n)
+#define _FADDS (n * (1.0 / 6.0 * n )      * n)
+
+/* The generic driver (main, Test, helpers) is pulled in as source. */
+#include "./timing.c"
+
+int first = 1;
+pthread_mutex_t mut;
+/*
+ * Thread entry point: runs one timed MAGMA_zpotrf_Tile call.
+ *
+ * p points to a `params` record.  Its uplo/descA fields describe the matrix,
+ * ctx is the scheduling context to run in (0 means none was created) and
+ * the_other_ctx is the context of the concurrently running factorization.
+ * The elapsed wall-clock time in seconds is stored in the record's t field.
+ *
+ * Returns p unchanged.
+ */
+void* start_Test(void *p)
+{
+	params *args = (params*)p;
+	PLASMA_enum uplo = args->uplo;
+	magma_desc_t *descA = args->descA;
+
+	/* Local copy: starpu_set_sched_ctx() is handed the address of ctx. */
+	unsigned ctx = args->ctx;
+	unsigned the_other_ctx = args->the_other_ctx;
+
+	if(ctx != 0)
+		starpu_set_sched_ctx(&ctx);
+
+	if(ctx == 1)
+	{
+		/* NOTE(review): busy loop that only prints a running sum before the
+		 * timed section starts; it looks like a hand-made start delay for
+		 * context 1 -- confirm it is still wanted. */
+		int i, j;
+		int sum = 0;
+		for(i = 0; i < 1000; i++)
+			for(j = 0; j < 100; j++)
+			{
+				sum += i;
+				printf("sum = %d\n", sum);
+			}
+	}
+
+	args->t = -cWtime();
+	MAGMA_zpotrf_Tile(uplo, descA);
+	args->t += cWtime();
+
+	/* main() only initialises the hypervisor when contexts are in use, so it
+	 * is only asked to stop resizing the other context in that case. */
+	if(ctx != 0)
+	{
+		printf("require stop resize\n");
+		sched_ctx_hypervisor_stop_resize(the_other_ctx);
+	}
+
+	/* NOTE(review): a disabled block used to follow here that called
+	 * starpu_delete_sched_ctx(ctx, the_other_ctx) once, guarded by `mut` and
+	 * `first`; the contexts are currently never deleted by this thread. */
+
+	return p;
+}
+
+
+/*
+ * Allocate count * elem_size bytes or terminate the benchmark.
+ * The multiplication is done in size_t and checked for overflow; a zero-sized
+ * request still yields a valid pointer that can be passed to free().
+ */
+static void *
+alloc_or_die(size_t count, size_t elem_size)
+{
+    void  *ptr = NULL;
+    size_t bytes;
+
+    if (elem_size == 0 || count <= ((size_t)-1) / elem_size) {
+        bytes = count * elem_size;
+        ptr = malloc(bytes ? bytes : 1);
+    }
+
+    /* Check if unable to allocate memory */
+    if ( !ptr ){
+        printf("Out of Memory \n ");
+        exit(EXIT_FAILURE);
+    }
+    return ptr;
+}
+
+/*
+ * Prepare one n x n problem.
+ *
+ * iparam : TIMING_* integer parameters (NB, CHECK, NO_CPU, PEAK, PROFILE are read)
+ * n      : matrix order (also used as leading dimension)
+ * A      : out; receives a LAPACK-layout copy of the matrix when checking is
+ *          enabled, NULL otherwise
+ * AT     : out; receives the tile storage that backs the returned descriptor
+ *
+ * Returns the descriptor created over *AT.  Both buffers and the descriptor
+ * are released by do_end_stuff().
+ */
+static magma_desc_t* do_start_stuff(int *iparam, int n, PLASMA_Complex64_t **A, PLASMA_Complex64_t **AT)
+{
+    magma_desc_t       *descA = NULL;
+    int nb    = iparam[TIMING_NB];
+    int check = iparam[TIMING_CHECK];
+    int nocpu = iparam[TIMING_NO_CPU];
+    int lda = n;
+
+    int peak_profiling = iparam[TIMING_PEAK];
+    int profiling      = iparam[TIMING_PROFILE];
+
+    /* Allocate Data */
+    *A  = NULL;
+    *AT = (PLASMA_Complex64_t *)alloc_or_die((size_t)lda * n, sizeof(PLASMA_Complex64_t));
+
+    /* Initialize Data */
+    MAGMA_Desc_Create(&descA, *AT, PlasmaComplexDouble, nb, nb, nb*nb, lda, n, 0, 0, n, n);
+    MAGMA_zplghe_Tile((double)n, descA, 51 );
+
+    /* Save AT in lapack layout for check */
+    if ( check ) {
+        *A = (PLASMA_Complex64_t *)alloc_or_die((size_t)lda * n, sizeof(PLASMA_Complex64_t));
+        MAGMA_zTile_to_Lapack( descA, (void*)*A, n);
+    }
+
+    if ( profiling | peak_profiling )
+        MAGMA_Enable( MAGMA_PROFILING_MODE );
+
+    if (nocpu)
+        morse_zlocality_allrestrict( MAGMA_CUDA );
+    return descA;
+}
+
+/*
+ * Tear down one problem created by do_start_stuff() and, when checking is
+ * enabled, solve against a random right-hand side and store the residual and
+ * norms in dparam[TIMING_RES / TIMING_ANORM / TIMING_BNORM / TIMING_XNORM].
+ *
+ * descA, A and AT are released here; A is only freed when checking is on,
+ * which is the only case in which do_start_stuff() allocates it.
+ */
+static void do_end_stuff(int *iparam, double *dparam, magma_desc_t *descA, int n, PLASMA_enum uplo,
+	PLASMA_Complex64_t *A, PLASMA_Complex64_t *AT)
+{
+    PLASMA_Complex64_t *b, *bT, *x;
+    magma_desc_t       *descB = NULL;
+    int nb    = iparam[TIMING_NB];   /* tile size for descB, same as in do_start_stuff() */
+    int nrhs  = iparam[TIMING_NRHS];
+    int check = iparam[TIMING_CHECK];
+    int nocpu = iparam[TIMING_NO_CPU];
+    int lda = n;
+    int ldb = n;
+
+    int peak_profiling = iparam[TIMING_PEAK];
+    int profiling      = iparam[TIMING_PROFILE];
+
+    if (nocpu)
+        morse_zlocality_allrestore();
+
+    if ( profiling | peak_profiling )
+        MAGMA_Disable( MAGMA_PROFILING_MODE );
+
+    /* Check the solution */
+    if ( check )
+      {
+        b  = (PLASMA_Complex64_t *)alloc_or_die((size_t)ldb * nrhs, sizeof(PLASMA_Complex64_t));
+        bT = (PLASMA_Complex64_t *)alloc_or_die((size_t)ldb * nrhs, sizeof(PLASMA_Complex64_t));
+        x  = (PLASMA_Complex64_t *)alloc_or_die((size_t)ldb * nrhs, sizeof(PLASMA_Complex64_t));
+
+        LAPACKE_zlarnv_work(1, ISEED, ldb*nrhs, bT);
+        MAGMA_Desc_Create(&descB, bT, PlasmaComplexDouble, nb, nb, nb*nb, ldb, nrhs, 0, 0, n, nrhs);
+        MAGMA_zTile_to_Lapack(descB, (void*)b, n);
+
+        MAGMA_zpotrs_Tile( uplo, descA, descB);
+        MAGMA_zTile_to_Lapack(descB, (void*)x, n);
+
+        dparam[TIMING_RES] = zcheck_solution(n, n, nrhs, A, lda, b, x, ldb,
+                                             &(dparam[TIMING_ANORM]), &(dparam[TIMING_BNORM]),
+                                             &(dparam[TIMING_XNORM]));
+        MAGMA_Desc_Destroy(&descB);
+        free( A );
+        free( b );
+        free( bT );
+        free( x );
+      }
+
+    MAGMA_Desc_Destroy(&descA);
+    free(AT);
+
+    if (peak_profiling) {
+        real_Double_t peak = 0;
+        /*estimate_zgemm_sustained_peak(&peak);*/
+        dparam[TIMING_ESTIMATED_PEAK] = (double)peak;
+    }
+
+    if (profiling)
+    {
+        /* Profiling of the scheduler */
+        morse_schedprofile_display();
+        /* Profile of each kernel */
+        morse_zdisplay_allprofile();
+    }
+}
+
+/*
+ * Run the two factorizations (orders iparam[TIMING_N] and iparam[TIMING_N2])
+ * concurrently, one per thread, through start_Test().
+ *
+ * The elapsed times are stored in t1[0] and t2[0]; check results go to the
+ * global dparam1 / dparam2 arrays.  The dparam and t_ arguments are part of
+ * the prototype declared in timing.c but are not used here.
+ *
+ * Always returns 0.
+ */
+static int
+RunTest(int *iparam, double *dparam, real_Double_t *t_)
+{
+	PLASMA_Complex64_t *A1 = NULL, *AT1 = NULL, *A2 = NULL, *AT2 = NULL;
+	int n1     = iparam[TIMING_N];
+	int n2     = iparam[TIMING_N2];
+	magma_desc_t       *descA1 = NULL;
+	magma_desc_t       *descA2 = NULL;
+	PLASMA_enum uplo1 = PlasmaLower;
+	PLASMA_enum uplo2 = PlasmaLower;
+	pthread_t tid[2];
+
+	(void)dparam;
+	(void)t_;
+
+	descA1 = do_start_stuff(iparam, n1, &A1, &AT1);
+	descA2 = do_start_stuff(iparam, n2, &A2, &AT2);
+
+	p1.uplo = uplo1;
+	p1.descA = descA1;
+
+	p2.uplo = uplo2;
+	p2.descA = descA2;
+
+	pthread_mutex_init(&mut, NULL);
+
+	pthread_create(&tid[0], NULL, start_Test, &p1);
+	pthread_create(&tid[1], NULL, start_Test, &p2);
+
+	/* The thread's return value is not needed; passing &p1 / &p2 here would
+	 * let pthread_join() overwrite the beginning of those records. */
+	pthread_join(tid[0], NULL);
+	pthread_join(tid[1], NULL);
+
+	pthread_mutex_destroy(&mut);
+
+	t1[0] = p1.t;
+	t2[0] = p2.t;
+
+	printf("t1 = %lf t2 = %lf \n", t1[0], t2[0]);
+
+	do_end_stuff(iparam, dparam1, descA1, n1, uplo1, A1, AT1);
+	do_end_stuff(iparam, dparam2, descA2, n2, uplo2, A2, AT2);
+	return 0;
+}

+ 747 - 0
magma_tests/timing.c

@@ -0,0 +1,747 @@
+/**
+ *
+ * @file timing.c
+ *
+ *  PLASMA auxiliary routines
+ *  PLASMA is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 2.3.1
+ * @author ???
+ * @author Mathieu Faverge
+ * @date 2010-11-15
+ *
+ **/
+
+/* Define these so that the Microsoft VC compiler stops complaining
+   about scanf and friends */
+#define _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_WARNINGS
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef PLASMA_EZTRACE
+#include <eztrace.h>
+#endif
+
+#if defined( _WIN32 ) || defined( _WIN64 )
+#include <windows.h>
+#include <time.h>
+#include <sys/timeb.h>
+#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
+#define DELTA_EPOCH_IN_MICROSECS  11644473600000000Ui64
+#else
+#define DELTA_EPOCH_IN_MICROSECS  11644473600000000ULL
+#endif
+
+struct timezone
+{
+    int  tz_minuteswest; /* minutes W of Greenwich */
+    int  tz_dsttime;     /* type of dst correction */
+};
+
+/*
+ * Windows replacement for gettimeofday().
+ *
+ * tv : when non-NULL, receives the current system time as seconds and
+ *      microseconds relative to the Unix epoch.
+ * tz : when non-NULL, receives the C runtime's time-zone offset (in minutes)
+ *      and daylight flag; _tzset() is called once, on first use.
+ *
+ * Always returns 0.
+ */
+int gettimeofday(struct timeval* tv, struct timezone* tz)
+{
+    static int tz_initialised;
+
+    if (tv != NULL)
+        {
+            FILETIME         now;
+            unsigned __int64 usec;
+
+            GetSystemTimeAsFileTime(&now);
+
+            /* Assemble the 64-bit file time from its two halves. */
+            usec = ((unsigned __int64)now.dwHighDateTime << 32) | now.dwLowDateTime;
+
+            /* Convert into microseconds, then shift to the Unix epoch. */
+            usec = usec / 10 - DELTA_EPOCH_IN_MICROSECS;
+
+            tv->tv_sec  = (long)(usec / 1000000UL);
+            tv->tv_usec = (long)(usec % 1000000UL);
+        }
+
+    if (tz != NULL)
+        {
+            if (tz_initialised == 0)
+                {
+                    _tzset();
+                    tz_initialised = 1;
+                }
+            tz->tz_minuteswest = _timezone / 60;
+            tz->tz_dsttime     = _daylight;
+        }
+
+    return 0;
+}
+
+#else  /* Non-Windows */
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
+#include <cblas.h>
+#include <lapacke.h>
+#include <plasma.h>
+#include <core_blas.h>
+#include <magma_morse.h>
+#include <sched_ctx_hypervisor.h>
+#include "timing.h"
+#include "auxiliary.h"
+#include <pthread.h>
+
+static int RunTest(int *iparam, _PREC *dparam, double *t_);
+
+double cWtime(void);
+
+int ISEED[4] = {0,0,0,1};   /* initial seed for zlarnv() */
+
+/*
+ * Wall-clock time in seconds, with microsecond resolution, as reported by
+ * gettimeofday():
+ *   struct timeval {time_t tv_sec; suseconds_t tv_usec;};
+ */
+double cWtime(void)
+{
+    struct timeval now;
+
+    gettimeofday( &now, NULL );
+
+    /* whole seconds plus the microsecond part scaled to seconds */
+    return now.tv_sec + 1e-6 * now.tv_usec;
+}
+
+double       *t1, *t2;
+_PREC         dparam1[TIMING_DNBPARAM];
+_PREC         dparam2[TIMING_DNBPARAM];
+
+/*
+ * Run the paired benchmark and print its results.
+ *
+ * When n or iparam[TIMING_THRDNBR] is negative only the column header (and,
+ * if enabled, the gnuplot preamble) is printed.  Otherwise RunTest() is
+ * executed iparam[TIMING_NITER] times (plus one warm-up run if requested) and
+ * one result line is printed for each of the two factorizations: average
+ * time, average Gflop/s, its deviation and, optionally, bound / peak / check
+ * columns taken from dparam1 and dparam2.
+ *
+ * n is overwritten with iparam[TIMING_N] and then iparam[TIMING_N2] because
+ * the _FADDS / _FMULS cost macros read a variable called `n`.
+ *
+ * Returns 0 on success, -1 if the timing arrays cannot be allocated.
+ */
+static int
+Test(int64_t n, int *iparam) {
+    int           i, j, iter, m;
+    int thrdnbr, niter, nrhs;
+    int nslots;
+    double       *t;
+    _PREC         eps = _LAMCH( 'e' );
+    _PREC         dparam[TIMING_DNBPARAM];
+    double        flops, fmuls, fadds, fp_per_mul, fp_per_add;
+    double        sumgf, sumgf2, sumt, sd, gflops;
+    double        flops_2, fmuls_2, fadds_2;
+    double        sumgf_2, sumgf2_2, sumt_2, sd_2, gflops_2;
+
+    char         *s;
+    char         *env[] = {
+        "OMP_NUM_THREADS",
+        "MKL_NUM_THREADS",
+        "GOTO_NUM_THREADS",
+        "ACML_NUM_THREADS",
+        "ATLAS_NUM_THREADS",
+        "BLAS_NUM_THREADS", ""
+    };
+    int gnuplot = 0;
+
+    thrdnbr = iparam[TIMING_THRDNBR];
+    niter   = iparam[TIMING_NITER];
+    nrhs    = iparam[TIMING_NRHS];
+
+    if (n < 0 || thrdnbr < 0) {
+        const char *bound_header = iparam[TIMING_BOUND] ? " thGflop/s" : "";
+        const char *check_header = iparam[TIMING_CHECK] ? "   ||Ax-b||       ||A||       ||x||       ||b||         eps ||Ax-b||/N/eps/(||A||||x||+||b||)" : "";
+        const char *peak_header = iparam[TIMING_PEAK] ? "    (% of peak)  peak" : "";
+
+        printf( "#   N NRHS threads seconds   Gflop/s Deviation        %s%s%s\n", bound_header, peak_header, check_header);
+
+        if (gnuplot) {
+            printf( "set title '%d_NUM_THREADS: ", thrdnbr );
+            for (i = 0; env[i][0]; ++i) {
+                s = getenv( env[i] );
+
+                if (i) printf( " " ); /* separating space */
+
+                for (j = 0; j < 5 && env[i][j] && env[i][j] != '_'; ++j)
+                    printf( "%c", env[i][j] );
+
+                if (s)
+                    printf( "=%s", s );
+                else
+                    printf( "->%s", "?" );
+            }
+            printf( "'\n" );
+            printf( "%s\n%s\n%s\n%s\n%s%s%s\n",
+                    "set xlabel 'Matrix size'",
+                    "set ylabel 'Gflop/s'",
+                    "set key bottom",
+                    gnuplot > 1 ? "set terminal png giant\nset output 'timeplot.png'" : "",
+                    "plot '-' using 1:5 title '", _NAME, "' with linespoints" );
+        }
+
+        return 0;
+    }
+
+    printf( "%5d %4d %5d ", iparam[TIMING_N], iparam[TIMING_NRHS], iparam[TIMING_THRDNBR] );
+    printf( "%5d %4d %5d ", iparam[TIMING_N2], iparam[TIMING_NRHS], iparam[TIMING_THRDNBR] );
+    fflush( stdout );
+
+    /* RunTest() always writes slot 0 of t1/t2 (also during warm-up), so keep
+     * at least one zero-initialised slot in each array. */
+    nslots = niter > 0 ? niter : 1;
+    t  = (double*)calloc(nslots, sizeof(double));
+    t1 = (double*)calloc(nslots, sizeof(double));
+    t2 = (double*)calloc(nslots, sizeof(double));
+    if (!t || !t1 || !t2) {
+        printf("Out of Memory \n ");
+        free(t);
+        free(t1);
+        free(t2);
+        t1 = NULL;
+        t2 = NULL;
+        return -1;
+    }
+
+    if (sizeof(_TYPE) == sizeof(_PREC)) {
+        fp_per_mul = 1;
+        fp_per_add = 1;
+    } else {
+        fp_per_mul = 6;
+        fp_per_add = 2;
+    }
+
+    /* m and nrhs are assigned but not read below; presumably they exist for
+     * cost macros of other kernels that reference them -- TODO confirm. */
+    m = iparam[TIMING_M];
+    n = iparam[TIMING_N];
+    fadds = _FADDS;
+    fmuls = _FMULS;
+    flops = fmuls * fp_per_mul + fadds * fp_per_add;
+    gflops = 0.0;
+
+    m = iparam[TIMING_M2];
+    n = iparam[TIMING_N2];
+    fadds_2 = _FADDS;
+    fmuls_2 = _FMULS;
+    flops_2 = fmuls_2 * fp_per_mul + fadds_2 * fp_per_add;
+    gflops_2 = 0.0;
+
+    if ( iparam[TIMING_WARMUP] ) {
+        RunTest( iparam, dparam, &(t[0]));
+    }
+
+    sumgf  = 0.0;
+    double sumgf_upper  = 0.0;
+    sumgf2 = 0.0;
+    sumt   = 0.0;
+
+    sumgf_2  = 0.0;
+    double sumgf_upper_2  = 0.0;
+    sumgf2_2 = 0.0;
+    sumt_2   = 0.0;
+
+    for (iter = 0; iter < niter; iter++)
+    {
+
+#ifdef PLASMA_EZTRACE
+        if( iter == 0 ) {
+            eztrace_start();
+            RunTest( iparam, dparam, &(t[iter]));
+            eztrace_stop();
+        }
+        else
+#endif
+            RunTest( iparam, dparam, &(t[iter]));
+
+        /* RunTest() reports its timings in slot 0; file them under the
+         * current iteration so the statistics below read real values. */
+        t1[iter] = t1[0];
+        t2[iter] = t2[0];
+
+        double tmin = 0.0;
+        double integer_tmin = 0.0;
+        double upper_gflops = 0.0;
+
+        double tmin_2 = 0.0;
+        double integer_tmin_2 = 0.0;
+        double upper_gflops_2 = 0.0;
+
+#if 0
+        if (iparam[TIMING_BOUND])
+        {
+            if (iparam[TIMING_BOUNDDEPS]) {
+                FILE *out = fopen("bounddeps.pl", "w");
+                starpu_bound_print_lp(out);
+                fclose(out);
+                out = fopen("bound.dot", "w");
+                starpu_bound_print_dot(out);
+                fclose(out);
+            } else {
+#if 0
+                FILE *out = fopen("bound.pl", "w");
+                starpu_bound_print_lp(out);
+                fclose(out);
+#endif
+                starpu_bound_compute(&tmin, &integer_tmin, 0);
+                upper_gflops  = ((1e-6 * flops) / tmin);
+                starpu_bound_compute(&tmin_2, &integer_tmin_2, 0);
+                upper_gflops_2  = ((1e-6 * flops_2) / tmin_2);
+
+            }
+        }
+#endif
+        printf("t1 = %lf t2 = %lf \n", t1[0], t2[0]);
+        gflops  = (1e-9 * flops) / t1[iter];
+        sumt   += t1[iter];
+        sumgf_upper += upper_gflops;
+        sumgf  += gflops;
+        sumgf2 += gflops*gflops;
+
+        gflops_2  = (1e-9 * flops_2) / t2[iter];
+        sumt_2   += t2[iter];
+        sumgf_upper_2 += upper_gflops_2;
+        sumgf_2  += gflops_2;
+        sumgf2_2 += gflops_2*gflops_2;
+
+    }
+
+    gflops = sumgf / niter;
+    sd = sqrt((sumgf2 - (sumgf*sumgf)/niter)/niter);
+
+    gflops_2 = sumgf_2 / niter;
+    sd_2 = sqrt((sumgf2_2 - (sumgf_2*sumgf_2)/niter)/niter);
+
+    /* Result line of the first factorization (order iparam[TIMING_N]). */
+    printf( "%9.3f %9.2f +-%7.2f  ", sumt/niter, gflops, sd);
+
+    if (iparam[TIMING_BOUND] && !iparam[TIMING_BOUNDDEPS])
+        printf(" %9.2f",  sumgf_upper/niter);
+
+    if ( iparam[TIMING_PEAK] )
+    {
+       if (dparam1[TIMING_ESTIMATED_PEAK]<0.0f)
+         printf("  n/a    n/a   ");
+       else
+         printf("  %2.2f%%  %9.2f ", 100.0f*(gflops/dparam1[TIMING_ESTIMATED_PEAK]), dparam1[TIMING_ESTIMATED_PEAK]);
+    }
+
+    /* The residual is normalised by this problem's own order; `n` holds the
+     * order of the second problem at this point. */
+    if ( iparam[TIMING_CHECK] )
+        printf( "%8.5e %8.5e %8.5e %8.5e %8.5e %8.5e",
+                dparam1[TIMING_RES], dparam1[TIMING_ANORM], dparam1[TIMING_XNORM], dparam1[TIMING_BNORM], eps,
+                dparam1[TIMING_RES] / iparam[TIMING_N] / eps / (dparam1[TIMING_ANORM] * dparam1[TIMING_XNORM] + dparam1[TIMING_BNORM] ));
+    printf("\n");
+
+    /* Result line of the second factorization (order iparam[TIMING_N2]). */
+    printf( "%9.3f %9.2f +-%7.2f  ", sumt_2/niter, gflops_2, sd_2);
+
+    if (iparam[TIMING_BOUND] && !iparam[TIMING_BOUNDDEPS])
+        printf(" %9.2f",  sumgf_upper_2/niter);
+
+    if ( iparam[TIMING_PEAK] )
+    {
+       if (dparam2[TIMING_ESTIMATED_PEAK]<0.0f)
+         printf("  n/a    n/a   ");
+       else
+         printf("  %2.2f%%  %9.2f ", 100.0f*(gflops_2/dparam2[TIMING_ESTIMATED_PEAK]), dparam2[TIMING_ESTIMATED_PEAK]);
+    }
+
+    if ( iparam[TIMING_CHECK] )
+        printf( "%8.5e %8.5e %8.5e %8.5e %8.5e %8.5e",
+                dparam2[TIMING_RES], dparam2[TIMING_ANORM], dparam2[TIMING_XNORM], dparam2[TIMING_BNORM], eps,
+                dparam2[TIMING_RES] / iparam[TIMING_N2] / eps / (dparam2[TIMING_ANORM] * dparam2[TIMING_XNORM] + dparam2[TIMING_BNORM] ));
+    printf("\n");
+
+    fflush( stdout );
+    free(t);
+    free(t1);
+    free(t2);
+
+    return 0;
+}
+
+/* Return 1 when s begins with prefix (an empty prefix always matches), 0 otherwise. */
+static int
+startswith(const char *s, const char *prefix) {
+    return strncmp( s, prefix, strlen( prefix ) ) == 0 ? 1 : 0;
+}
+
+/*
+ * Copy the len-character field starting at src into dst, truncated to cap
+ * characters, and NUL-terminate it.  Returns the number of characters copied.
+ */
+static int
+copy_range_field(char *dst, int cap, const char *src, int len) {
+    int count = len > cap ? cap : len;
+
+    memcpy( dst, src, (size_t)count );
+    dst[count] = 0;
+    return count;
+}
+
+/*
+ * Parse a range given as "N", "Start:Stop" or "Start:Stop:Step".
+ *
+ *  - "N"               : start = N, step = max(N / 10, 1), stop = start + 10 * step
+ *  - "Start:Stop"      : step = max((Stop - Start) / 10, 1)
+ *  - "Start:Stop:Step" : an empty Start or Stop field is taken as 0
+ *
+ * On success the three values are stored through start_p, stop_p and step_p
+ * and 0 is returned.  On any parse or validation error -1 is returned and the
+ * outputs are left untouched.
+ */
+static int
+get_range(char *range, int *start_p, int *stop_p, int *step_p) {
+    char  field[21];
+    const int field_cap = 20;
+    char *cursor, *first_colon, *second_colon;
+    int   ncolons = 0;
+    int   start = 1000, stop = 10000, step = 1000;
+
+    for (cursor = strchr( range, ':' ); cursor != NULL; cursor = strchr( cursor + 1, ':' ))
+        ++ncolons;
+
+    switch (ncolons) {
+    case 0: /* a single number */
+        if (sscanf( range, "%d", &start ) < 1 || start < 1)
+            return -1;
+        step = start / 10;
+        if (step < 1)
+            step = 1;
+        stop = start + 10 * step;
+        break;
+
+    case 1: /* Start:Stop */
+        first_colon = strchr( range, ':' );
+
+        /* The stop value is read first so that start can be checked against it. */
+        if (sscanf( first_colon + 1, "%d", &stop ) < 1 || stop < 1)
+            return -1;
+
+        copy_range_field( field, field_cap, range, (int)(first_colon - range) );
+        if (sscanf( field, "%d", &start ) < 1 || start > stop || start < 1)
+            return -1;
+
+        /* Let's have 10 steps or less. */
+        step = (stop - start) / 10;
+        if (step < 1)
+            step = 1;
+        break;
+
+    case 2: /* Start:Stop:Step */
+        first_colon = strchr( range, ':' );
+        if (copy_range_field( field, field_cap, range, (int)(first_colon - range) ) == 0)
+            start = 0;
+        else if (sscanf( field, "%d", &start ) < 1 || start < 1)
+            return -1;
+
+        second_colon = strchr( first_colon + 1, ':' );
+        if (copy_range_field( field, field_cap, first_colon + 1,
+                              (int)(second_colon - (first_colon + 1)) ) == 0)
+            stop = 0;
+        else if (sscanf( field, "%d", &stop ) < 1 || stop < start)
+            return -1;
+
+        if (sscanf( second_colon + 1, "%d", &step ) < 1 || step < 1)
+            return -1;
+        break;
+
+    default: /* more than two colons */
+        return -1;
+    }
+
+    *start_p = start;
+    *stop_p  = stop;
+    *step_p  = step;
+
+    return 0;
+}
+
+/*
+ * Print the command-line usage of the benchmark to stdout.
+ *
+ * The option list and the defaults shown here mirror the option parser and
+ * the iparam initialisation in main(); keep the two in sync.
+ */
+static void
+show_help(char *prog_name) {
+    printf( "Usage:\n%s [options]\n\n", prog_name );
+    printf( "Options are:\n" );
+    printf( "  --threads=C    Number of threads (default: number of processors)\n" );
+    printf( "  --n_range=R    Range of N values: Start:Stop:Step (parsed but currently unused, see --n_range1/--n_range2)\n" );
+    printf( "  --n_range1=R   Range of N values of the first factorization; only Start is used (default: 500:5000:500)\n" );
+    printf( "  --n_range2=R   Range of N values of the second factorization; only Start is used (default: 500:5000:500)\n" );
+    printf( "  --n_cpus1=R    Range of CPU worker ids of the first context: Start:Stop:Step (default: none)\n" );
+    printf( "  --n_cpus2=R    Range of CPU worker ids of the second context: Start:Stop:Step (default: none)\n" );
+    printf( "  --n_gpus1=R    Range of GPU worker ids of the first context: Start:Stop:Step (default: none)\n" );
+    printf( "  --n_gpus2=R    Range of GPU worker ids of the second context: Start:Stop:Step (default: none)\n" );
+    printf( "  --noctxs       Do not create scheduling contexts (default: contexts are used)\n" );
+    //    printf( "  --gnuplot      produce output suitable for gnuplot" );
+    printf( "  --[no]check    Check result (default: nocheck)\n" );
+    printf( "  --[no]warmup   Perform a warmup run to pre-load libraries (default: warmup)\n");
+    printf( "  --parallel=N   Use parallel tasks of size N (default: no)\n");
+    printf( "  --noparallel   Do not use parallel tasks\n");
+    printf( "  --nocpu        Restrict the computation to CUDA workers (default: no)\n");
+    printf( "  --niter=N      Number of iterations (default: 1)\n");
+    printf( "  --nb=N         Nb size. Not used if autotuning is activated (default: 128)\n");
+    printf( "  --ib=N         IB size. Not used if autotuning is activated (default: 32)\n");
+    printf( "  --m=N          Value stored as M (default: same as N)\n");
+    printf( "  --nrhs=N       Number of right-hand sides (default: 1)\n");
+    printf( "  --ndom=N       Value stored as NDOM (default: 1)\n");
+    printf( "  --[no]dyn      Activate Dynamic scheduling (default: nodyn)\n");
+    printf( "  --[no]atun     Activate autotuning (default: noatun)\n");
+    printf( "  --ifmt         Input format. 0: CM, 1: CCRB, 2: CRRB, 3: RCRB, 4: RRRB, 5: RM (default: 0)\n");
+    printf( "  --ofmt         Output format. 0: CM, 1: CCRB, 2: CRRB, 3: RCRB, 4: RRRB, 5: RM (default: 0)\n");
+    printf( "  --thrdbypb     Number of threads per subproblem for inplace transformation (default: 1)\n");
+    printf( "  --[no]profile  Profile kernels with StarPU (default: no)\n");
+    printf( "  --peak         Evaluate sustained peak performance (default: no)\n");
+    printf( "  --bound        Print the theoretical bound column (default: no)\n");
+    printf( "  --bounddeps    Same as --bound, with dependencies (default: no)\n");
+    printf( "  --bounddepsprio Same as --bounddeps, with priorities (default: no)\n");
+}
+/*
+ * Store the number of available processors in *thrdnbr.
+ *
+ * Windows: read from the NUMBER_OF_PROCESSORS environment variable.
+ * Elsewhere: sysconf(_SC_NPROCESSORS_ONLN).
+ *
+ * If the value cannot be determined (variable missing or unparsable, sysconf
+ * failing) *thrdnbr is left unchanged, so the caller's default stays in
+ * effect.  The platform test matches the one that selects the headers at the
+ * top of this file.
+ */
+static void
+get_thread_count(int *thrdnbr) {
+#if defined( _WIN32 ) || defined( _WIN64 )
+    const char *nproc = getenv( "NUMBER_OF_PROCESSORS" );
+    int         count = 0;
+
+    if (nproc != NULL && sscanf( nproc, "%d", &count ) == 1 && count > 0)
+        *thrdnbr = count;
+#else
+    long count = sysconf(_SC_NPROCESSORS_ONLN);
+
+    /* sysconf() reports failure as -1; a negative thread count would make
+     * Test() print only the header. */
+    if (count > 0)
+        *thrdnbr = (int)count;
+#endif
+}
+
+/* Argument record handed to each start_Test() thread; p1 and p2 are the two instances. */
+typedef struct {
+        /* triangle selector passed to MAGMA_zpotrf_Tile */
+        PLASMA_enum uplo;
+        /* descriptor of the matrix to factorize */
+        magma_desc_t *descA;
+        /* scheduling context of this thread; main() sets 0 when contexts are disabled */
+        unsigned ctx;
+        /* context of the other thread, passed to sched_ctx_hypervisor_stop_resize */
+        unsigned the_other_ctx;
+        /* output: elapsed cWtime() seconds around the factorization call */
+	real_Double_t t;
+} params;
+
+/*
+ * Operation count used as the workload estimate of one factorization of
+ * order n.  Multiplications and additions are weighted 1/1 when _TYPE and
+ * _PREC have the same size and 6/2 otherwise.  The parameter m is not used.
+ */
+double compute_flops(int n, int m)
+{
+	const int same_size = (sizeof(_TYPE) == sizeof(_PREC));
+	const double mul_weight = same_size ? 1 : 6;
+	const double add_weight = same_size ? 1 : 2;
+
+	const double mul_count = (n * (1.0 / 6.0 * n + 0.5) * n);
+	const double add_count = (n * (1.0 / 6.0 * n ) * n);
+
+	return mul_count * mul_weight + add_count * add_weight;
+}
+params p1, p2;
+/* Number of values visited by `for (v = start; v <= stop; v += step)`. */
+static int
+count_range_values(int start, int stop, int step) {
+    if (step < 1 || stop < start)
+        return 0;
+    return (stop - start) / step + 1;
+}
+
+/*
+ * Store start, start+step, ... (<= stop) into ids[] beginning at index pos,
+ * echoing each value to stdout.  Returns the next free index.
+ */
+static int
+append_worker_ids(int *ids, int pos, int start, int stop, int step) {
+    int v;
+
+    if (step < 1)
+        return pos;
+    for (v = start; v <= stop; v += step) {
+        printf("%d ", v);
+        ids[pos++] = v;
+    }
+    return pos;
+}
+
+/*
+ * Parse the range that follows '=' in option.  On a malformed range the
+ * problem is reported on stderr and the previous values stay in effect.
+ */
+static void
+parse_range_option(char *option, int *start, int *stop, int *step) {
+    if (get_range( strchr( option, '=' ) + 1, start, stop, step ) != 0)
+        fprintf( stderr, "Invalid range in option: %s\n", option );
+}
+
+/*
+ * Benchmark driver: parses the command line into iparam, initialises MAGMA,
+ * optionally creates two scheduling contexts managed by the hypervisor, then
+ * prints the header and runs Test() once for the sizes start1 / start2.
+ */
+int
+main(int argc, char *argv[]) {
+    int i;
+    int start =  500;
+    int stop  = 5000;
+    int step  =  500;
+
+    int start1 =  500;
+    int stop1  = 5000;
+    int step1  =  500;
+
+    int start2 =  500;
+    int stop2  = 5000;
+    int step2  =  500;
+
+    /* Worker-id ranges of both contexts; stop < start means "no workers". */
+    int start_cpus1 =  0, start_cpus2 = 0, start_gpus1 = 0, start_gpus2 = 0;
+    int stop_cpus1  = -1, stop_cpus2  = -1, stop_gpus1 = -1, stop_gpus2 = -1;
+    int step_cpus1  =  1, step_cpus2 = 1, step_gpus1 = 1, step_gpus2 = 1;
+
+    int iparam[TIMING_INBPARAM];
+
+    memset(iparam, 0, TIMING_INBPARAM*sizeof(int));
+
+    iparam[TIMING_CHECK         ] = 0;
+    iparam[TIMING_WARMUP        ] = 1;
+    iparam[TIMING_NITER         ] = 1;
+    iparam[TIMING_N             ] = 500;
+    iparam[TIMING_N2            ] = 500;
+    iparam[TIMING_NB            ] = 128;
+    iparam[TIMING_IB            ] = 32;
+    iparam[TIMING_NRHS          ] = 1;
+    iparam[TIMING_THRDNBR       ] = 1;
+    iparam[TIMING_NCUDAS        ] = 0;
+    iparam[TIMING_THRDNBR_SUBGRP] = 1;
+    iparam[TIMING_SCHEDULER     ] = 0;
+    iparam[TIMING_AUTOTUNING    ] = 1;
+    iparam[TIMING_INPUTFMT      ] = 0;
+    iparam[TIMING_OUTPUTFMT     ] = 0;
+    iparam[TIMING_NDOM          ] = 1;
+    iparam[TIMING_PROFILE       ] = 0;
+    iparam[TIMING_PEAK          ] = 0;
+    iparam[TIMING_PARALLEL_TASKS] = 0;
+    iparam[TIMING_NO_CPU        ] = 0;
+    iparam[TIMING_BOUND         ] = 0;
+    iparam[TIMING_BOUNDDEPS     ] = 0;
+    iparam[TIMING_BOUNDDEPSPRIO ] = 0;
+    iparam[TIMING_WITH_CTXS     ] = 1;
+
+    get_thread_count( &(iparam[TIMING_THRDNBR]) );
+
+    /* Options are matched by prefix, so longer names must be tested before
+     * names they start with (e.g. --bounddepsprio before --bounddeps). */
+    for (i = 1; i < argc && argv[i]; ++i) {
+        if (startswith( argv[i], "--help" )) {
+            show_help( argv[0] );
+            return EXIT_SUCCESS;
+        } else if (startswith( argv[i], "--n_cpus1=" )) {
+            parse_range_option( argv[i], &start_cpus1, &stop_cpus1, &step_cpus1 );
+        } else if (startswith( argv[i], "--n_cpus2=" )) {
+            parse_range_option( argv[i], &start_cpus2, &stop_cpus2, &step_cpus2 );
+        } else if (startswith( argv[i], "--n_gpus1=" )) {
+            parse_range_option( argv[i], &start_gpus1, &stop_gpus1, &step_gpus1 );
+        } else if (startswith( argv[i], "--n_gpus2=" )) {
+            parse_range_option( argv[i], &start_gpus2, &stop_gpus2, &step_gpus2 );
+        } else if (startswith( argv[i], "--n_range=" )) {
+            parse_range_option( argv[i], &start, &stop, &step );
+        } else if (startswith( argv[i], "--n_range1=" )) {
+            parse_range_option( argv[i], &start1, &stop1, &step1 );
+        } else if (startswith( argv[i], "--n_range2=" )) {
+            parse_range_option( argv[i], &start2, &stop2, &step2 );
+        } else if (startswith( argv[i], "--threads=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_THRDNBR]) );
+        /* } else if (startswith( argv[i], "--gnuplot-png" )) { */
+        /*     gnuplot = 2; */
+        /* } else if (startswith( argv[i], "--gnuplot" )) { */
+        /*     gnuplot = 1; */
+        } else if (startswith( argv[i], "--noctxs" )) {
+            iparam[TIMING_WITH_CTXS] = 0;
+        } else if (startswith( argv[i], "--check" )) {
+            iparam[TIMING_CHECK] = 1;
+        } else if (startswith( argv[i], "--nocheck" )) {
+            iparam[TIMING_CHECK] = 0;
+        } else if (startswith( argv[i], "--warmup" )) {
+            iparam[TIMING_WARMUP] = 1;
+        } else if (startswith( argv[i], "--nowarmup" )) {
+            iparam[TIMING_WARMUP] = 0;
+        } else if (startswith( argv[i], "--dyn" )) {
+            iparam[TIMING_SCHEDULER] = 1;
+        } else if (startswith( argv[i], "--nodyn" )) {
+            iparam[TIMING_SCHEDULER] = 0;
+        } else if (startswith( argv[i], "--atun" )) {
+            iparam[TIMING_AUTOTUNING] = 1;
+        } else if (startswith( argv[i], "--noatun" )) {
+            iparam[TIMING_AUTOTUNING] = 0;
+        } else if (startswith( argv[i], "--profile" )) {
+            iparam[TIMING_PROFILE] = 1;
+        } else if (startswith( argv[i], "--peak" )) {
+            iparam[TIMING_PEAK] = 1;
+        } else if (startswith( argv[i], "--noprofile" )) {
+            iparam[TIMING_PROFILE] = 0;
+        } else if (startswith( argv[i], "--parallel=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_PARALLEL_TASKS]) );
+        } else if (startswith( argv[i], "--noparallel" )) {
+            iparam[TIMING_PARALLEL_TASKS] = 0;
+        } else if (startswith( argv[i], "--nocpu" )) {
+            iparam[TIMING_NO_CPU] = 1;
+        } else if (startswith( argv[i], "--nb=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_NB]) );
+        } else if (startswith( argv[i], "--m=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_M]) );
+        } else if (startswith( argv[i], "--ib=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_IB]) );
+        } else if (startswith( argv[i], "--nrhs=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_NRHS]) );
+        } else if (startswith( argv[i], "--ifmt=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_INPUTFMT]) );
+        } else if (startswith( argv[i], "--ofmt=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_OUTPUTFMT]) );
+        } else if (startswith( argv[i], "--thrdbypb=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_THRDNBR_SUBGRP]) );
+        } else if (startswith( argv[i], "--niter=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &iparam[TIMING_NITER] );
+        } else if (startswith( argv[i], "--ndom=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &iparam[TIMING_NDOM] );
+        } else if (startswith( argv[i], "--bounddepsprio" )) {
+                iparam[TIMING_BOUND] = 1;
+                iparam[TIMING_BOUNDDEPS] = 1;
+                iparam[TIMING_BOUNDDEPSPRIO] = 1;
+        } else if (startswith( argv[i], "--bounddeps" )) {
+                iparam[TIMING_BOUND] = 1;
+                iparam[TIMING_BOUNDDEPS] = 1;
+        } else if (startswith( argv[i], "--bound" )) {
+                iparam[TIMING_BOUND] = 1;
+        } else {
+            fprintf( stderr, "Unknown option: %s\n", argv[i] );
+        }
+    }
+    if (step < 1) step = 1;
+    if (step1 < 1) step1 = 1;
+    if (step2 < 1) step2 = 1;
+
+    /* TODO : correct into plasma */
+    if ( iparam[TIMING_IB] > iparam[TIMING_NB] )
+      iparam[TIMING_IB] = iparam[TIMING_NB];
+
+    /* TODO */
+    if (iparam[TIMING_PARALLEL_TASKS]) {
+        MAGMA_InitPar(iparam[TIMING_THRDNBR]/iparam[TIMING_PARALLEL_TASKS],
+                      iparam[TIMING_NCUDAS],
+                      iparam[TIMING_PARALLEL_TASKS]);
+    }
+    else {
+        MAGMA_Init( iparam[TIMING_THRDNBR],
+                    iparam[TIMING_NCUDAS]);
+    }
+
+    MAGMA_Disable(MAGMA_AUTOTUNING);
+    MAGMA_Set(MAGMA_TILE_SIZE,        iparam[TIMING_NB] );
+    MAGMA_Set(MAGMA_INNER_BLOCK_SIZE, iparam[TIMING_IB] );
+
+    if(iparam[TIMING_WITH_CTXS])
+    {
+        /* Count exactly the ids the fill loops below produce, so the arrays
+         * can never be overrun when a step larger than 1 is given. */
+        int nprocs1 = count_range_values(start_gpus1, stop_gpus1, step_gpus1)
+                    + count_range_values(start_cpus1, stop_cpus1, step_cpus1);
+        int nprocs2 = count_range_values(start_gpus2, stop_gpus2, step_gpus2)
+                    + count_range_values(start_cpus2, stop_cpus2, step_cpus2);
+        /* A variable-length array must have at least one element. */
+        int procs1[nprocs1 > 0 ? nprocs1 : 1];
+        int procs2[nprocs2 > 0 ? nprocs2 : 1];
+        int filled, k;
+
+        /* GPU worker ids come first, then CPU worker ids. */
+        printf("%d: ", nprocs1);
+        filled = append_worker_ids(procs1, 0, start_gpus1, stop_gpus1, step_gpus1);
+        filled = append_worker_ids(procs1, filled, start_cpus1, stop_cpus1, step_cpus1);
+        printf("\n");
+
+        printf("%d: ", nprocs2);
+        filled = append_worker_ids(procs2, 0, start_gpus2, stop_gpus2, step_gpus2);
+        filled = append_worker_ids(procs2, filled, start_cpus2, stop_cpus2, step_cpus2);
+        printf("\n");
+        (void)filled;
+
+        /* NOTE(review): the address of the pointer returned by
+         * sched_ctx_hypervisor_init() is passed below, unchanged from the
+         * original code -- confirm against the StarPU prototype. */
+        struct starpu_sched_ctx_hypervisor_criteria *criteria = sched_ctx_hypervisor_init(SIMPLE_POLICY);
+        p1.ctx = starpu_create_sched_ctx_with_criteria("heft", procs1, nprocs1, "sched_ctx1", &criteria);
+
+        p2.ctx = starpu_create_sched_ctx_with_criteria("heft", procs2, nprocs2, "sched_ctx2", &criteria);
+
+/*      p1.ctx = starpu_create_sched_ctx("heft", procs1, nprocs1, "sched_ctx1"); */
+
+/*      p2.ctx = starpu_create_sched_ctx("heft", procs2, nprocs2, "sched_ctx2"); */
+
+        double flops1 = compute_flops(start1, start1);
+        double flops2 = compute_flops(start2, start2);
+        printf("flops1 = %lf flops2 = %lf\n", flops1, flops2);
+        sched_ctx_hypervisor_handle_ctx(p1.ctx, flops1);
+
+        sched_ctx_hypervisor_handle_ctx(p2.ctx, flops2);
+
+        p1.the_other_ctx = p2.ctx;
+        p2.the_other_ctx = p1.ctx;
+
+        int gpus[3];
+        for(k = 0; k < 3; k++)
+            gpus[k] = k;
+
+        sched_ctx_hypervisor_ioctl(p1.ctx,
+                                   HYPERVISOR_GRANULARITY, 2,
+                                   HYPERVISOR_MIN_TASKS, 10,
+                                   HYPERVISOR_MIN_WORKERS, 3,
+                                   HYPERVISOR_MAX_WORKERS, 12,
+                                   HYPERVISOR_FIXED_WORKERS, gpus, 3,
+//                                 HYPERVISOR_MAX_IDLE, procs, 12, 40000.0,
+//                                 HYPERVISOR_MAX_IDLE, gpus, 3, 10000.0,
+                                   NULL);
+
+        sched_ctx_hypervisor_ioctl(p2.ctx,
+                                   HYPERVISOR_GRANULARITY, 2,
+                                   HYPERVISOR_MIN_TASKS, 10,
+                                   HYPERVISOR_MIN_WORKERS, 0,
+                                   HYPERVISOR_MAX_WORKERS, 12,
+                                   HYPERVISOR_FIXED_WORKERS, gpus, 3,
+//                                 HYPERVISOR_MAX_IDLE, procs, 12, 40000.0,
+//                                 HYPERVISOR_MAX_IDLE, gpus, 3, 10000.0,
+                                   NULL);
+
+    }
+    else
+    {
+        p1.ctx = 0;
+        p2.ctx = 0;
+    }
+
+    Test( -1, iparam ); /* print header */
+
+    iparam[TIMING_N] = start1;
+    iparam[TIMING_N2] = start2;
+
+    if ( iparam[TIMING_M] == 0 )
+        iparam[TIMING_M] = iparam[TIMING_N];
+
+    if ( iparam[TIMING_M2] == 0 )
+        iparam[TIMING_M2] = iparam[TIMING_N2];
+
+    Test( start1, iparam );
+
+    MAGMA_Finalize();
+
+    if(iparam[TIMING_WITH_CTXS])
+        sched_ctx_hypervisor_shutdown();
+
+    /* if (gnuplot) { */
+    /*         printf( "%s\n%s\n", */
+    /*                 "e", */
+    /*                 gnuplot > 1 ? "" : "pause 10" ); */
+    /* } */
+
+    return EXIT_SUCCESS;
+}