|
|
@@ -1,747 +0,0 @@
|
|
|
-/**
|
|
|
- *
|
|
|
- * @file time_main.c
|
|
|
- *
|
|
|
- * PLASMA auxiliary routines
|
|
|
- * PLASMA is a software package provided by Univ. of Tennessee,
|
|
|
- * Univ. of California Berkeley and Univ. of Colorado Denver
|
|
|
- *
|
|
|
- * @version 2.3.1
|
|
|
- * @author ???
|
|
|
- * @author Mathieu Faverge
|
|
|
- * @date 2010-11-15
|
|
|
- *
|
|
|
- **/
|
|
|
-
|
|
|
-/* Define these so that the Microsoft VC compiler stops complaining
|
|
|
- about scanf and friends */
|
|
|
-#define _CRT_SECURE_NO_DEPRECATE
|
|
|
-#define _CRT_SECURE_NO_WARNINGS
|
|
|
-
|
|
|
-#include <math.h>
|
|
|
-#include <stdio.h>
|
|
|
-#include <stdlib.h>
|
|
|
-#include <string.h>
|
|
|
-
|
|
|
-#ifdef PLASMA_EZTRACE
|
|
|
-#include <eztrace.h>
|
|
|
-#endif
|
|
|
-
|
|
|
-#if defined( _WIN32 ) || defined( _WIN64 )
|
|
|
-#include <windows.h>
|
|
|
-#include <time.h>
|
|
|
-#include <sys/timeb.h>
|
|
|
-#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
|
|
|
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64
|
|
|
-#else
|
|
|
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
|
|
-#endif
|
|
|
-
|
|
|
-struct timezone
|
|
|
-{
|
|
|
- int tz_minuteswest; /* minutes W of Greenwich */
|
|
|
- int tz_dsttime; /* type of dst correction */
|
|
|
-};
|
|
|
-
|
|
|
-int gettimeofday(struct timeval* tv, struct timezone* tz)
|
|
|
-{
|
|
|
- FILETIME ft;
|
|
|
- unsigned __int64 tmpres = 0;
|
|
|
- static int tzflag;
|
|
|
-
|
|
|
- if (NULL != tv)
|
|
|
- {
|
|
|
- GetSystemTimeAsFileTime(&ft);
|
|
|
- tmpres |= ft.dwHighDateTime;
|
|
|
- tmpres <<= 32;
|
|
|
- tmpres |= ft.dwLowDateTime;
|
|
|
-
|
|
|
- /*converting file time to unix epoch*/
|
|
|
- tmpres /= 10; /*convert into microseconds*/
|
|
|
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
|
|
-
|
|
|
- tv->tv_sec = (long)(tmpres / 1000000UL);
|
|
|
- tv->tv_usec = (long)(tmpres % 1000000UL);
|
|
|
- }
|
|
|
- if (NULL != tz)
|
|
|
- {
|
|
|
- if (!tzflag)
|
|
|
- {
|
|
|
- _tzset();
|
|
|
- tzflag++;
|
|
|
- }
|
|
|
- tz->tz_minuteswest = _timezone / 60;
|
|
|
- tz->tz_dsttime = _daylight;
|
|
|
- }
|
|
|
- return 0;
|
|
|
-}
|
|
|
-
|
|
|
-#else /* Non-Windows */
|
|
|
-#include <unistd.h>
|
|
|
-#include <sys/time.h>
|
|
|
-#include <sys/resource.h>
|
|
|
-#endif
|
|
|
-
|
|
|
-#include <cblas.h>
|
|
|
-#include <lapacke.h>
|
|
|
-#include <plasma.h>
|
|
|
-#include <core_blas.h>
|
|
|
-#include <magma_morse.h>
|
|
|
-#include <sched_ctx_hypervisor.h>
|
|
|
-#include "timing.h"
|
|
|
-#include "auxiliary.h"
|
|
|
-#include <pthread.h>
|
|
|
-
|
|
|
-static int RunTest(int *iparam, _PREC *dparam, double *t_);
|
|
|
-
|
|
|
-double cWtime(void);
|
|
|
-
|
|
|
-int ISEED[4] = {0,0,0,1}; /* initial seed for zlarnv() */
|
|
|
-
|
|
|
-/*
|
|
|
- * struct timeval {time_t tv_sec; suseconds_t tv_usec;};
|
|
|
- */
|
|
|
-double cWtime(void)
|
|
|
-{
|
|
|
- struct timeval tp;
|
|
|
- gettimeofday( &tp, NULL );
|
|
|
- return tp.tv_sec + 1e-6 * tp.tv_usec;
|
|
|
-}
|
|
|
-
|
|
|
-double *t1, *t2;
|
|
|
-_PREC dparam1[TIMING_DNBPARAM];
|
|
|
-_PREC dparam2[TIMING_DNBPARAM];
|
|
|
-
|
|
|
-static int
|
|
|
-Test(int64_t n, int *iparam) {
|
|
|
- int i, j, iter, m;
|
|
|
- int thrdnbr, niter, nrhs;
|
|
|
- double *t;
|
|
|
- _PREC eps = _LAMCH( 'e' );
|
|
|
- _PREC dparam[TIMING_DNBPARAM];
|
|
|
- double flops, fmuls, fadds, fp_per_mul, fp_per_add;
|
|
|
- double sumgf, sumgf2, sumt, sd, gflops;
|
|
|
- double flops_2, fmuls_2, fadds_2;
|
|
|
- double sumgf_2, sumgf2_2, sumt_2, sd_2, gflops_2;
|
|
|
-
|
|
|
- char *s;
|
|
|
- char *env[] = {
|
|
|
- "OMP_NUM_THREADS",
|
|
|
- "MKL_NUM_THREADS",
|
|
|
- "GOTO_NUM_THREADS",
|
|
|
- "ACML_NUM_THREADS",
|
|
|
- "ATLAS_NUM_THREADS",
|
|
|
- "BLAS_NUM_THREADS", ""
|
|
|
- };
|
|
|
- int gnuplot = 0;
|
|
|
-
|
|
|
- thrdnbr = iparam[TIMING_THRDNBR];
|
|
|
- niter = iparam[TIMING_NITER];
|
|
|
- nrhs = iparam[TIMING_NRHS];
|
|
|
-
|
|
|
- if (n < 0 || thrdnbr < 0) {
|
|
|
- const char *bound_header = iparam[TIMING_BOUND] ? " thGflop/s" : "";
|
|
|
- const char *check_header = iparam[TIMING_CHECK] ? " ||Ax-b|| ||A|| ||x|| ||b|| eps ||Ax-b||/N/eps/(||A||||x||+||b||)" : "";
|
|
|
- const char *peak_header = iparam[TIMING_PEAK] ? " (\% of peak) peak" : "";
|
|
|
-
|
|
|
- printf( "# N NRHS threads seconds Gflop/s Deviation %s%s%s\n", bound_header, peak_header, check_header);
|
|
|
-
|
|
|
- if (gnuplot) {
|
|
|
- printf( "set title '%d_NUM_THREADS: ", thrdnbr );
|
|
|
- for (i = 0; env[i][0]; ++i) {
|
|
|
- s = getenv( env[i] );
|
|
|
-
|
|
|
- if (i) printf( " " ); /* separating space */
|
|
|
-
|
|
|
- for (j = 0; j < 5 && env[i][j] && env[i][j] != '_'; ++j)
|
|
|
- printf( "%c", env[i][j] );
|
|
|
-
|
|
|
- if (s)
|
|
|
- printf( "=%s", s );
|
|
|
- else
|
|
|
- printf( "->%s", "?" );
|
|
|
- }
|
|
|
- printf( "'\n" );
|
|
|
- printf( "%s\n%s\n%s\n%s\n%s%s%s\n",
|
|
|
- "set xlabel 'Matrix size'",
|
|
|
- "set ylabel 'Gflop/s'",
|
|
|
- "set key bottom",
|
|
|
- gnuplot > 1 ? "set terminal png giant\nset output 'timeplot.png'" : "",
|
|
|
- "plot '-' using 1:5 title '", _NAME, "' with linespoints" );
|
|
|
- }
|
|
|
-
|
|
|
- return 0;
|
|
|
- }
|
|
|
-
|
|
|
- printf( "%5d %4d %5d ", iparam[TIMING_N], iparam[TIMING_NRHS], iparam[TIMING_THRDNBR] );
|
|
|
- printf( "%5d %4d %5d ", iparam[TIMING_N2], iparam[TIMING_NRHS], iparam[TIMING_THRDNBR] );
|
|
|
- fflush( stdout );
|
|
|
-
|
|
|
- t = (double*)malloc(niter*sizeof(double));
|
|
|
- memset(t, 0, niter*sizeof(double));
|
|
|
-
|
|
|
- t1 = (double*)malloc(niter*sizeof(double));
|
|
|
- memset(t, 0, niter*sizeof(double));
|
|
|
-
|
|
|
- t2 = (double*)malloc(niter*sizeof(double));
|
|
|
- memset(t, 0, niter*sizeof(double));
|
|
|
-
|
|
|
- if (sizeof(_TYPE) == sizeof(_PREC)) {
|
|
|
- fp_per_mul = 1;
|
|
|
- fp_per_add = 1;
|
|
|
- } else {
|
|
|
- fp_per_mul = 6;
|
|
|
- fp_per_add = 2;
|
|
|
- }
|
|
|
-
|
|
|
- m = iparam[TIMING_M];
|
|
|
- n = iparam[TIMING_N];
|
|
|
- fadds = _FADDS;
|
|
|
- fmuls = _FMULS;
|
|
|
- flops = fmuls * fp_per_mul + fadds * fp_per_add;
|
|
|
- gflops = 0.0;
|
|
|
-
|
|
|
- m = iparam[TIMING_M2];
|
|
|
- n = iparam[TIMING_N2];
|
|
|
- fadds_2 = _FADDS;
|
|
|
- fmuls_2 = _FMULS;
|
|
|
- flops_2 = fmuls_2 * fp_per_mul + fadds_2 * fp_per_add;
|
|
|
- gflops_2 = 0.0;
|
|
|
-
|
|
|
- if ( iparam[TIMING_WARMUP] ) {
|
|
|
- RunTest( iparam, dparam, &(t[0]));
|
|
|
- }
|
|
|
-
|
|
|
- sumgf = 0.0;
|
|
|
- double sumgf_upper = 0.0;
|
|
|
- sumgf2 = 0.0;
|
|
|
- sumt = 0.0;
|
|
|
-
|
|
|
- sumgf_2 = 0.0;
|
|
|
- double sumgf_upper_2 = 0.0;
|
|
|
- sumgf2_2 = 0.0;
|
|
|
- sumt_2 = 0.0;
|
|
|
-
|
|
|
- for (iter = 0; iter < niter; iter++)
|
|
|
- {
|
|
|
-
|
|
|
-#ifdef PLASMA_EZTRACE
|
|
|
- if( iter == 0 ) {
|
|
|
- eztrace_start();
|
|
|
- RunTest( iparam, dparam, &(t[iter]));
|
|
|
- eztrace_stop();
|
|
|
- }
|
|
|
- else
|
|
|
-#endif
|
|
|
- RunTest( iparam, dparam, &(t[iter]));
|
|
|
-
|
|
|
- double tmin = 0.0;
|
|
|
- double integer_tmin = 0.0;
|
|
|
- double upper_gflops = 0.0;
|
|
|
-
|
|
|
- double tmin_2 = 0.0;
|
|
|
- double integer_tmin_2 = 0.0;
|
|
|
- double upper_gflops_2 = 0.0;
|
|
|
-
|
|
|
-#if 0
|
|
|
- if (iparam[TIMING_BOUND])
|
|
|
- {
|
|
|
- if (iparam[TIMING_BOUNDDEPS]) {
|
|
|
- FILE *out = fopen("bounddeps.pl", "w");
|
|
|
- starpu_bound_print_lp(out);
|
|
|
- fclose(out);
|
|
|
- out = fopen("bound.dot", "w");
|
|
|
- starpu_bound_print_dot(out);
|
|
|
- fclose(out);
|
|
|
- } else {
|
|
|
-#if 0
|
|
|
- FILE *out = fopen("bound.pl", "w");
|
|
|
- starpu_bound_print_lp(out);
|
|
|
- fclose(out);
|
|
|
-#endif
|
|
|
- starpu_bound_compute(&tmin, &integer_tmin, 0);
|
|
|
- upper_gflops = ((1e-6 * flops) / tmin);
|
|
|
- starpu_bound_compute(&tmin_2, &integer_tmin_2, 0);
|
|
|
- upper_gflops_2 = ((1e-6 * flops_2) / tmin_2);
|
|
|
-
|
|
|
- }
|
|
|
- }
|
|
|
-#endif
|
|
|
- printf("t1 = %lf t2 = %lf \n", t1[0], t2[0]);
|
|
|
- gflops = (1e-9 * flops) / t1[iter];
|
|
|
- sumt += t1[iter];
|
|
|
- sumgf_upper += upper_gflops;
|
|
|
- sumgf += gflops;
|
|
|
- sumgf2 += gflops*gflops;
|
|
|
-
|
|
|
- gflops_2 = (1e-9 * flops_2) / t2[iter];
|
|
|
- sumt_2 += t2[iter];
|
|
|
- sumgf_upper_2 += upper_gflops_2;
|
|
|
- sumgf_2 += gflops_2;
|
|
|
- sumgf2_2 += gflops_2*gflops_2;
|
|
|
-
|
|
|
- }
|
|
|
-
|
|
|
- gflops = sumgf / niter;
|
|
|
- sd = sqrt((sumgf2 - (sumgf*sumgf)/niter)/niter);
|
|
|
-
|
|
|
- gflops_2 = sumgf_2 / niter;
|
|
|
- sd_2 = sqrt((sumgf2_2 - (sumgf_2*sumgf_2)/niter)/niter);
|
|
|
-
|
|
|
- printf( "%9.3f %9.2f +-%7.2f ", sumt/niter, gflops, sd);
|
|
|
-
|
|
|
- if (iparam[TIMING_BOUND] && !iparam[TIMING_BOUNDDEPS])
|
|
|
- printf(" %9.2f", sumgf_upper/niter);
|
|
|
-
|
|
|
- if ( iparam[TIMING_PEAK] )
|
|
|
- {
|
|
|
- if (dparam1[TIMING_ESTIMATED_PEAK]<0.0f)
|
|
|
- printf(" n/a n/a ");
|
|
|
- else
|
|
|
- printf(" %2.2f\%% %9.2f ", 100.0f*(gflops/dparam1[TIMING_ESTIMATED_PEAK]), dparam1[TIMING_ESTIMATED_PEAK]);
|
|
|
- }
|
|
|
-
|
|
|
- if ( iparam[TIMING_CHECK] )
|
|
|
- printf( "%8.5e %8.5e %8.5e %8.5e %8.5e %8.5e",
|
|
|
- dparam1[TIMING_RES], dparam1[TIMING_ANORM], dparam1[TIMING_XNORM], dparam1[TIMING_BNORM], eps,
|
|
|
- dparam1[TIMING_RES] / n / eps / (dparam1[TIMING_ANORM] * dparam1[TIMING_XNORM] + dparam1[TIMING_BNORM] ));
|
|
|
- printf("\n");
|
|
|
-
|
|
|
- printf( "%9.3f %9.2f +-%7.2f ", sumt_2/niter, gflops_2, sd_2);
|
|
|
-
|
|
|
- if (iparam[TIMING_BOUND] && !iparam[TIMING_BOUNDDEPS])
|
|
|
- printf(" %9.2f", sumgf_upper_2/niter);
|
|
|
-
|
|
|
- if ( iparam[TIMING_PEAK] )
|
|
|
- {
|
|
|
- if (dparam2[TIMING_ESTIMATED_PEAK]<0.0f)
|
|
|
- printf(" n/a n/a ");
|
|
|
- else
|
|
|
- printf(" %2.2f\%% %9.2f ", 100.0f*(gflops_2/dparam2[TIMING_ESTIMATED_PEAK]), dparam2[TIMING_ESTIMATED_PEAK]);
|
|
|
- }
|
|
|
-
|
|
|
- if ( iparam[TIMING_CHECK] )
|
|
|
- printf( "%8.5e %8.5e %8.5e %8.5e %8.5e %8.5e",
|
|
|
- dparam2[TIMING_RES], dparam2[TIMING_ANORM], dparam2[TIMING_XNORM], dparam2[TIMING_BNORM], eps,
|
|
|
- dparam2[TIMING_RES] / n / eps / (dparam2[TIMING_ANORM] * dparam2[TIMING_XNORM] + dparam2[TIMING_BNORM] ));
|
|
|
- printf("\n");
|
|
|
-
|
|
|
- fflush( stdout );
|
|
|
- free(t);
|
|
|
- free(t1);
|
|
|
- free(t2);
|
|
|
-
|
|
|
- return 0;
|
|
|
-}
|
|
|
-
|
|
|
-static int
|
|
|
-startswith(const char *s, const char *prefix) {
|
|
|
- size_t n = strlen( prefix );
|
|
|
- if (strncmp( s, prefix, n ))
|
|
|
- return 0;
|
|
|
- return 1;
|
|
|
-}
|
|
|
-
|
|
|
-static int
|
|
|
-get_range(char *range, int *start_p, int *stop_p, int *step_p) {
|
|
|
- char *s, *s1, buf[21];
|
|
|
- int colon_count, copy_len, nbuf=20, n;
|
|
|
- int start=1000, stop=10000, step=1000;
|
|
|
-
|
|
|
- colon_count = 0;
|
|
|
- for (s = strchr( range, ':'); s; s = strchr( s+1, ':'))
|
|
|
- colon_count++;
|
|
|
-
|
|
|
- if (colon_count == 0) { /* No colon in range. */
|
|
|
- if (sscanf( range, "%d", &start ) < 1 || start < 1)
|
|
|
- return -1;
|
|
|
- step = start / 10;
|
|
|
- if (step < 1) step = 1;
|
|
|
- stop = start + 10 * step;
|
|
|
-
|
|
|
- } else if (colon_count == 1) { /* One colon in range.*/
|
|
|
- /* First, get the second number (after colon): the stop value. */
|
|
|
- s = strchr( range, ':' );
|
|
|
- if (sscanf( s+1, "%d", &stop ) < 1 || stop < 1)
|
|
|
- return -1;
|
|
|
-
|
|
|
- /* Next, get the first number (before colon): the start value. */
|
|
|
- n = s - range;
|
|
|
- copy_len = n > nbuf ? nbuf : n;
|
|
|
- strncpy( buf, range, copy_len );
|
|
|
- buf[copy_len] = 0;
|
|
|
- if (sscanf( buf, "%d", &start ) < 1 || start > stop || start < 1)
|
|
|
- return -1;
|
|
|
-
|
|
|
- /* Let's have 10 steps or less. */
|
|
|
- step = (stop - start) / 10;
|
|
|
- if (step < 1)
|
|
|
- step = 1;
|
|
|
- } else if (colon_count == 2) { /* Two colons in range. */
|
|
|
- /* First, get the first number (before the first colon): the start value. */
|
|
|
- s = strchr( range, ':' );
|
|
|
- n = s - range;
|
|
|
- copy_len = n > nbuf ? nbuf : n;
|
|
|
- strncpy( buf, range, copy_len );
|
|
|
- buf[copy_len] = 0;
|
|
|
- if(copy_len == 0)
|
|
|
- start = 0;
|
|
|
- else if (sscanf( buf, "%d", &start ) < 1 || start < 1)
|
|
|
- return -1;
|
|
|
-
|
|
|
- /* Next, get the second number (after the first colon): the stop value. */
|
|
|
- s1 = strchr( s+1, ':' );
|
|
|
- n = s1 - (s + 1);
|
|
|
- copy_len = n > nbuf ? nbuf : n;
|
|
|
- strncpy( buf, s+1, copy_len );
|
|
|
- buf[copy_len] = 0;
|
|
|
-
|
|
|
- if(copy_len == 0)
|
|
|
- stop = 0;
|
|
|
- else if (sscanf( buf, "%d", &stop ) < 1 || stop < start)
|
|
|
- return -1;
|
|
|
-
|
|
|
- /* Finally, get the third number (after the second colon): the step value. */
|
|
|
- if (sscanf( s1+1, "%d", &step ) < 1 || step < 1)
|
|
|
- return -1;
|
|
|
- } else
|
|
|
-
|
|
|
- return -1;
|
|
|
-
|
|
|
- *start_p = start;
|
|
|
- *stop_p = stop;
|
|
|
- *step_p = step;
|
|
|
-
|
|
|
- return 0;
|
|
|
-}
|
|
|
-
|
|
|
-static void
|
|
|
-show_help(char *prog_name) {
|
|
|
- printf( "Usage:\n%s [options]\n\n", prog_name );
|
|
|
- printf( "Options are:\n" );
|
|
|
- printf( " --threads=C Number of threads (default: 1)\n" );
|
|
|
- printf( " --n_range=R Range of N values: Start:Stop:Step (default: 500:5000:500)\n" );
|
|
|
- // printf( " --gnuplot produce output suitable for gnuplot" );
|
|
|
- printf( " --[no]check Check result (default: nocheck)\n" );
|
|
|
- printf( " --[no]warmup Perform a warmup run to pre-load libraries (default: warmup)\n");
|
|
|
- printf( " --parallel=N Use parallel tasks of size N (default: no)\n");
|
|
|
- printf( " --niter=N Number of iterations (default: 1)\n");
|
|
|
- printf( " --nb=N Nb size. Not used if autotuning is activated (default: 128)\n");
|
|
|
- printf( " --ib=N IB size. Not used if autotuning is activated (default: 32)\n");
|
|
|
- printf( " --nrhs=N Number of right-hand size (default: 1)\n");
|
|
|
- printf( " --[no]dyn Activate Dynamic scheduling (default: nodyn)\n");
|
|
|
- printf( " --[no]atun Activate autotuning (default: noatun)\n");
|
|
|
- printf( " --ifmt Input format. 0: CM, 1: CCRB, 2: CRRB, 3: RCRB, 4: RRRB, 5: RM (default: 0)\n");
|
|
|
- printf( " --ofmt Output format. 0: CM, 1: CCRB, 2: CRRB, 3: RCRB, 4: RRRB, 5: RM (default: 1)\n");
|
|
|
- printf( " --thrdbypb Number of threads per subproblem for inplace transformation (default: 1)\n");
|
|
|
- printf( " --[no]profile Profile kernels with StarPU (default: no)\n");
|
|
|
- printf( " --[no]peak Evalue sustained peak performance (default: no)\n");
|
|
|
-}
|
|
|
-static void
|
|
|
-get_thread_count(int *thrdnbr) {
|
|
|
-#if defined WIN32 || defined WIN64
|
|
|
- sscanf( getenv( "NUMBER_OF_PROCESSORS" ), "%d", thrdnbr );
|
|
|
-#else
|
|
|
- *thrdnbr = sysconf(_SC_NPROCESSORS_ONLN);
|
|
|
-#endif
|
|
|
-}
|
|
|
-
|
|
|
-typedef struct {
|
|
|
- PLASMA_enum uplo;
|
|
|
- magma_desc_t *descA;
|
|
|
- unsigned ctx;
|
|
|
- unsigned the_other_ctx;
|
|
|
- real_Double_t t;
|
|
|
-} params;
|
|
|
-
|
|
|
-double compute_flops(int n, int m)
|
|
|
-{
|
|
|
- double fp_per_mul, fp_per_add;
|
|
|
- if (sizeof(_TYPE) == sizeof(_PREC)) {
|
|
|
- fp_per_mul = 1;
|
|
|
- fp_per_add = 1;
|
|
|
- } else {
|
|
|
- fp_per_mul = 6;
|
|
|
- fp_per_add = 2;
|
|
|
- }
|
|
|
-
|
|
|
- double fmuls = (n * (1.0 / 6.0 * n + 0.5) * n);
|
|
|
- double fadds = (n * (1.0 / 6.0 * n ) * n);
|
|
|
- double flops = fmuls * fp_per_mul + fadds * fp_per_add;
|
|
|
- return flops;
|
|
|
-}
|
|
|
-params p1, p2;
|
|
|
-int
|
|
|
-main(int argc, char *argv[]) {
|
|
|
- int i;
|
|
|
- int start = 500;
|
|
|
- int stop = 5000;
|
|
|
- int step = 500;
|
|
|
-
|
|
|
- int start1 = 500;
|
|
|
- int stop1 = 5000;
|
|
|
- int step1 = 500;
|
|
|
-
|
|
|
- int start2 = 500;
|
|
|
- int stop2 = 5000;
|
|
|
- int step2 = 500;
|
|
|
-
|
|
|
- int start_cpus1 = 0, start_cpus2 = 0, start_gpus1 = 0, start_gpus2 = 0;
|
|
|
- int stop_cpus1 = -1, stop_cpus2 = -1, stop_gpus1 = -1, stop_gpus2 = -1;
|
|
|
- int step_cpus1 = 1, step_cpus2 = 1, step_gpus1 = 1, step_gpus2 = 1;
|
|
|
-
|
|
|
- int iparam[TIMING_INBPARAM];
|
|
|
-
|
|
|
- memset(iparam, 0, TIMING_INBPARAM*sizeof(int));
|
|
|
-
|
|
|
- iparam[TIMING_CHECK ] = 0;
|
|
|
- iparam[TIMING_WARMUP ] = 1;
|
|
|
- iparam[TIMING_NITER ] = 1;
|
|
|
- iparam[TIMING_N ] = 500;
|
|
|
- iparam[TIMING_N2 ] = 500;
|
|
|
- iparam[TIMING_NB ] = 128;
|
|
|
- iparam[TIMING_IB ] = 32;
|
|
|
- iparam[TIMING_NRHS ] = 1;
|
|
|
- iparam[TIMING_THRDNBR ] = 1;
|
|
|
- iparam[TIMING_NCUDAS ] = 0;
|
|
|
- iparam[TIMING_THRDNBR_SUBGRP] = 1;
|
|
|
- iparam[TIMING_SCHEDULER ] = 0;
|
|
|
- iparam[TIMING_AUTOTUNING ] = 1;
|
|
|
- iparam[TIMING_INPUTFMT ] = 0;
|
|
|
- iparam[TIMING_OUTPUTFMT ] = 0;
|
|
|
- iparam[TIMING_NDOM ] = 1;
|
|
|
- iparam[TIMING_PROFILE ] = 0;
|
|
|
- iparam[TIMING_PEAK ] = 0;
|
|
|
- iparam[TIMING_PARALLEL_TASKS] = 0;
|
|
|
- iparam[TIMING_NO_CPU ] = 0;
|
|
|
- iparam[TIMING_BOUND ] = 0;
|
|
|
- iparam[TIMING_BOUNDDEPS ] = 0;
|
|
|
- iparam[TIMING_BOUNDDEPSPRIO ] = 0;
|
|
|
- iparam[TIMING_WITH_CTXS ] = 1;
|
|
|
-
|
|
|
- get_thread_count( &(iparam[TIMING_THRDNBR]) );
|
|
|
-
|
|
|
- for (i = 1; i < argc && argv[i]; ++i) {
|
|
|
- if (startswith( argv[i], "--help" )) {
|
|
|
- show_help( argv[0] );
|
|
|
- return EXIT_SUCCESS;
|
|
|
- } else if (startswith( argv[i], "--n_cpus1=" )) {
|
|
|
- get_range( strchr( argv[i], '=' ) + 1, &start_cpus1, &stop_cpus1, &step_cpus1 );
|
|
|
- } else if (startswith( argv[i], "--n_cpus2=" )) {
|
|
|
- get_range( strchr( argv[i], '=' ) + 1, &start_cpus2, &stop_cpus2, &step_cpus2 );
|
|
|
- } else if (startswith( argv[i], "--n_gpus1=" )) {
|
|
|
- get_range( strchr( argv[i], '=' ) + 1, &start_gpus1, &stop_gpus1, &step_gpus1 );
|
|
|
- } else if (startswith( argv[i], "--n_gpus2=" )) {
|
|
|
- get_range( strchr( argv[i], '=' ) + 1, &start_gpus2, &stop_gpus2, &step_gpus2 );
|
|
|
- } else if (startswith( argv[i], "--n_range=" )) {
|
|
|
- get_range( strchr( argv[i], '=' ) + 1, &start, &stop, &step );
|
|
|
- } else if (startswith( argv[i], "--n_range1=" )) {
|
|
|
- get_range( strchr( argv[i], '=' ) + 1, &start1, &stop1, &step1 );
|
|
|
- } else if (startswith( argv[i], "--n_range2=" )) {
|
|
|
- get_range( strchr( argv[i], '=' ) + 1, &start2, &stop2, &step2 );
|
|
|
- } else if (startswith( argv[i], "--threads=" )) {
|
|
|
- sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_THRDNBR]) );
|
|
|
- /* } else if (startswith( argv[i], "--gnuplot-png" )) { */
|
|
|
- /* gnuplot = 2; */
|
|
|
- /* } else if (startswith( argv[i], "--gnuplot" )) { */
|
|
|
- /* gnuplot = 1; */
|
|
|
- } else if (startswith( argv[i], "--noctxs" )) {
|
|
|
- iparam[TIMING_WITH_CTXS] = 0;
|
|
|
- } else if (startswith( argv[i], "--check" )) {
|
|
|
- iparam[TIMING_CHECK] = 1;
|
|
|
- } else if (startswith( argv[i], "--nocheck" )) {
|
|
|
- iparam[TIMING_CHECK] = 0;
|
|
|
- } else if (startswith( argv[i], "--warmup" )) {
|
|
|
- iparam[TIMING_WARMUP] = 1;
|
|
|
- } else if (startswith( argv[i], "--nowarmup" )) {
|
|
|
- iparam[TIMING_WARMUP] = 0;
|
|
|
- } else if (startswith( argv[i], "--dyn" )) {
|
|
|
- iparam[TIMING_SCHEDULER] = 1;
|
|
|
- } else if (startswith( argv[i], "--nodyn" )) {
|
|
|
- iparam[TIMING_SCHEDULER] = 0;
|
|
|
- } else if (startswith( argv[i], "--atun" )) {
|
|
|
- iparam[TIMING_AUTOTUNING] = 1;
|
|
|
- } else if (startswith( argv[i], "--noatun" )) {
|
|
|
- iparam[TIMING_AUTOTUNING] = 0;
|
|
|
- } else if (startswith( argv[i], "--profile" )) {
|
|
|
- iparam[TIMING_PROFILE] = 1;
|
|
|
- } else if (startswith( argv[i], "--peak" )) {
|
|
|
- iparam[TIMING_PEAK] = 1;
|
|
|
- } else if (startswith( argv[i], "--noprofile" )) {
|
|
|
- iparam[TIMING_PROFILE] = 0;
|
|
|
- } else if (startswith( argv[i], "--parallel=" )) {
|
|
|
- sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_PARALLEL_TASKS]) );
|
|
|
- } else if (startswith( argv[i], "--noparallel" )) {
|
|
|
- iparam[TIMING_PARALLEL_TASKS] = 0;
|
|
|
- } else if (startswith( argv[i], "--nocpu" )) {
|
|
|
- iparam[TIMING_NO_CPU] = 1;
|
|
|
- } else if (startswith( argv[i], "--nb=" )) {
|
|
|
- sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_NB]) );
|
|
|
- } else if (startswith( argv[i], "--m=" )) {
|
|
|
- sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_M]) );
|
|
|
- } else if (startswith( argv[i], "--ib=" )) {
|
|
|
- sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_IB]) );
|
|
|
- } else if (startswith( argv[i], "--nrhs=" )) {
|
|
|
- sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_NRHS]) );
|
|
|
- } else if (startswith( argv[i], "--ifmt=" )) {
|
|
|
- sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_INPUTFMT]) );
|
|
|
- } else if (startswith( argv[i], "--ofmt=" )) {
|
|
|
- sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_OUTPUTFMT]) );
|
|
|
- } else if (startswith( argv[i], "--thrdbypb=" )) {
|
|
|
- sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_THRDNBR_SUBGRP]) );
|
|
|
- } else if (startswith( argv[i], "--niter=" )) {
|
|
|
- sscanf( strchr( argv[i], '=' ) + 1, "%d", &iparam[TIMING_NITER] );
|
|
|
- } else if (startswith( argv[i], "--ndom=" )) {
|
|
|
- sscanf( strchr( argv[i], '=' ) + 1, "%d", &iparam[TIMING_NDOM] );
|
|
|
- } else if (startswith( argv[i], "--bounddepsprio" )) {
|
|
|
- iparam[TIMING_BOUND] = 1;
|
|
|
- iparam[TIMING_BOUNDDEPS] = 1;
|
|
|
- iparam[TIMING_BOUNDDEPSPRIO] = 1;
|
|
|
- } else if (startswith( argv[i], "--bounddeps" )) {
|
|
|
- iparam[TIMING_BOUND] = 1;
|
|
|
- iparam[TIMING_BOUNDDEPS] = 1;
|
|
|
- } else if (startswith( argv[i], "--bound" )) {
|
|
|
- iparam[TIMING_BOUND] = 1;
|
|
|
- } else {
|
|
|
- fprintf( stderr, "Unknown option: %s\n", argv[i] );
|
|
|
- }
|
|
|
- }
|
|
|
- if (step < 1) step = 1;
|
|
|
- if (step1 < 1) step1 = 1;
|
|
|
- if (step2 < 1) step2 = 1;
|
|
|
-
|
|
|
- /* TODO : correct into plasma */
|
|
|
- if ( iparam[TIMING_IB] > iparam[TIMING_NB] )
|
|
|
- iparam[TIMING_IB] = iparam[TIMING_NB];
|
|
|
-
|
|
|
- /* TODO */
|
|
|
- if (iparam[TIMING_PARALLEL_TASKS]) {
|
|
|
- MAGMA_InitPar(iparam[TIMING_THRDNBR]/iparam[TIMING_PARALLEL_TASKS],
|
|
|
- iparam[TIMING_NCUDAS],
|
|
|
- iparam[TIMING_PARALLEL_TASKS]);
|
|
|
- }
|
|
|
- else {
|
|
|
- MAGMA_Init( iparam[TIMING_THRDNBR],
|
|
|
- iparam[TIMING_NCUDAS]);
|
|
|
-
|
|
|
- }
|
|
|
-
|
|
|
- MAGMA_Disable(MAGMA_AUTOTUNING);
|
|
|
- MAGMA_Set(MAGMA_TILE_SIZE, iparam[TIMING_NB] );
|
|
|
- MAGMA_Set(MAGMA_INNER_BLOCK_SIZE, iparam[TIMING_IB] );
|
|
|
-
|
|
|
- if(iparam[TIMING_WITH_CTXS])
|
|
|
- {
|
|
|
- int nprocs1 = (stop_cpus1 - start_cpus1 + 1)/step_cpus1 + (stop_gpus1 - start_gpus1 + 1)/step_gpus1;
|
|
|
- int nprocs2 = (stop_cpus2 - start_cpus2 + 1)/step_cpus2 + (stop_gpus2 - start_gpus2 + 1)/step_gpus2;
|
|
|
- int procs1[nprocs1];
|
|
|
- int procs2[nprocs2];
|
|
|
- int i, j = 0;
|
|
|
- printf("%d: ", nprocs1);
|
|
|
- for (i = start_gpus1; i <= stop_gpus1; i += step_gpus1)
|
|
|
- {
|
|
|
- printf("%d ", i);
|
|
|
- procs1[j++] = i;
|
|
|
- }
|
|
|
-
|
|
|
- for (i = start_cpus1; i <= stop_cpus1; i += step_cpus1)
|
|
|
- {
|
|
|
- printf("%d ", i);
|
|
|
- procs1[j++] = i;
|
|
|
- }
|
|
|
- printf("\n");
|
|
|
-
|
|
|
- printf("%d: ", nprocs2);
|
|
|
- j = 0;
|
|
|
- for (i = start_gpus2; i <= stop_gpus2; i += step_gpus2)
|
|
|
- {
|
|
|
- printf("%d ", i);
|
|
|
- procs2[j++] = i;
|
|
|
- }
|
|
|
-
|
|
|
- for (i = start_cpus2; i <= stop_cpus2; i += step_cpus2)
|
|
|
- {
|
|
|
- printf("%d ", i);
|
|
|
- procs2[j++] = i;
|
|
|
- }
|
|
|
- printf("\n");
|
|
|
-
|
|
|
- struct starpu_sched_ctx_hypervisor_criteria *criteria = sched_ctx_hypervisor_init(SIMPLE_POLICY);
|
|
|
- p1.ctx = starpu_create_sched_ctx_with_criteria("heft", procs1, nprocs1, "sched_ctx1", &criteria);
|
|
|
-
|
|
|
- p2.ctx = starpu_create_sched_ctx_with_criteria("heft", procs2, nprocs2, "sched_ctx2", &criteria);
|
|
|
-
|
|
|
-/* p1.ctx = starpu_create_sched_ctx("heft", procs1, nprocs1, "sched_ctx1"); */
|
|
|
-
|
|
|
-/* p2.ctx = starpu_create_sched_ctx("heft", procs2, nprocs2, "sched_ctx2"); */
|
|
|
-
|
|
|
- double flops1 = compute_flops(start1, start1);
|
|
|
- double flops2 = compute_flops(start2, start2);
|
|
|
- printf("flops1 = %lf flops2 = %lf\n", flops1, flops2);
|
|
|
- sched_ctx_hypervisor_handle_ctx(p1.ctx, compute_flops(start1, start1));
|
|
|
-
|
|
|
- sched_ctx_hypervisor_handle_ctx(p2.ctx, compute_flops(start2, start2));
|
|
|
-
|
|
|
- p1.the_other_ctx = p2.ctx;
|
|
|
- p2.the_other_ctx = p1.ctx;
|
|
|
-
|
|
|
- int procs[12];
|
|
|
- for(i = 0; i < 12; i++)
|
|
|
- procs[i] = i;
|
|
|
-
|
|
|
- int gpus[3];
|
|
|
- for(i = 0; i < 3; i++)
|
|
|
- gpus[i] = i;
|
|
|
- sched_ctx_hypervisor_ioctl(p1.ctx,
|
|
|
- HYPERVISOR_GRANULARITY, 2,
|
|
|
- HYPERVISOR_MIN_TASKS, 10,
|
|
|
- HYPERVISOR_MIN_WORKERS, 3,
|
|
|
- HYPERVISOR_MAX_WORKERS, 12,
|
|
|
- HYPERVISOR_FIXED_WORKERS, gpus, 3,
|
|
|
-// HYPERVISOR_MAX_IDLE, procs, 12, 40000.0,
|
|
|
-// HYPERVISOR_MAX_IDLE, gpus, 3, 10000.0,
|
|
|
- NULL);
|
|
|
-
|
|
|
- sched_ctx_hypervisor_ioctl(p2.ctx,
|
|
|
- HYPERVISOR_GRANULARITY, 2,
|
|
|
- HYPERVISOR_MIN_TASKS, 10,
|
|
|
- HYPERVISOR_MIN_WORKERS, 0,
|
|
|
- HYPERVISOR_MAX_WORKERS, 12,
|
|
|
- HYPERVISOR_FIXED_WORKERS, gpus, 3,
|
|
|
-// HYPERVISOR_MAX_IDLE, procs, 12, 40000.0,
|
|
|
-// HYPERVISOR_MAX_IDLE, gpus, 3, 10000.0,
|
|
|
- NULL);
|
|
|
-
|
|
|
- }
|
|
|
- else
|
|
|
- {
|
|
|
- p1.ctx = 0;
|
|
|
- p2.ctx = 0;
|
|
|
- }
|
|
|
-
|
|
|
- Test( -1, iparam ); /* print header */
|
|
|
-
|
|
|
- iparam[TIMING_N] = start1;
|
|
|
- iparam[TIMING_N2] = start2;
|
|
|
-
|
|
|
- if ( iparam[TIMING_M] == 0 )
|
|
|
- iparam[TIMING_M] = iparam[TIMING_N];
|
|
|
-
|
|
|
- if ( iparam[TIMING_M2] == 0 )
|
|
|
- iparam[TIMING_M2] = iparam[TIMING_N2];
|
|
|
-
|
|
|
- Test( start1, iparam );
|
|
|
-
|
|
|
- MAGMA_Finalize();
|
|
|
-
|
|
|
- if(iparam[TIMING_WITH_CTXS])
|
|
|
- sched_ctx_hypervisor_shutdown();
|
|
|
-
|
|
|
- /* if (gnuplot) { */
|
|
|
- /* printf( "%s\n%s\n", */
|
|
|
- /* "e", */
|
|
|
- /* gnuplot > 1 ? "" : "pause 10" ); */
|
|
|
- /* } */
|
|
|
-
|
|
|
- return EXIT_SUCCESS;
|
|
|
-}
|