| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496 | 
							- /* StarPU --- Runtime system for heterogeneous multicore architectures.
 
-  *
 
-  * Copyright (C) 2016-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 
-  * Copyright (C) 2016       Bérangère Subervie
 
-  *
 
-  * StarPU is free software; you can redistribute it and/or modify
 
-  * it under the terms of the GNU Lesser General Public License as published by
 
-  * the Free Software Foundation; either version 2.1 of the License, or (at
 
-  * your option) any later version.
 
-  *
 
-  * StarPU is distributed in the hope that it will be useful, but
 
-  * WITHOUT ANY WARRANTY; without even the implied warranty of
 
-  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 
-  *
 
-  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-  */
 
- #include <stdbool.h>
 
- #include <starpu.h>
 
- #include <limits.h>
 
- #include "../helper.h"
 
- /*
 
-  * This tries to run kernels with different efficiency depending on the core
 
-  * frequency.
 
-  *
 
-  * This is based on the Cholesky factorization, which is made to exhibit three
 
-  * caricatural cases as follows:
 
-  *
 
-  * - gemm: always gets faster with higher frequency
 
-  * - trsm: gets faster with higher frequency, but efficiency gets lower and
 
-  * lower
 
-  * - potrf: reaches a maximum performance, after which there is no point in
 
-  * running it at higher frequency.
 
-  *
 
-  * We here assume that the power use is the same for the different kernels
 
-  * (which wouldn't be true for real kernels, measurements would be needed, to
 
-  * feed the performance models).
 
-  */
 
/* Frequency (MHz) and per-core power (W) parameters, as measured and provided
 * to this program */
static float freq_min;
static float freq_fast;
static float power_min;
static float power_fast;

/*
 * Dynamic power (in W) used by one CPU core running at the given frequency
 * (in MHz).
 *
 * The model assumes C.V^2.F with V being proportional to F, thus C.F^3, and
 * is fitted so that power(freq_min) is power_min and power(freq_fast) is
 * power_fast.
 *
 * The same model as a plotting script (gnuplot-style syntax):

   freq_min = 1200
   freq_fast = 3500
   power_min = 2
   power_fast = 8.2
   freq_min3 = freq_min * freq_min * freq_min
   freq_fast3 = freq_fast * freq_fast * freq_fast
   alpha = (power_fast - power_min) / (freq_fast3 - freq_min3)
   power(frequency) = power_min + alpha * (frequency*frequency*frequency - freq_min3)
   plot [frequency=freq_min:freq_fast] power(frequency) lw 2

 */
static float power(float frequency)
{
	/* Cubes of the two reference frequencies and of the requested one */
	double cube_min = freq_min * freq_min * freq_min;
	double cube_fast = freq_fast * freq_fast * freq_fast;
	double cube = frequency * frequency * frequency;

	/* Power increase per unit of F^3 between the two reference points */
	double slope = (power_fast - power_min) / (cube_fast - cube_min);

	return power_min + slope * (cube - cube_min);
}
 
- /*
 
-  * This returns the frequency of the given worker and implementation in MHz.
 
-  * This is where we can tune either a given number of cores at a low frequency,
 
-  * or which implementation uses which frequency. */
 
- /* These are the chosen parameters: how many cores get slowed down, at which
 
-  * frequency */
 
- static int ncpu_slow = -1;
 
- static float freq_slow;
 
- static float frequency(int worker, unsigned i)
 
- {
 
- 	if (ncpu_slow == -1)
 
- 	{
 
- 		/* Version that allows the runtime to switch speed between
 
- 		 * tasks, by exposing two implementations with different time
 
- 		 * and energy */
 
- 		if (i == 0)
 
- 			/* Slow implementation */
 
- 			return freq_slow;
 
- 		else
 
- 			/* Fast implementation */
 
- 			return freq_fast;
 
- 	}
 
- 	else
 
- 	{
 
- 		/* Version that assumes that ncpu_slow workers are running at
 
- 		 * slow speed */
 
- 		if (worker < ncpu_slow)
 
- 			return freq_slow;
 
- 		else
 
- 			return freq_fast;
 
- 	}
 
- }
 
- /* This is from magma
 
-   -- Innovative Computing Laboratory
 
-   -- Electrical Engineering and Computer Science Department
 
-   -- University of Tennessee
 
-   -- (C) Copyright 2009
 
-   Redistribution  and  use  in  source and binary forms, with or without
 
-   modification,  are  permitted  provided  that the following conditions
 
-   are met:
 
-   * Redistributions  of  source  code  must  retain  the above copyright
 
-     notice,  this  list  of  conditions  and  the  following  disclaimer.
 
-   * Redistributions  in  binary  form must reproduce the above copyright
 
-     notice,  this list of conditions and the following disclaimer in the
 
-     documentation  and/or other materials provided with the distribution.
 
-   * Neither  the  name of the University of Tennessee, Knoxville nor the
 
-     names of its contributors may be used to endorse or promote products
 
-     derived from this software without specific prior written permission.
 
-   THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 
-   ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 
-   LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 
-   A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 
-   HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 
-   SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
 
-   LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 
-   DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 
-   THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 
-   (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 
-   OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-   */
 
/*
 * Floating-point operation counts of the tile kernels.
 * FMULS_x and FADDS_x count the multiplications and the additions of kernel
 * x, FLOPS_x is their sum. All counts are computed as doubles.
 */

/* Cholesky factorization of an n x n tile */
#define FMULS_POTRF(n) ((double)(n) * (((1. / 6.) * (double)(n) + 0.5) * (double)(n) + (1. / 3.)))
#define FADDS_POTRF(n) ((double)(n) * (((1. / 6.) * (double)(n)      ) * (double)(n) - (1. / 6.)))
#define FLOPS_SPOTRF(n) (FMULS_POTRF((n)) + FADDS_POTRF((n)))

/* Triangular multiply with an m x m triangular operand and n vectors */
#define FMULS_TRMM_2(m, n) (0.5 * (double)(n) * (double)(m) * ((double)(m)+1.))
#define FADDS_TRMM_2(m, n) (0.5 * (double)(n) * (double)(m) * ((double)(m)-1.))

/* The side selection is hard-wired to the non-left case, hence the swapped
 * arguments */
#define FMULS_TRMM(m, n) (FMULS_TRMM_2((n), (m)))
#define FADDS_TRMM(m, n) (FADDS_TRMM_2((n), (m)))

/* Triangular solve has the same operation counts as triangular multiply.
 * The additions must come from FADDS_TRMM: aliasing them to FMULS_TRMM
 * overestimates FLOPS_STRSM(m, n) by m*n operations. */
#define FMULS_TRSM FMULS_TRMM
#define FADDS_TRSM FADDS_TRMM
#define FLOPS_STRSM(m, n) (FMULS_TRSM((m), (n)) + FADDS_TRSM((m), (n)))

/* Matrix multiply of an m x k by a k x n matrix */
#define FMULS_GEMM(m, n, k) ((double)(m) * (double)(n) * (double)(k))
#define FADDS_GEMM(m, n, k) ((double)(m) * (double)(n) * (double)(k))
#define FLOPS_SGEMM(m, n, k) (FMULS_GEMM((m), (n), (k)) + FADDS_GEMM((m), (n), (k)))
 
/* Tags for spotting tasks in the trace: each macro puts its own constant in
 * bits 60 and up, and packs its indices in the lower bits */
#define TAG11(k) \
	((starpu_tag_t)((1ULL<<60) | (unsigned long long)(k)))

#define TAG21(k,j) \
	((starpu_tag_t)((3ULL<<60) \
			| ((unsigned long long)(k)<<32) \
			| (unsigned long long)(j)))

#define TAG22(k,i,j) \
	((starpu_tag_t)((4ULL<<60) \
			| ((unsigned long long)(k)<<32) \
			| ((unsigned long long)(i)<<16) \
			| (unsigned long long)(j)))

/* Arbitrary tile size */
#define	TILE_SIZE	512
 
- /*
 
-  * Kernel time performance models, would normally be provided by measurements
 
-  */
 
- /* We assume that GEMM scales perfectly with frequency */
 
- #define GEMM_GFLOPS 50.	/* At full speed */
 
- #define GEMM_FLOPS(N) FLOPS_SGEMM(N, N, N)
 
- #define GEMM_TIME(N) (GEMM_FLOPS(TILE_SIZE) / (GEMM_GFLOPS * 1000000000.))
 
- static double _gemm_time(float frequency)
 
- {
 
- 	double ret = GEMM_TIME(N);
 
- 	/* Fix according to real frequency, linear */
 
- 	ret = GEMM_TIME(N) / (frequency / freq_fast);
 
- 	return ret * 1000000.;
 
- }
 
/* Per-worker cost function of GEMM: modeled duration at the frequency that
 * worker `workerid` uses for implementation `i`. The task itself is not
 * looked at. */
static double gemm_time(struct starpu_task *t, unsigned workerid, unsigned i)
{
	float freq = frequency(workerid, i);

	(void)t;
	return _gemm_time(freq);
}
 
- /* We assume that TRSM decays a bit with frequency */
 
- #define TRSM_DECAY 0.5
 
- #define TRSM_FLOPS(N) FLOPS_STRSM(N, N)
 
- static double _trsm_time(float frequency)
 
- {
 
- 	double ret = GEMM_TIME(N)*0.7; /* as typically observed */
 
- 	/* Fix according to real frequency, root */
 
- 	ret = ret / (pow(frequency - freq_min/2, TRSM_DECAY) / pow(freq_fast - freq_min/2, TRSM_DECAY));
 
- 	return ret * 1000000.;
 
- }
 
/* Per-worker cost function of TRSM: modeled duration at the frequency that
 * worker `workerid` uses for implementation `i`. The task itself is not
 * looked at. */
static double trsm_time(struct starpu_task *t, unsigned workerid, unsigned i)
{
	float freq = frequency(workerid, i);

	(void)t;
	return _trsm_time(freq);
}
 
- /* We assume that POTRF decays strongly with frequency */
 
- #define POTRF_DECAY 0.5
 
- #define POTRF_FLOPS(N) FLOPS_SPOTRF(N)
 
- static double _potrf_time(float frequency)
 
- {
 
- 	double ret = GEMM_TIME(N)*1.2; /* as typically observed */
 
- 	/* Fix according to real frequency, asymptote */
 
- 	ret = ret / (1. - POTRF_DECAY * ((freq_min/(frequency-freq_min/2)) - (freq_min/(freq_fast-freq_min/2))));
 
- 	return ret * 1000000.;
 
- }
 
/* Per-worker cost function of POTRF: modeled duration at the frequency that
 * worker `workerid` uses for implementation `i`. The task itself is not
 * looked at. */
static double potrf_time(struct starpu_task *t, unsigned workerid, unsigned i)
{
	float freq = frequency(workerid, i);

	(void)t;
	return _potrf_time(freq);
}
 
/* Stub used as the kernel function of every codelet; it shouldn't be getting
 * called in simgrid mode, so it only complains on stderr. Both arguments are
 * ignored. */
void dummy_func(void *descr[], void *_args)
{
	(void)descr;
	(void)_args;

	fputs("?? shouldn't be called\n", stderr);
}
 
- /* Define the codelets */
 
- #define CODELET(kernel, nb, ...) \
 
- static double kernel##_energy(struct starpu_task *t, unsigned workerid, unsigned i) \
 
- { \
 
- 	double time = kernel##_time(t, workerid, i); \
 
- 	return power(frequency(workerid, i)) * time / 1000000.; \
 
- } \
 
- \
 
- static struct starpu_perfmodel kernel##_perf_model = \
 
- { \
 
- 	.symbol = #kernel, \
 
- 	.type = STARPU_PER_WORKER, \
 
- 	.worker_cost_function = kernel##_time, \
 
- }; \
 
- \
 
- static struct starpu_perfmodel kernel##_energy_model = \
 
- { \
 
- 	.symbol = #kernel "_energy", \
 
- 	.type = STARPU_PER_WORKER, \
 
- 	.worker_cost_function = kernel##_energy, \
 
- }; \
 
- \
 
- static struct starpu_codelet kernel##_cl = \
 
- { \
 
- 	.cpu_funcs = { dummy_func }, \
 
- 	.nbuffers = nb, \
 
- 	.modes = {__VA_ARGS__}, \
 
- 	.model = &kernel##_perf_model, \
 
- 	.energy_model = &kernel##_energy_model, \
 
- };
 
- CODELET(potrf, 1, STARPU_RW)
 
- CODELET(trsm, 2, STARPU_R, STARPU_RW)
 
- CODELET(gemm, 3, STARPU_R, STARPU_R, STARPU_RW)
 
- int main(int argc, char *argv[])
 
- {
 
- 	/* Initialize environment variables */
 
- 	if (!getenv("STARPU_IDLE_POWER"))
 
- 		setenv("STARPU_IDLE_POWER", "30", 1);
 
- 	const char *hostname = getenv("STARPU_HOSTNAME");
 
- 	if (!hostname || strcmp(hostname, "sirocco"))
 
- 	{
 
- 		printf("Warning: This is expected to be run with export STARPU_HOSTNAME=sirocco\n");
 
- 	}
 
- 	freq_min =  starpu_get_env_number_default("STARPU_FREQ_MIN", 1200);
 
- 	freq_slow =  starpu_get_env_number_default("STARPU_FREQ_SLOW", 1200);
 
- 	freq_fast =  starpu_get_env_number_default("STARPU_FREQ_FAST", 3500);
 
- 	power_min =  starpu_get_env_float_default("STARPU_POWER_MIN", 2);
 
- 	power_fast =  starpu_get_env_float_default("STARPU_POWER_FAST", 8.2);
 
- 	/* Number of slow CPU cores */
 
- 	ncpu_slow = starpu_get_env_number_default("STARPU_NCPU_SLOW", -1);
 
- 	if (ncpu_slow == -1)
 
- 	{
 
- 		/* Enable second implementation.  */
 
- 		potrf_cl.cpu_funcs[1] = dummy_func;
 
- 		trsm_cl.cpu_funcs[1] = dummy_func;
 
- 		gemm_cl.cpu_funcs[1] = dummy_func;
 
- 	}
 
- 	/* Initialize StarPU */
 
- 	struct starpu_conf conf;
 
- 	starpu_conf_init(&conf);
 
- 	conf.ncuda = 0;
 
- 	conf.nopencl = 0;
 
- 	if (!getenv("STARPU_SCHED"))
 
- 		conf.sched_policy_name = "dmdas";
 
- 	int ret = starpu_initialize(&conf, &argc, &argv);
 
- 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 
- 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
- 	unsigned N, k, m, n, iter, NITER;
 
- 	if (argc < 2)
 
- 		N = 40;
 
- 	else
 
- 		N = atoi(argv[1]);
 
- 	if (argc < 3)
 
- 		NITER = 10;
 
- 	else
 
- 		NITER = atoi(argv[2]);
 
- 	if (N == 0)
 
- 	{
 
- 		starpu_shutdown();
 
- 		return 0;
 
- 	}
 
- 	/* Give parameter summary to user */
 
- 	printf("freqs (MHz):\n");
 
- 	printf("%f %f %f\n", freq_min, freq_slow, freq_fast);
 
- 	printf("\n");
 
- 	printf("per-core power (W):\n");
 
- 	printf("%f %f\n", power_min, power_fast);
 
- 	printf("%f %f %f\n", power(freq_min), power(freq_slow), power(freq_fast));
 
- 	printf("\n");
 
- 	printf("kernel perfs in GFlops (min, slow, fast):\n");
 
- 	printf("gemm:\t%f %f %f\n",
 
- 			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_min) / 1000,
 
- 			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_slow) / 1000,
 
- 			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_fast) / 1000);
 
- 	printf("trsm:\t%f %f %f\n",
 
- 			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_min) / 1000,
 
- 			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_slow) / 1000,
 
- 			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_fast) / 1000);
 
- 	printf("potrf:\t%f %f %f\n",
 
- 			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_min) / 1000,
 
- 			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_slow) / 1000,
 
- 			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_fast) / 1000);
 
- 	printf("\n");
 
- 	printf("kernel efficiency in GFlops/W (min, slow, fast):\n");
 
- 	printf("gemm:\t%f %f %f\n",
 
- 			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_min) / 1000 / power(freq_min),
 
- 			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_slow) / 1000 / power(freq_slow),
 
- 			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_fast) / 1000 / power(freq_fast));
 
- 	printf("trsm:\t%f %f %f\n",
 
- 			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_min) / 1000 / power(freq_min),
 
- 			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_slow) / 1000 / power(freq_slow),
 
- 			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_fast) / 1000 / power(freq_fast));
 
- 	printf("potrf:\t%f %f %f\n",
 
- 			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_min) / 1000 / power(freq_min),
 
- 			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_slow) / 1000 / power(freq_slow),
 
- 			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_fast) / 1000 / power(freq_fast));
 
- 	printf("\n");
 
- 	/* Now compute */
 
- 	starpu_data_handle_t A[N][N];
 
- 	for (m = 0; m < N; m++)
 
- 		for (n = 0; n < N; n++)
 
- 			starpu_void_data_register(&A[m][n]);
 
- 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 
- 	double timing_sum = 0.;
 
- 	double energy_sum = 0.;
 
- 	double timing_sum2 = 0.;
 
- 	double energy_sum2 = 0.;
 
- 	for (iter = 0; iter < NITER; iter++)
 
- 	{
 
- 		double start = starpu_timing_now();
 
- 		double start_energy = starpu_energy_used();
 
- 		for (k = 0; k < N; k++)
 
- 		{
 
- 			starpu_iteration_push(k);
 
- 			ret = starpu_task_insert(&potrf_cl,
 
- 						 STARPU_PRIORITY, unbound_prio ? (int)(2*N - 2*k) : STARPU_MAX_PRIO,
 
- 						 STARPU_RW, A[k][k],
 
- 						 STARPU_FLOPS, (double) FLOPS_SPOTRF(TILE_SIZE),
 
- 						 STARPU_TAG_ONLY, TAG11(k),
 
- 						 0);
 
- 			if (ret == -ENODEV) return 77;
 
- 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
- 			for (m = k+1; m<N; m++)
 
- 			{
 
- 				ret = starpu_task_insert(&trsm_cl,
 
- 							 STARPU_PRIORITY, unbound_prio ? (int)(2*N - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 
- 							 STARPU_R, A[k][k],
 
- 							 STARPU_RW, A[m][k],
 
- 							 STARPU_FLOPS, (double) FLOPS_STRSM(TILE_SIZE, TILE_SIZE),
 
- 							 STARPU_TAG_ONLY, TAG21(m,k),
 
- 							 0);
 
- 				if (ret == -ENODEV) return 77;
 
- 				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
- 			}
 
- 			for (m = k+1; m<N; m++)
 
- 			{
 
- 				for (n = k+1; n<N; n++)
 
- 				{
 
- 					if (n <= m)
 
- 					{
 
- 						ret = starpu_task_insert(&gemm_cl,
 
- 									 STARPU_PRIORITY, unbound_prio ? (int)(2*N - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 
- 									 STARPU_R, A[m][k],
 
- 									 STARPU_R, A[n][k],
 
- 									 gemm_cl.modes[2], A[m][n],
 
- 									 STARPU_FLOPS, (double) FLOPS_SGEMM(TILE_SIZE, TILE_SIZE, TILE_SIZE),
 
- 									 STARPU_TAG_ONLY, TAG22(k,m,n),
 
- 									 0);
 
- 						if (ret == -ENODEV) return 77;
 
- 						STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
- 					}
 
- 				}
 
- 			}
 
- 			starpu_iteration_pop();
 
- 		}
 
- 		starpu_task_wait_for_all();
 
- 		double end = starpu_timing_now();
 
- 		double end_energy = starpu_energy_used();
 
- 		double timing = end - start;
 
- 		double energy = end_energy - start_energy;
 
- 		timing_sum += timing;
 
- 		timing_sum2 += timing*timing;
 
- 		energy_sum += energy;
 
- 		energy_sum2 += energy*energy;
 
- 	}
 
- 	/* Make stats and print */
 
- 	double timing_avg = timing_sum / NITER;
 
- 	double timing_dev = sqrt((fabs(timing_sum2 - (timing_sum*timing_sum)/NITER))/NITER);
 
- 	double energy_avg = energy_sum / NITER;
 
- 	double energy_dev = sqrt((fabs(energy_sum2 - (energy_sum*energy_sum)/NITER))/NITER);
 
- 	double flop = FLOPS_SPOTRF(TILE_SIZE * N);
 
- 	unsigned toprint_slow;
 
- 	if (ncpu_slow >= 0)
 
- 		toprint_slow = ncpu_slow;
 
- 	else
 
- 		toprint_slow = freq_slow;
 
- 	printf("# size\t%s\tms +-\tGFlop/s +-\ten. (J) +-\tGF/W\n",
 
- 			ncpu_slow >= 0 ? "nslow" : "fslow");
 
- 	printf("%u\t%u\t%.0f %.1f\t%.1f %.1f\t%.1f %.1f\t%.2f\n",
 
- 			TILE_SIZE * N,
 
- 			toprint_slow,
 
- 			timing_avg/1000,
 
- 			timing_dev/1000,
 
- 			(flop/timing_avg/1000.0f),
 
- 			(flop/(timing_avg*timing_avg)/1000.f)*timing_dev,
 
- 			energy_avg, energy_dev,
 
- 			flop/1000000000./energy_avg);
 
- 	for (m = 0; m < N; m++)
 
- 		for (n = 0; n < N; n++)
 
- 			starpu_data_unregister(A[m][n]);
 
- out:
 
- 	starpu_shutdown();
 
- 	return 0;
 
- }
 
 
  |