ソースを参照

Clean up the timing facilities: instead of storing dates as "double", we now use
"struct timespec", which is more efficient to manipulate.

Cédric Augonnet 15 年 前
コミット
5664517e7a
共有 5 個のファイルを変更した 164 個の追加 145 個の削除を含む
  1. 106 40
      src/common/timing.c
  2. 34 81
      src/common/timing.h
  3. 8 8
      src/drivers/cpu/driver_cpu.c
  4. 8 8
      src/drivers/cuda/driver_cuda.c
  5. 8 8
      src/drivers/opencl/driver_opencl.c

+ 106 - 40
src/common/timing.c

@@ -1,6 +1,6 @@
 /*
  * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,51 +16,83 @@
 
 #include "timing.h"
 
-static double reference_start_time;
-
 #ifdef HAVE_CLOCK_GETTIME
-
-#define TICK_DIFF(t1, t2) ((long long)((t2).ts.tv_sec*1e9 + (t2).ts.tv_nsec) + \
-				- (long long)((t1).ts.tv_sec*1e9) + (long long)(t1).ts.tv_nsec)
-#define TIMING_DELAY(t1, t2) _starpu_tick2usec(TICK_DIFF((t1), (t2)))
-
-void _starpu_timing_init(void)
-{
-	reference_start_time = _starpu_timing_now();
+#include <time.h>
+#ifndef _POSIX_C_SOURCE
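+/* for clock_gettime; NOTE(review): a feature test macro only takes effect if defined before the first system header, but <time.h> is already included above -- confirm ordering */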
+#define _POSIX_C_SOURCE 199309L
+#endif
+
+#ifdef __linux__
+#ifndef CLOCK_MONOTONIC_RAW
+#define CLOCK_MONOTONIC_RAW 4
+#endif
+#endif
+
+static struct timespec reference_start_time_ts;
+
+/* Modern CPUs' clocks are usually not synchronized so we use a monotonic clock
+ * to have consistent timing measurements. The CLOCK_MONOTONIC_RAW clock is not
+ * subject to NTP adjustments, but is not available on all systems (in that
+ * case we use the CLOCK_MONOTONIC clock instead). */
+void __starpu_clock_gettime(struct timespec *ts) {
+#ifdef CLOCK_MONOTONIC_RAW
+	static int raw_supported = 0;
+	switch (raw_supported) {
+	case -1:
+		break;
+	case 1:
+		clock_gettime(CLOCK_MONOTONIC_RAW, ts);
+		return;
+	case 0:
+		if (clock_gettime(CLOCK_MONOTONIC_RAW, ts)) {
+			raw_supported = -1;
+			break;
+		} else {
+			raw_supported = 1;
+			return;
+		}
+	}
+#endif
+	clock_gettime(CLOCK_MONOTONIC, ts);
 }
 
-inline double _starpu_tick2usec(long long t)
+void _starpu_timing_init(void)
 {
-  return (double)(t)/1000;
+	__starpu_clock_gettime(&reference_start_time_ts);
 }
 
-inline double _starpu_timing_delay(starpu_tick_t *t1, starpu_tick_t *t2)
+void starpu_clock_gettime(struct timespec *ts)
 {
-	double d1, d2;
+	struct timespec absolute_ts;
 
-	d1 = _starpu_tick2usec((t1->ts.tv_sec*1e9) + t1->ts.tv_nsec);
-	d2 = _starpu_tick2usec((t2->ts.tv_sec*1e9) + t2->ts.tv_nsec);
+	/* Read the current time */
+	__starpu_clock_gettime(&absolute_ts);
 
-	return (d2 - d1);;
+	/* Compute the relative time since initialization */
+	_starpu_timespec_sub(&absolute_ts, &reference_start_time_ts, ts);
 }
 
-/* returns the current time in us */
-inline double _starpu_timing_now(void)
-{
-	starpu_tick_t tick_now;
-	STARPU_GET_TICK(tick_now);
+#else // !HAVE_CLOCK_GETTIME
 
-	double absolute_now = _starpu_tick2usec(((tick_now).ts.tv_sec*1e9) + (tick_now).ts.tv_nsec);
-
-	return (absolute_now - reference_start_time);
-}
+#if defined(__i386__) || defined(__pentium__) || defined(__pentiumpro__) || defined(__i586__) || defined(__i686__) || defined(__k6__) || defined(__k7__) || defined(__x86_64__)
+typedef union starpu_u_tick
+{
+  uint64_t tick;
 
-#else // HAVE_CLOCK_GETTIME
+  struct
+  {
+    uint32_t low;
+    uint32_t high;
+  }
+  sub;
+} starpu_tick_t;
 
+#define STARPU_GET_TICK(t) __asm__ volatile("rdtsc" : "=a" ((t).sub.low), "=d" ((t).sub.high))
 #define TICK_RAW_DIFF(t1, t2) ((t2).tick - (t1).tick)
 #define TICK_DIFF(t1, t2) (TICK_RAW_DIFF(t1, t2) - residual)
-#define TIMING_DELAY(t1, t2) _starpu_tick2usec(TICK_DIFF(t1, t2))
 
+static starpu_tick_t reference_start_tick;
 static double scale = 0.0;
 static unsigned long long residual = 0;
 
@@ -95,30 +127,64 @@ void _starpu_timing_init(void)
       (double)(TICK_DIFF(t1, t2));
   }
 
-  reference_start_time = _starpu_timing_now();
+  STARPU_GET_TICK(reference_start_tick);
 
   inited = 1;
 }
 
-inline double _starpu_tick2usec(long long t)
+void starpu_clock_gettime(struct timespec *ts)
 {
-  return (double)(t)*scale;
+	starpu_tick_t tick_now;
+
+	STARPU_GET_TICK(tick_now);
+
+	uint64_t elapsed_ticks = TICK_DIFF(reference_start_tick, tick_now);
+
+	/* Convert the elapsed tick count into nanoseconds so that we can
+	 * fill the timespec structure. */
+	uint64_t elapsed_ns = (uint64_t)(((double)elapsed_ticks)*(scale*1000.0));
+	
+	long tv_nsec = (elapsed_ns % 1000000000);
+	time_t tv_sec = (elapsed_ns / 1000000000);
+
+	ts->tv_sec = tv_sec;
+	ts->tv_nsec = tv_nsec;
 }
 
-inline double _starpu_timing_delay(starpu_tick_t *t1, starpu_tick_t *t2)
+#else // !HAVE_CLOCK_GETTIME & no rdtsc
+#warning StarPU could not find a timer, clock will always return 0
+void _starpu_timing_init(void)
 {
-	return TIMING_DELAY(*t1, *t2);
 }
 
-inline double _starpu_timing_now(void)
+void starpu_clock_gettime(struct timespec *ts)
 {
-	starpu_tick_t tick_now;
-	STARPU_GET_TICK(tick_now);
+	timerclear(ts);
+}
+#endif
+#endif // HAVE_CLOCK_GETTIME
 
-	double absolute_now =  _starpu_tick2usec(tick_now.tick);
+/* Returns the time elapsed between start and end in microseconds */
+double _starpu_timing_timespec_delay_us(struct timespec *start, struct timespec *end)
+{
+	struct timespec diff;
+	
+	_starpu_timespec_sub(end, start, &diff);
 
-	return (absolute_now - reference_start_time);
+	double us = (diff.tv_sec*1e6) + (diff.tv_nsec*1e-3);
 
+	return us;
 }
 
-#endif // HAVE_CLOCK_GETTIME
+double _starpu_timing_timespec_to_us(struct timespec *ts)
+{
+	return (ts->tv_sec*1e6) + (ts->tv_nsec*1e-3);
+}
+
+double _starpu_timing_now(void)
+{
+	struct timespec now;
+	starpu_clock_gettime(&now);
+
+	return _starpu_timing_timespec_to_us(&now);
+}

+ 34 - 81
src/common/timing.h

@@ -18,12 +18,8 @@
 #define TIMING_H
 
 /*
- * -- Initialiser la bibliothèque avec _starpu_timing_init();
- * -- Mémoriser un timestamp :
- *  starpu_tick_t t;
- *  STARPU_GET_TICK(t);
- * -- Calculer un intervalle en microsecondes :
- *  TIMING_DELAY(t1, t2);
+ * _starpu_timing_init must be called prior to using any of these timing
+ * functions.
  */
 
 #include <sys/time.h>
@@ -32,81 +28,38 @@
 #include <common/config.h>
 #include <starpu.h>
 
-#ifdef HAVE_CLOCK_GETTIME
-#include <time.h>
-#ifndef _POSIX_C_SOURCE
-/* for clock_gettime */
-#define _POSIX_C_SOURCE 199309L
-#endif
-
-/* we use the usual gettimeofday method */
-typedef struct starpu_tick_s
-{
-	struct timespec ts;
-} starpu_tick_t;
-
-#ifdef __linux__
-#ifndef CLOCK_MONOTONIC_RAW
-#define CLOCK_MONOTONIC_RAW 4
-#endif
-#endif
-/* Modern CPUs' clocks are usually not synchronized so we use a monotonic clock
- * to have consistent timing measurements. The CLOCK_MONOTONIC_RAW clock is not
- * subject to NTP adjustments, but is not available on all systems (in that
- * case we use the CLOCK_MONOTONIC clock instead). */
-static inline void starpu_get_tick(starpu_tick_t *t) {
-#ifdef CLOCK_MONOTONIC_RAW
-	static int raw_supported = 0;
-	switch (raw_supported) {
-	case -1:
-		break;
-	case 1:
-		clock_gettime(CLOCK_MONOTONIC_RAW, &t->ts);
-		return;
-	case 0:
-		if (clock_gettime(CLOCK_MONOTONIC_RAW, &t->ts)) {
-			raw_supported = -1;
-			break;
-		} else {
-			raw_supported = 1;
-			return;
-		}
-	}
-#endif
-	clock_gettime(CLOCK_MONOTONIC, &t->ts);
-}
-#define STARPU_GET_TICK(t) starpu_get_tick(&(t))
-
-#else // !HAVE_CLOCK_GETTIME
-
-typedef union starpu_u_tick
-{
-  uint64_t tick;
-
-  struct
-  {
-    uint32_t low;
-    uint32_t high;
-  }
-  sub;
-} starpu_tick_t;
-
-#if defined(__i386__) || defined(__pentium__) || defined(__pentiumpro__) || defined(__i586__) || defined(__i686__) || defined(__k6__) || defined(__k7__) || defined(__x86_64__)
-#  define STARPU_GET_TICK(t) __asm__ volatile("rdtsc" : "=a" ((t).sub.low), "=d" ((t).sub.high))
-#else
-//#  error "Processeur non-supporté par timing.h"
-/* XXX */
-//#warning "unsupported processor STARPU_GET_TICK returns 0"
-#  define STARPU_GET_TICK(t) do {} while(0);
-#endif
-
-#endif // HAVE_CLOCK_GETTIME
-
-void __attribute__ ((unused)) _starpu_timing_init(void);
-inline double __attribute__ ((unused)) _starpu_tick2usec(long long t);
-inline double __attribute__ ((unused)) _starpu_timing_delay(starpu_tick_t *t1, starpu_tick_t *t2);
-
-inline double __attribute__ ((unused)) _starpu_timing_now(void);
+/* Computes result = a + b */
+#define _starpu_timespec_add(a, b, result)				\
+	do {								\
+		(result)->tv_sec = (a)->tv_sec + (b)->tv_sec;		\
+   		(result)->tv_nsec = (a)->tv_nsec + (b)->tv_nsec; 	\
+		if ((result)->tv_nsec >= 1000000000)			\
+		{							\
+			++(result)->tv_sec;				\
+			(result)->tv_nsec -= 1000000000;		\
+		}							\
+	} while (0)
+
+
+/* Computes result = a - b */
+#define _starpu_timespec_sub(a, b, result)				\
+	do {								\
+		(result)->tv_sec = (a)->tv_sec - (b)->tv_sec;		\
+   		(result)->tv_nsec = (a)->tv_nsec - (b)->tv_nsec; 	\
+		if ((result)->tv_nsec < 0)				\
+		{							\
+			--(result)->tv_sec;				\
+			(result)->tv_nsec += 1000000000;		\
+		}							\
+	} while (0)
+
+void _starpu_timing_init(void);
+void starpu_clock_gettime(struct timespec *ts);
+double _starpu_timing_now(void);
+
+/* Returns the time elapsed between start and end in microseconds */
+double _starpu_timing_timespec_delay_us(struct timespec *start, struct timespec *end);
+double _starpu_timing_timespec_to_us(struct timespec *ts);
 
 #endif /* TIMING_H */
 

+ 8 - 8
src/drivers/cpu/driver_cpu.c

@@ -27,8 +27,8 @@
 static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args)
 {
 	int ret;
-	starpu_tick_t codelet_start, codelet_end;
-	starpu_tick_t codelet_start_comm, codelet_end_comm;
+	struct timespec codelet_start, codelet_end;
+	struct timespec codelet_start_comm, codelet_end_comm;
 	int64_t start_time;
 	int64_t end_time;
 
@@ -44,12 +44,12 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args)
 		calibrate_model = 1;
 
 	if (calibrate_model || STARPU_BENCHMARK_COMM)
-		STARPU_GET_TICK(codelet_start_comm);
+		starpu_clock_gettime(&codelet_start_comm);
 
 	ret = _starpu_fetch_task_input(task, 0);
 
 	if (calibrate_model || STARPU_BENCHMARK_COMM)
-		STARPU_GET_TICK(codelet_end_comm);
+		starpu_clock_gettime(&codelet_end_comm);
 
 	if (ret != 0) {
 		/* there was not enough memory so the codelet cannot be executed right now ... */
@@ -60,7 +60,7 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args)
 	STARPU_TRACE_START_CODELET_BODY(j);
 
 	if (calibrate_model || STARPU_BENCHMARK_COMM)
-		STARPU_GET_TICK(codelet_start);
+		starpu_clock_gettime(&codelet_start);
 
 	int profiling_status = starpu_profiling_status_get();
 
@@ -76,7 +76,7 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args)
 	cl->per_worker_stats[workerid]++;
 	
 	if (calibrate_model || STARPU_BENCHMARK_COMM)
-		STARPU_GET_TICK(codelet_end);
+		starpu_clock_gettime(&codelet_end);
 
 	if (profiling_status)
 		end_time = (int64_t)_starpu_timing_now();
@@ -101,8 +101,8 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args)
 
 	if (calibrate_model || STARPU_BENCHMARK_COMM)
 	{
-		double measured = _starpu_timing_delay(&codelet_start, &codelet_end);
-		double measured_comm = _starpu_timing_delay(&codelet_start_comm, &codelet_end_comm);
+		double measured = _starpu_timing_timespec_delay_us(&codelet_start, &codelet_end);
+		double measured_comm = _starpu_timing_timespec_delay_us(&codelet_start_comm, &codelet_end_comm);
 
 //		fprintf(stderr, "%d\t%d\n", (int)j->penality, (int)measured_comm);
 		cpu_args->jobq->total_computation_time += measured;

+ 8 - 8
src/drivers/cuda/driver_cuda.c

@@ -90,8 +90,8 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 	struct starpu_task *task = j->task;
 
 	cudaError_t cures;
-	starpu_tick_t codelet_start, codelet_end;
-	starpu_tick_t codelet_start_comm, codelet_end_comm;
+	struct timespec codelet_start, codelet_end;
+	struct timespec codelet_start_comm, codelet_end_comm;
 	int64_t start_time;
 	int64_t end_time;
 
@@ -111,7 +111,7 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 		cures = cudaThreadSynchronize();
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
-		STARPU_GET_TICK(codelet_start_comm);
+		starpu_clock_gettime(&codelet_start_comm);
 	}
 
 	ret = _starpu_fetch_task_input(task, mask);
@@ -127,7 +127,7 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 		cures = cudaThreadSynchronize();
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
-		STARPU_GET_TICK(codelet_end_comm);
+		starpu_clock_gettime(&codelet_end_comm);
 	}
 
 	STARPU_TRACE_START_CODELET_BODY(j);
@@ -142,7 +142,7 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 
 	cl_func func = cl->cuda_func;
 	STARPU_ASSERT(func);
-	STARPU_GET_TICK(codelet_start);
+	starpu_clock_gettime(&codelet_start);
 	func(task->interface, task->cl_arg);
 
 	cl->per_worker_stats[workerid]++;
@@ -165,7 +165,7 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 		_starpu_worker_update_profiling_info(workerid, end_time - start_time, 0, 1);
 
 	if (calibrate_model || STARPU_BENCHMARK_COMM)
-		STARPU_GET_TICK(codelet_end);
+		starpu_clock_gettime(&codelet_end);
 
 	args->status = STATUS_UNKNOWN;
 
@@ -173,8 +173,8 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 
 	if (calibrate_model || STARPU_BENCHMARK_COMM)
 	{
-		double measured = _starpu_timing_delay(&codelet_start, &codelet_end);
-		double measured_comm = _starpu_timing_delay(&codelet_start_comm, &codelet_end_comm);
+		double measured = _starpu_timing_timespec_delay_us(&codelet_start, &codelet_end);
+		double measured_comm = _starpu_timing_timespec_delay_us(&codelet_start_comm, &codelet_end_comm);
 
 		args->jobq->total_computation_time += measured;
 		args->jobq->total_communication_time += measured_comm;

+ 8 - 8
src/drivers/opencl/driver_opencl.c

@@ -349,8 +349,8 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 	STARPU_ASSERT(j);
 	struct starpu_task *task = j->task;
 
-	starpu_tick_t codelet_start, codelet_end;
-	starpu_tick_t codelet_start_comm, codelet_end_comm;
+	struct timespec codelet_start, codelet_end;
+	struct timespec codelet_start_comm, codelet_end_comm;
 	int64_t start_time;
 	int64_t end_time;
 
@@ -368,7 +368,7 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 	if (STARPU_BENCHMARK_COMM)
 	{
                 //barrier(CLK_GLOBAL_MEM_FENCE);
-		STARPU_GET_TICK(codelet_start_comm);
+		starpu_clock_gettime(&codelet_start_comm);
 	}
 
 	ret = _starpu_fetch_task_input(task, mask);
@@ -382,7 +382,7 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 	if (calibrate_model || STARPU_BENCHMARK_COMM)
 	{
                 //barrier(CLK_GLOBAL_MEM_FENCE);
-		STARPU_GET_TICK(codelet_end_comm);
+		starpu_clock_gettime(&codelet_end_comm);
 	}
 
 	STARPU_TRACE_START_CODELET_BODY(j);
@@ -397,7 +397,7 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 
 	cl_func func = cl->opencl_func;
 	STARPU_ASSERT(func);
-	STARPU_GET_TICK(codelet_start);
+	starpu_clock_gettime(&codelet_start);
 	func(task->interface, task->cl_arg);
 
 	cl->per_worker_stats[workerid]++;
@@ -418,7 +418,7 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 	if (profiling_status)
 		_starpu_worker_update_profiling_info(workerid, end_time - start_time, 0, 1);
 
-	STARPU_GET_TICK(codelet_end);
+	starpu_clock_gettime(&codelet_end);
 
 	args->status = STATUS_UNKNOWN;
 
@@ -426,8 +426,8 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 
 	if (calibrate_model || STARPU_BENCHMARK_COMM)
 	{
-		double measured = _starpu_timing_delay(&codelet_start, &codelet_end);
-		double measured_comm = _starpu_timing_delay(&codelet_start_comm, &codelet_end_comm);
+		double measured = _starpu_timing_timespec_delay_us(&codelet_start, &codelet_end);
+		double measured_comm = _starpu_timing_timespec_delay_us(&codelet_start_comm, &codelet_end_comm);
 
 		args->jobq->total_computation_time += measured;
 		args->jobq->total_communication_time += measured_comm;