calibration bug fixed

Andra Hugo, 13 years ago
parent commit 07ff07b555
5 changed files with 36 additions and 73 deletions
  1. src/core/jobs.c (+3 -4)
  2. src/core/perfmodel/perfmodel_bus.c (+23 -57)
  3. src/core/sched_policy.c (+0 -2)
  4. src/core/task.c (+7 -7)
  5. src/sched_policies/heft.c (+3 -3)

+ 3 - 4
src/core/jobs.c

@@ -192,7 +192,7 @@ void _starpu_handle_job_termination(starpu_job_t j, unsigned job_is_already_lock
 
 	/* control task should not execute post_exec_hook */
 	if(task->cl != NULL && !task->control_task)
-		_starpu_sched_post_exec_hook(task);
+	  _starpu_sched_post_exec_hook(task);
 
 	STARPU_TRACE_TASK_DONE(j);
 
@@ -238,11 +238,10 @@ void _starpu_handle_job_termination(starpu_job_t j, unsigned job_is_already_lock
 		_starpu_decrement_nsubmitted_tasks();
 	}
 
+	_starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx);
+
 	if(workerid >= 0)
-	{
 		_starpu_decrement_nsubmitted_tasks_of_worker(workerid);
-		_starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx);
-	}			
 }
 
 /* This function is called when a new task is submitted to StarPU 

+ 23 - 57
src/core/perfmodel/perfmodel_bus.c

@@ -386,7 +386,7 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
 
 		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
 
-		_STARPU_DISP("BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
+		_STARPU_DISP("BANDWIDTH GPU %d CPU %u - htod %lf - dtoh %lf - %lf\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
 	}
 
 	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
@@ -429,15 +429,12 @@ static void benchmark_all_gpu_devices(void)
 #endif
 
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	ncpus = config->topology.ncpus;
-
-	/* TODO: measure bandwidth between GPU-GPU */
+	ncpus = _starpu_topology_get_nhwcpu(config);
 
 #ifdef STARPU_USE_CUDA
-	ncuda = _starpu_get_cuda_device_count();
+        cudaGetDeviceCount(&ncuda);
 	for (i = 0; i < ncuda; i++)
 	{
-		fprintf(stderr," CUDA %d...", i);
 		/* measure bandwidth between Host and Device i */
 		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, 'C');
 	}
@@ -446,7 +443,6 @@ static void benchmark_all_gpu_devices(void)
         nopencl = _starpu_opencl_get_device_count();
 	for (i = 0; i < nopencl; i++)
 	{
-		fprintf(stderr," OpenCL %d...", i);
 		/* measure bandwith between Host and Device i */
 		measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, 'O');
 	}
@@ -481,7 +477,7 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
 	char hostname[32];
 	char *forced_hostname = getenv("STARPU_HOSTNAME");
 	if (forced_hostname && forced_hostname[0])
-		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
+		snprintf(hostname, sizeof(hostname), forced_hostname);
 	else
 		gethostname(hostname, sizeof(hostname));
 	strncat(path, ".", maxlen);
@@ -509,11 +505,11 @@ static void load_bus_affinity_file_content(void)
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	ncpus = config->topology.ncpus;
+	ncpus = _starpu_topology_get_nhwcpu(config);
         int gpu;
 
 #ifdef STARPU_USE_CUDA
-	ncuda = _starpu_get_cuda_device_count();
+        cudaGetDeviceCount(&ncuda);
 	for (gpu = 0; gpu < ncuda; gpu++)
 	{
 		int ret;
@@ -764,7 +760,7 @@ static void write_bus_latency_file_content(void)
                                 latency = ((src && dst)?2000.0:500.0);
 			}
 
-			fprintf(f, "%f\t", latency);
+			fprintf(f, "%lf\t", latency);
 		}
 
 		fprintf(f, "\n");
@@ -892,21 +888,23 @@ static void write_bus_bandwidth_file_content(void)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 			else if (src != dst)
 			{
-				double slowness_src_to_ram=0.0, slowness_ram_to_dst=0.0;
-				/* Total bandwidth is the harmonic mean of bandwidths */
+                                double time_src_to_ram=0.0, time_ram_to_dst=0.0;
+                                double timing;
+                                /* Bandwidth = (SIZE)/(time i -> ram + time ram -> j)*/
 #ifdef STARPU_USE_CUDA
-				if (src && src <= ncuda)
-					slowness_src_to_ram = cudadev_timing_dtoh[src]/cuda_size;
-				if (dst && dst <= ncuda)
-					slowness_ram_to_dst = cudadev_timing_htod[dst]/cuda_size;
+				time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
+                                time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
+				timing =time_src_to_ram + time_ram_to_dst;
+				bandwidth = 1.0*cuda_size/timing;
 #endif
 #ifdef STARPU_USE_OPENCL
-				if (src > ncuda)
-					slowness_src_to_ram = opencldev_timing_dtoh[src-ncuda]/opencl_size;
-				if (dst > ncuda)
-					slowness_ram_to_dst = opencldev_timing_htod[dst-ncuda]/opencl_size;
+                                if (src > ncuda)
+                                        time_src_to_ram = (src==0)?0.0:opencldev_timing_dtoh[src-ncuda];
+                                if (dst > ncuda)
+                                        time_ram_to_dst = (dst==0)?0.0:opencldev_timing_htod[dst-ncuda];
+				timing =time_src_to_ram + time_ram_to_dst;
+				bandwidth = 1.0*opencl_size/timing;
 #endif
-				bandwidth = 1.0/(slowness_src_to_ram + slowness_ram_to_dst);
 			}
 #endif
 			else {
@@ -914,7 +912,7 @@ static void write_bus_bandwidth_file_content(void)
 			        bandwidth = 0.0;
 			}
 
-			fprintf(f, "%f\t", bandwidth);
+			fprintf(f, "%lf\t", bandwidth);
 		}
 
 		fprintf(f, "\n");
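The bandwidth hunk above drops the per-byte slowness sum (1.0/(slowness_src_to_ram + slowness_ram_to_dst)) in favour of raw transfer times: a src-to-dst copy is staged through RAM, so the estimate becomes SIZE / (time src -> ram + time ram -> dst), exactly as the new comment states. A worked example with made-up numbers (the time unit is whatever the benchmark loop recorded; microseconds assumed here):

	#include <stdio.h>

	int main(void)
	{
		double size = 32.0 * 1024 * 1024;  /* benchmarked buffer, bytes (assumed) */
		double time_src_to_ram = 4000.0;   /* dtoh timing of the src device */
		double time_ram_to_dst = 4500.0;   /* htod timing of the dst device */

		/* Bandwidth = SIZE / (time src -> ram + time ram -> dst) */
		double bandwidth = 1.0 * size / (time_src_to_ram + time_ram_to_dst);
		printf("estimated bandwidth: %lf bytes per time unit\n", bandwidth);
		return 0;
	}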
@@ -923,38 +921,6 @@ static void write_bus_bandwidth_file_content(void)
 	fclose(f);
 }
 
-void starpu_print_bus_bandwidth(FILE *f)
-{
-	int src, dst, maxnode;
-
-        maxnode = ncuda;
-#ifdef STARPU_USE_OPENCL
-        maxnode += nopencl;
-#endif
-
-	fprintf(f, "from\t");
-	fprintf(f, "to RAM\t\t");
-	for (dst = 0; dst < ncuda; dst++)
-		fprintf(f, "to CUDA %d\t", dst);
-	for (dst = 0; dst < nopencl; dst++)
-		fprintf(f, "to OpenCL %d\t", dst);
-	fprintf(f, "\n");
-
-	for (src = 0; src <= maxnode; src++)
-	{
-		if (!src)
-			fprintf(f, "RAM\t");
-		else if (src <= ncuda)
-			fprintf(f, "CUDA %d\t", src-1);
-		else
-			fprintf(f, "OpenCL%d\t", src-ncuda-1);
-		for (dst = 0; dst <= maxnode; dst++)
-			fprintf(f, "%f\t", bandwidth_matrix[src][dst]);
-
-		fprintf(f, "\n");
-	}
-}
-
 static void generate_bus_bandwidth_file(void)
 {
 	if (!was_benchmarked)
@@ -1020,9 +986,9 @@ static void check_bus_config_file()
                 fclose(f);
 
                 // Loading current configuration
-                ncpus = config->topology.ncpus;
+                ncpus = _starpu_topology_get_nhwcpu(config);
 #ifdef STARPU_USE_CUDA
-		ncuda = _starpu_get_cuda_device_count();
+                cudaGetDeviceCount(&ncuda);
 #endif
 #ifdef STARPU_USE_OPENCL
                 nopencl = _starpu_opencl_get_device_count();

+ 0 - 2
src/core/sched_policy.c

@@ -293,9 +293,7 @@ int _starpu_push_task(starpu_job_t j, unsigned job_is_already_locked)
 	unsigned no_workers = 0;
 	unsigned nworkers; 
        
-//	PTHREAD_MUTEX_LOCK(&sched_ctx->changing_ctx_mutex);
 	nworkers = sched_ctx->workers->nworkers;
-//	PTHREAD_MUTEX_UNLOCK(&sched_ctx->changing_ctx_mutex);
 
 	if(nworkers == 0)
 	{

+ 7 - 7
src/core/task.c

@@ -353,17 +353,17 @@ int starpu_task_wait_for_all(void)
 	unsigned sched_ctx = nsched_ctxs == 1 ? 0 : starpu_get_sched_ctx();
 	starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
 
-	/* if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls())) */
-	/* 	return -EDEADLK; */
+/* 	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls())) */
+/* 		return -EDEADLK; */
 
-	/* PTHREAD_MUTEX_LOCK(&submitted_mutex); */
+/* 	PTHREAD_MUTEX_LOCK(&submitted_mutex); */
 
-	/* STARPU_TRACE_TASK_WAIT_FOR_ALL; */
+/* 	STARPU_TRACE_TASK_WAIT_FOR_ALL; */
 
-	/* while (nsubmitted > 0) */
-	/* 	PTHREAD_COND_WAIT(&submitted_cond, &submitted_mutex); */
+/* 	while (nsubmitted > 0) */
+/* 		PTHREAD_COND_WAIT(&submitted_cond, &submitted_mutex); */
 	
-	/* PTHREAD_MUTEX_UNLOCK(&submitted_mutex); */
+/* 	PTHREAD_MUTEX_UNLOCK(&submitted_mutex); */
 	return 0;
 }
 

+ 3 - 3
src/sched_policies/heft.c

@@ -203,7 +203,7 @@ static void heft_push_task_notify(struct starpu_task *task, int workerid)
 }
 
 static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double predicted, int prio, unsigned sched_ctx_id)
-{
+ {
 	/* make sure someone coule execute that task ! */
 	STARPU_ASSERT(best_workerid != -1);
 
@@ -303,6 +303,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 				local_data_penalty[worker_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
 				local_power[worker_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
 				//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_ctx],local_power[worker_ctx],worker,nimpl);
+				//				printf("%d/%d: task length (%lf) exp_end (%lf) local_data_penalty (%lf)\n", worker, worker_ctx, local_task_length[worker_ctx], (exp_start[worker] + exp_len[worker] + local_task_length[worker_ctx]), local_data_penalty[worker_ctx]);
 			}
 			
 			double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
@@ -390,7 +391,6 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 					    &max_exp_end, &best_exp_end,
 					    local_data_penalty,
 					    local_power, &forced_best, bundle, sched_ctx_id);
-	
 	/* If there is no prediction available for that task with that arch we
 	 * want to speed-up calibration time so we force this measurement */
 	if (forced_best != -1){
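The comment in the hunk above explains the forced-calibration path this commit touches: when no timing sample exists for a task/architecture pair, any prediction would be a guess, so the task is pinned to the unmeasured worker once just to collect a sample. A minimal sketch of that decision, with hypothetical names (heft's real push path also handles bundles and priorities):

	/* Returns the worker to push to: forced_best wins when calibration
	 * data is missing, otherwise the smallest predicted completion time. */
	static int pick_worker(const double predicted[], int nworkers, int forced_best)
	{
		if (forced_best != -1)
			return forced_best; /* force a measurement on the unknown arch */

		int best = 0;
		for (int w = 1; w < nworkers; w++)
			if (predicted[w] < predicted[best])
				best = w;
		return best;
	}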
@@ -409,9 +409,9 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	while(workers->has_next(workers))
 	{
 		worker = workers->get_next(workers);
-
 		if (!starpu_worker_may_execute_task(worker, task, 0))
 		{
+		        worker_ctx++;
 			/* no one on that queue may execute this task */
 			continue;
 		}
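The last hunk adds worker_ctx++ to the skip path: the side arrays indexed by worker_ctx (local_task_length[worker_ctx], local_data_penalty[worker_ctx], local_power[worker_ctx] in the earlier hunk) must stay aligned with the iteration even when a worker cannot execute the task. A standalone sketch of the off-by-one this prevents, assuming the loop also advances the index at the end of each normal iteration (not visible in the hunk context):

	#include <stdio.h>

	#define NWORKERS 4

	int main(void)
	{
		int can_execute[NWORKERS] = {1, 0, 1, 1}; /* worker 1 must be skipped */
		double length[NWORKERS] = {0};
		int idx = 0;

		for (int w = 0; w < NWORKERS; w++)
		{
			if (!can_execute[w])
			{
				idx++;       /* the added line: keep slot w aligned */
				continue;    /* without it, worker 2 would write slot 1 */
			}
			length[idx] = 10.0 * w; /* stands in for local_task_length[] */
			idx++;
		}

		printf("slot 2 (worker 2): %lf\n", length[2]); /* prints 20.000000 */
		return 0;
	}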