calibration bug fixed

Andra Hugo, 13 years ago
parent commit 07ff07b555
5 changed files with 36 additions and 73 deletions
  1. src/core/jobs.c (+3 -4)
  2. src/core/perfmodel/perfmodel_bus.c (+23 -57)
  3. src/core/sched_policy.c (+0 -2)
  4. src/core/task.c (+7 -7)
  5. src/sched_policies/heft.c (+3 -3)

+ 3 - 4
src/core/jobs.c

@@ -192,7 +192,7 @@ void _starpu_handle_job_termination(starpu_job_t j, unsigned job_is_already_lock
 
 	/* control task should not execute post_exec_hook */
 	if(task->cl != NULL && !task->control_task)
-		_starpu_sched_post_exec_hook(task);
+	  _starpu_sched_post_exec_hook(task);
 
 	STARPU_TRACE_TASK_DONE(j);
 
@@ -238,11 +238,10 @@ void _starpu_handle_job_termination(starpu_job_t j, unsigned job_is_already_lock
 		_starpu_decrement_nsubmitted_tasks();
 	}
 
+	_starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx);
+
 	if(workerid >= 0)
-	{
 		_starpu_decrement_nsubmitted_tasks_of_worker(workerid);
-		_starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx);
-	}			
 }
 
 /* This function is called when a new task is submitted to StarPU 

+ 23 - 57
src/core/perfmodel/perfmodel_bus.c

@@ -386,7 +386,7 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
 
 		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
 
-		_STARPU_DISP("BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
+		_STARPU_DISP("BANDWIDTH GPU %d CPU %u - htod %lf - dtoh %lf - %lf\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
 	}
 
 	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
@@ -429,15 +429,12 @@ static void benchmark_all_gpu_devices(void)
 #endif
 
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	ncpus = config->topology.ncpus;
-
-	/* TODO: measure bandwidth between GPU-GPU */
+	ncpus = _starpu_topology_get_nhwcpu(config);
 
 #ifdef STARPU_USE_CUDA
-	ncuda = _starpu_get_cuda_device_count();
+        cudaGetDeviceCount(&ncuda);
 	for (i = 0; i < ncuda; i++)
 	{
-		fprintf(stderr," CUDA %d...", i);
 		/* measure bandwidth between Host and Device i */
 		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, 'C');
 	}
@@ -446,7 +443,6 @@ static void benchmark_all_gpu_devices(void)
         nopencl = _starpu_opencl_get_device_count();
 	for (i = 0; i < nopencl; i++)
 	{
-		fprintf(stderr," OpenCL %d...", i);
 		/* measure bandwith between Host and Device i */
 		measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, 'O');
 	}
@@ -481,7 +477,7 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
 	char hostname[32];
 	char *forced_hostname = getenv("STARPU_HOSTNAME");
 	if (forced_hostname && forced_hostname[0])
-		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
+		snprintf(hostname, sizeof(hostname), forced_hostname);
 	else
 		gethostname(hostname, sizeof(hostname));
 	strncat(path, ".", maxlen);
@@ -509,11 +505,11 @@ static void load_bus_affinity_file_content(void)
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	ncpus = config->topology.ncpus;
+	ncpus = _starpu_topology_get_nhwcpu(config);
         int gpu;
 
 #ifdef STARPU_USE_CUDA
-	ncuda = _starpu_get_cuda_device_count();
+        cudaGetDeviceCount(&ncuda);
 	for (gpu = 0; gpu < ncuda; gpu++)
 	{
 		int ret;
@@ -764,7 +760,7 @@ static void write_bus_latency_file_content(void)
                                 latency = ((src && dst)?2000.0:500.0);
 			}
 
-			fprintf(f, "%f\t", latency);
+			fprintf(f, "%lf\t", latency);
 		}
 
 		fprintf(f, "\n");
@@ -892,21 +888,23 @@ static void write_bus_bandwidth_file_content(void)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 			else if (src != dst)
 			{
-				double slowness_src_to_ram=0.0, slowness_ram_to_dst=0.0;
-				/* Total bandwidth is the harmonic mean of bandwidths */
+                                double time_src_to_ram=0.0, time_ram_to_dst=0.0;
+                                double timing;
+                                /* Bandwidth = (SIZE)/(time i -> ram + time ram -> j)*/
 #ifdef STARPU_USE_CUDA
-				if (src && src <= ncuda)
-					slowness_src_to_ram = cudadev_timing_dtoh[src]/cuda_size;
-				if (dst && dst <= ncuda)
-					slowness_ram_to_dst = cudadev_timing_htod[dst]/cuda_size;
+				time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
+                                time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
+				timing =time_src_to_ram + time_ram_to_dst;
+				bandwidth = 1.0*cuda_size/timing;
 #endif
 #ifdef STARPU_USE_OPENCL
-				if (src > ncuda)
-					slowness_src_to_ram = opencldev_timing_dtoh[src-ncuda]/opencl_size;
-				if (dst > ncuda)
-					slowness_ram_to_dst = opencldev_timing_htod[dst-ncuda]/opencl_size;
+                                if (src > ncuda)
+                                        time_src_to_ram = (src==0)?0.0:opencldev_timing_dtoh[src-ncuda];
+                                if (dst > ncuda)
+                                        time_ram_to_dst = (dst==0)?0.0:opencldev_timing_htod[dst-ncuda];
+				timing =time_src_to_ram + time_ram_to_dst;
+				bandwidth = 1.0*opencl_size/timing;
 #endif
-				bandwidth = 1.0/(slowness_src_to_ram + slowness_ram_to_dst);
 			}
 #endif
 			else {
@@ -914,7 +912,7 @@ static void write_bus_bandwidth_file_content(void)
 			        bandwidth = 0.0;
 			}
 
-			fprintf(f, "%f\t", bandwidth);
+			fprintf(f, "%lf\t", bandwidth);
 		}
 
 		fprintf(f, "\n");
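The bandwidth hunk above drops the per-byte slowness sum (1.0/(slowness_src_to_ram + slowness_ram_to_dst)) in favour of raw transfer times: a src-to-dst copy is staged through RAM, so the estimate becomes SIZE / (time src -> ram + time ram -> dst), exactly as the new comment states. A worked example with made-up numbers (the time unit is whatever the benchmark loop recorded; microseconds assumed here):

	#include <stdio.h>

	int main(void)
	{
		double size = 32.0 * 1024 * 1024;  /* benchmarked buffer, bytes (assumed) */
		double time_src_to_ram = 4000.0;   /* dtoh timing of the src device */
		double time_ram_to_dst = 4500.0;   /* htod timing of the dst device */

		/* Bandwidth = SIZE / (time src -> ram + time ram -> dst) */
		double bandwidth = 1.0 * size / (time_src_to_ram + time_ram_to_dst);
		printf("estimated bandwidth: %lf bytes per time unit\n", bandwidth);
		return 0;
	}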
@@ -923,38 +921,6 @@ static void write_bus_bandwidth_file_content(void)
 	fclose(f);
 }
 
-void starpu_print_bus_bandwidth(FILE *f)
-{
-	int src, dst, maxnode;
-
-        maxnode = ncuda;
-#ifdef STARPU_USE_OPENCL
-        maxnode += nopencl;
-#endif
-
-	fprintf(f, "from\t");
-	fprintf(f, "to RAM\t\t");
-	for (dst = 0; dst < ncuda; dst++)
-		fprintf(f, "to CUDA %d\t", dst);
-	for (dst = 0; dst < nopencl; dst++)
-		fprintf(f, "to OpenCL %d\t", dst);
-	fprintf(f, "\n");
-
-	for (src = 0; src <= maxnode; src++)
-	{
-		if (!src)
-			fprintf(f, "RAM\t");
-		else if (src <= ncuda)
-			fprintf(f, "CUDA %d\t", src-1);
-		else
-			fprintf(f, "OpenCL%d\t", src-ncuda-1);
-		for (dst = 0; dst <= maxnode; dst++)
-			fprintf(f, "%f\t", bandwidth_matrix[src][dst]);
-
-		fprintf(f, "\n");
-	}
-}
-
 static void generate_bus_bandwidth_file(void)
 {
 	if (!was_benchmarked)
@@ -1020,9 +986,9 @@ static void check_bus_config_file()
                 fclose(f);
 
                 // Loading current configuration
-                ncpus = config->topology.ncpus;
+                ncpus = _starpu_topology_get_nhwcpu(config);
 #ifdef STARPU_USE_CUDA
-		ncuda = _starpu_get_cuda_device_count();
+                cudaGetDeviceCount(&ncuda);
 #endif
 #ifdef STARPU_USE_OPENCL
                 nopencl = _starpu_opencl_get_device_count();

+ 0 - 2
src/core/sched_policy.c

@@ -293,9 +293,7 @@ int _starpu_push_task(starpu_job_t j, unsigned job_is_already_locked)
 	unsigned no_workers = 0;
 	unsigned nworkers; 
        
-//	PTHREAD_MUTEX_LOCK(&sched_ctx->changing_ctx_mutex);
 	nworkers = sched_ctx->workers->nworkers;
-//	PTHREAD_MUTEX_UNLOCK(&sched_ctx->changing_ctx_mutex);
 
 	if(nworkers == 0)
 	{

+ 7 - 7
src/core/task.c

@@ -353,17 +353,17 @@ int starpu_task_wait_for_all(void)
 	unsigned sched_ctx = nsched_ctxs == 1 ? 0 : starpu_get_sched_ctx();
 	starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
 
-	/* if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls())) */
-	/* 	return -EDEADLK; */
+/* 	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls())) */
+/* 		return -EDEADLK; */
 
-	/* PTHREAD_MUTEX_LOCK(&submitted_mutex); */
+/* 	PTHREAD_MUTEX_LOCK(&submitted_mutex); */
 
-	/* STARPU_TRACE_TASK_WAIT_FOR_ALL; */
+/* 	STARPU_TRACE_TASK_WAIT_FOR_ALL; */
 
-	/* while (nsubmitted > 0) */
-	/* 	PTHREAD_COND_WAIT(&submitted_cond, &submitted_mutex); */
+/* 	while (nsubmitted > 0) */
+/* 		PTHREAD_COND_WAIT(&submitted_cond, &submitted_mutex); */
 	
-	/* PTHREAD_MUTEX_UNLOCK(&submitted_mutex); */
+/* 	PTHREAD_MUTEX_UNLOCK(&submitted_mutex); */
 	return 0;
 }
 

+ 3 - 3
src/sched_policies/heft.c

@@ -203,7 +203,7 @@ static void heft_push_task_notify(struct starpu_task *task, int workerid)
 }
 
 static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double predicted, int prio, unsigned sched_ctx_id)
-{
+ {
 	/* make sure someone coule execute that task ! */
 	STARPU_ASSERT(best_workerid != -1);
 
@@ -303,6 +303,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 				local_data_penalty[worker_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
 				local_power[worker_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
 				//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_ctx],local_power[worker_ctx],worker,nimpl);
+				//				printf("%d/%d: task length (%lf) exp_end (%lf) local_data_penalty (%lf)\n", worker, worker_ctx, local_task_length[worker_ctx], (exp_start[worker] + exp_len[worker] + local_task_length[worker_ctx]), local_data_penalty[worker_ctx]);
 			}
 			
 			double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
@@ -390,7 +391,6 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 					    &max_exp_end, &best_exp_end,
 					    local_data_penalty,
 					    local_power, &forced_best, bundle, sched_ctx_id);
-	
 	/* If there is no prediction available for that task with that arch we
 	 * want to speed-up calibration time so we force this measurement */
 	if (forced_best != -1){
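The comment in the hunk above explains the forced-calibration path this commit touches: when no timing sample exists for a task/architecture pair, any prediction would be a guess, so the task is pinned to the unmeasured worker once just to collect a sample. A minimal sketch of that decision, with hypothetical names (heft's real push path also handles bundles and priorities):

	/* Returns the worker to push to: forced_best wins when calibration
	 * data is missing, otherwise the smallest predicted completion time. */
	static int pick_worker(const double predicted[], int nworkers, int forced_best)
	{
		if (forced_best != -1)
			return forced_best; /* force a measurement on the unknown arch */

		int best = 0;
		for (int w = 1; w < nworkers; w++)
			if (predicted[w] < predicted[best])
				best = w;
		return best;
	}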
@@ -409,9 +409,9 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	while(workers->has_next(workers))
 	{
 		worker = workers->get_next(workers);
-
 		if (!starpu_worker_may_execute_task(worker, task, 0))
 		{
+		        worker_ctx++;
 			/* no one on that queue may execute this task */
 			continue;
 		}
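The last hunk adds worker_ctx++ to the skip path: the side arrays indexed by worker_ctx (local_task_length[worker_ctx], local_data_penalty[worker_ctx], local_power[worker_ctx] in the earlier hunk) must stay aligned with the iteration even when a worker cannot execute the task. A standalone sketch of the off-by-one this prevents, assuming the loop also advances the index at the end of each normal iteration (not visible in the hunk context):

	#include <stdio.h>

	#define NWORKERS 4

	int main(void)
	{
		int can_execute[NWORKERS] = {1, 0, 1, 1}; /* worker 1 must be skipped */
		double length[NWORKERS] = {0};
		int idx = 0;

		for (int w = 0; w < NWORKERS; w++)
		{
			if (!can_execute[w])
			{
				idx++;       /* the added line: keep slot w aligned */
				continue;    /* without it, worker 2 would write slot 1 */
			}
			length[idx] = 10.0 * w; /* stands in for local_task_length[] */
			idx++;
		}

		printf("slot 2 (worker 2): %lf\n", length[2]); /* prints 20.000000 */
		return 0;
	}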