浏览代码

merge trunk

Nathalie Furmento 8 年之前
父节点
当前提交
c415b96e18

+ 3 - 0
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -13,6 +13,9 @@ TODO: improve!
 To achieve good
 To achieve good
 performance, we give below a list of features which should be checked.
 performance, we give below a list of features which should be checked.
 
 
+For a start, you can use \ref OfflinePerformanceTools to get a Gantt chart which
+will show roughly where time is spent, and focus your optimization efforts accordingly.
+
 \section ConfigurationImprovePerformance Configuration That May Improve Performance
 \section ConfigurationImprovePerformance Configuration That May Improve Performance
 
 
 The \ref enable-fast "--enable-fast" configuration option disables all
 The \ref enable-fast "--enable-fast" configuration option disables all

+ 3 - 0
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -123,6 +123,9 @@ $ vite paje.trace
 To get names of tasks instead of "unknown", fill the optional
 To get names of tasks instead of "unknown", fill the optional
 starpu_codelet::name, or use a performance model for them.
 starpu_codelet::name, or use a performance model for them.
 
 
+One can also introduce user-defined events in the diagram thanks to the
+starpu_fxt_trace_user_event_string() function.
+
 In the MPI execution case, \ref STARPU_GENERATE_TRACE will not work as expected
 In the MPI execution case, \ref STARPU_GENERATE_TRACE will not work as expected
 (each node will try to generate paje.trace, thus mixing outputs...), you have to
 (each node will try to generate paje.trace, thus mixing outputs...), you have to
 collect the trace files from the MPI nodes, and
 collect the trace files from the MPI nodes, and

+ 3 - 1
doc/doxygen/chapters/410_mpi_support.doxy

@@ -190,7 +190,9 @@ int main(int argc, char **argv)
 
 
 We have here replaced <c>MPI_Recv()</c> and <c>MPI_Send()</c> with starpu_mpi_irecv_detached()
 We have here replaced <c>MPI_Recv()</c> and <c>MPI_Send()</c> with starpu_mpi_irecv_detached()
 and starpu_mpi_isend_detached(), which just submit the communication to be
 and starpu_mpi_isend_detached(), which just submit the communication to be
-performed. The only remaining synchronization with starpu_data_acquire() is at
+performed. The implicit sequential consistency dependencies provide
+synchronization between MPI receptions and emissions and the corresponding tasks.
+The only remaining synchronization with starpu_data_acquire() is at
 the beginning and the end.
 the beginning and the end.
 
 
 \section MPIInitialization How to Initialize StarPU-MPI
 \section MPIInitialization How to Initialize StarPU-MPI

+ 1 - 1
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -86,7 +86,7 @@ void parse_args(int argc, char **argv, int nodes)
 
 
                 if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
                 if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
                 {
                 {
-                        printf("usage : %s [-display] [-size size] [-nblocks nblocks]\n", argv[0]);
+			printf("usage : %s [-size size] [-nblocks nblocks] [-no-prio] [-display]\n", argv[0]);
                 }
                 }
         }
         }
 
 

+ 1 - 1
mpi/examples/mpi_lu/pxlu.c

@@ -555,8 +555,8 @@ static void create_task_21_recv(unsigned k, unsigned i)
 				tag_array[ndeps++] = TAG22(k-1, i, j);
 				tag_array[ndeps++] = TAG22(k-1, i, j);
 #else
 #else
 				tag_array[ndeps++] = TAG22(k-2, i, j);
 				tag_array[ndeps++] = TAG22(k-2, i, j);
-		}
 #endif
 #endif
+		}
 	}
 	}
 
 
 	int source = get_block_rank(i, k);
 	int source = get_block_rank(i, k);

+ 1 - 1
src/core/disk.c

@@ -243,7 +243,7 @@ int _starpu_disk_copy(unsigned node_src, void *obj_src, off_t offset_src, unsign
 	/* Something goes wrong with copy disk to disk... */
 	/* Something goes wrong with copy disk to disk... */
 	if (!event)
 	if (!event)
 	{
 	{
-		if (channel || (!channel && starpu_asynchronous_copy_disabled()))
+		if (channel || starpu_asynchronous_copy_disabled())
 			disk_register_list[node_src]->functions->copy = NULL;
 			disk_register_list[node_src]->functions->copy = NULL;
 
 
 		/* perform a read, and after a write... */
 		/* perform a read, and after a write... */

+ 0 - 1
src/core/disk_ops/unistd/disk_unistd_global.c

@@ -949,7 +949,6 @@ int starpu_unistd_global_test_request(void *async_channel)
 		case STARPU_UNISTD_COPY :
 		case STARPU_UNISTD_COPY :
 		{
 		{
 			return starpu_sem_trywait(&event->event.event_copy->finished) == 0;
 			return starpu_sem_trywait(&event->event.event_copy->finished) == 0;
-			break;
 		}
 		}
 #endif
 #endif
 
 

+ 6 - 1
src/core/perfmodel/perfmodel_bus.c

@@ -626,7 +626,12 @@ static void measure_bandwidth_between_numa_nodes_and_dev(int dev, struct dev_tim
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 		hwloc_obj_t obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NODE, numa_id);
 		hwloc_obj_t obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NODE, numa_id);
 
 
-		cpu_id = find_cpu_from_numa_node(obj);
+		if (obj)
+			cpu_id = find_cpu_from_numa_node(obj);
+		else
+                        /* No such NUMA node, probably hwloc 1.x with no NUMA
+                         * node, just take one CPU from the whole system */
+			cpu_id = find_cpu_from_numa_node(hwloc_get_root_obj(hwtopology));
 #endif
 #endif
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA

+ 7 - 7
src/core/topology.c

@@ -1314,13 +1314,13 @@ _starpu_topology_count_ngpus(hwloc_obj_t obj)
 		n += _starpu_topology_count_ngpus(obj->children[i]);
 		n += _starpu_topology_count_ngpus(obj->children[i]);
 
 
 	data->ngpus = n;
 	data->ngpus = n;
-#ifdef STARPU_VERBOSE
-	{
-		char name[64];
-		hwloc_obj_type_snprintf(name, sizeof(name), obj, 0);
-		_STARPU_DEBUG("hwloc obj %s has %u GPUs below\n", name, n);
-	}
-#endif
+//#ifdef STARPU_VERBOSE
+//	{
+//		char name[64];
+//		hwloc_obj_type_snprintf(name, sizeof(name), obj, 0);
+//		_STARPU_DEBUG("hwloc obj %s has %u GPUs below\n", name, n);
+//	}
+//#endif
 	return n;
 	return n;
 }
 }
 #endif
 #endif

+ 1 - 1
src/datawizard/copy_driver.c

@@ -997,7 +997,7 @@ unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *as
 		break;
 		break;
 	case STARPU_CPU_RAM:
 	case STARPU_CPU_RAM:
 	default:
 	default:
-		STARPU_ABORT_MSG("Memory is not recognized (kind %u) \n", kind);
+		STARPU_ABORT_MSG("Memory is not recognized (kind %d) \n", kind);
 	}
 	}
 
 
 	return success;
 	return success;

+ 4 - 4
src/debug/traces/starpu_fxt.c

@@ -741,7 +741,7 @@ static void worker_set_state(double time, const char *prefix, long unsigned int
 	worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
 	worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
 	poti_SetState(time, container, "WS", name);
 	poti_SetState(time, container, "WS", name);
 #else
 #else
-	fprintf(out_paje_file, "10	%.9f	%sw%lu	WS	%s\n", time, prefix, workerid, name);
+	fprintf(out_paje_file, "10	%.9f	%sw%lu	WS	\"%s\"\n", time, prefix, workerid, name);
 #endif
 #endif
 }
 }
 
 
@@ -1474,7 +1474,7 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 			worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[2]);
 			worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[2]);
 			poti_SetState(start_codelet_time, container, ctx, name);
 			poti_SetState(start_codelet_time, container, ctx, name);
 #else
 #else
-			fprintf(out_paje_file, "10	%.9f	%sw%"PRIu64"	Ctx%d	%s\n", start_codelet_time, prefix, ev->param[2], sched_ctx, name);
+			fprintf(out_paje_file, "10	%.9f	%sw%"PRIu64"	Ctx%d	\"%s\"\n", start_codelet_time, prefix, ev->param[2], sched_ctx, name);
 #endif
 #endif
 		}
 		}
 	}
 	}
@@ -1636,7 +1636,7 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 			poti_SetState(last_codelet_start[worker], container, typectx, name);
 			poti_SetState(last_codelet_start[worker], container, typectx, name);
 #endif
 #endif
 #else
 #else
-			fprintf(out_paje_file, "21	%.9f	%sw%d	Ctx%u	%s	%ld	%s	%08lx	%016lx	%s%lu	%s%lu\n", last_codelet_start[worker], prefix, worker, sched_ctx, _starpu_last_codelet_symbol[worker], ev->param[1], parameters,  ev->param[2], ev->param[4], prefix, job_id, prefix, task->submit_order);
+			fprintf(out_paje_file, "21	%.9f	%sw%d	Ctx%u	\"%s\"	%ld	%s	%08lx	%016lx	%s%lu	%s%lu\n", last_codelet_start[worker], prefix, worker, sched_ctx, _starpu_last_codelet_symbol[worker], ev->param[1], parameters,  ev->param[2], ev->param[4], prefix, job_id, prefix, task->submit_order);
 #endif
 #endif
 		}
 		}
 	}
 	}
@@ -2995,7 +2995,7 @@ static void handle_string_event(struct fxt_ev_64 *ev, const char *event, struct
 		snprintf(container, sizeof(container), "%sp", options->file_prefix);
 		snprintf(container, sizeof(container), "%sp", options->file_prefix);
 		poti_NewEvent(get_event_time_stamp(ev, options), container, "prog_event", event);
 		poti_NewEvent(get_event_time_stamp(ev, options), container, "prog_event", event);
 #else
 #else
-		fprintf(out_paje_file, "9	%.9f	prog_event	%sp	%s\n", get_event_time_stamp(ev, options), options->file_prefix, event);
+		fprintf(out_paje_file, "9	%.9f	prog_event	%sp	\"%s\"\n", get_event_time_stamp(ev, options), options->file_prefix, event);
 #endif
 #endif
 	}
 	}