瀏覽代碼

move documentation from independent doxygen files into public .h files

Nathalie Furmento 6 年之前
父節點
當前提交
6862bbddbf

+ 0 - 3
doc/doxygen/Makefile.am

@@ -116,8 +116,6 @@ chapters =	\
 	chapters/api/mic_extensions.doxy \
 	chapters/api/mic_extensions.doxy \
 	chapters/api/scc_extensions.doxy \
 	chapters/api/scc_extensions.doxy \
 	chapters/api/parallel_tasks.doxy \
 	chapters/api/parallel_tasks.doxy \
-	chapters/api/performance_model.doxy \
-	chapters/api/profiling.doxy \
 	chapters/api/scheduling_contexts.doxy \
 	chapters/api/scheduling_contexts.doxy \
 	chapters/api/scheduling_policy.doxy \
 	chapters/api/scheduling_policy.doxy \
 	chapters/api/standard_memory_library.doxy \
 	chapters/api/standard_memory_library.doxy \
@@ -131,7 +129,6 @@ chapters =	\
 	chapters/api/toolbox.doxy \
 	chapters/api/toolbox.doxy \
 	chapters/api/sc_hypervisor/sc_hypervisor.doxy \
 	chapters/api/sc_hypervisor/sc_hypervisor.doxy \
 	chapters/api/sc_hypervisor/sc_hypervisor_usage.doxy \
 	chapters/api/sc_hypervisor/sc_hypervisor_usage.doxy \
-	chapters/api/modularized_scheduler.doxy \
 	chapters/api/interoperability.doxy
 	chapters/api/interoperability.doxy
 
 
 images = 	\
 images = 	\

+ 0 - 560
doc/doxygen/chapters/api/modularized_scheduler.doxy

@@ -1,560 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2013,2014                                Inria
- * Copyright (C) 2013-2018                                CNRS
- * Copyright (C) 2009-2011,2014,2015,2017,2018-2019       Université de Bordeaux
- * Copyright (C) 2013                                     Simon Archipoff
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \defgroup API_Modularized_Scheduler Modularized Scheduler Interface
-
-\enum starpu_sched_component_properties
-\ingroup API_Modularized_Scheduler
-flags for starpu_sched_component::properties
-\var starpu_sched_component_properties::STARPU_SCHED_COMPONENT_HOMOGENEOUS
-     indicate that all workers have the same starpu_worker_archtype
-\var starpu_sched_component_properties::STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE
-     indicate that all workers have the same memory component
-
-\def STARPU_SCHED_COMPONENT_IS_HOMOGENEOUS
-\ingroup API_Modularized_Scheduler
-indicate if component is homogeneous
-\def STARPU_SCHED_COMPONENT_IS_SINGLE_MEMORY_NODE
-\ingroup API_Modularized_Scheduler
-indicate if all workers have the same memory component
-
-\struct starpu_sched_component
-\ingroup API_Modularized_Scheduler
-This structure represents a scheduler module.  A scheduler is a
-tree-like structure of them. Some parts of a scheduler can be shared by
-several contexts to perform some local optimisations, so, for all
-components, a list of parents is defined by \c sched_ctx_id. They
-embed their specialised methods in a pseudo object-style, so calls are
-like <c>component->push_task(component,task)</c>
-
-\var struct starpu_sched_tree *starpu_sched_component::tree
-     The tree containing the component
-\var struct starpu_bitmap *starpu_sched_component::workers
-     this member contains the set of underlying workers
-\var starpu_sched_component::workers_in_ctx
-     this member contains the subset of starpu_sched_component::workers that is currently available in the context.
-     The push method should take this member into account.
-     this member is set with :
-     component->workers UNION tree->workers UNION
-     component->child[i]->workers_in_ctx iff exist x such as component->children[i]->parents[x] == component
-\var void *starpu_sched_component::data
-     private data
-\var int starpu_sched_component::nchildren
-     the number of component's children
-\var struct starpu_sched_component **starpu_sched_component::children
-     the vector of component's children
-\var int starpu_sched_component::nparents
-     the number of component's parents
-\var struct starpu_sched_component **starpu_sched_component::parents
-     the vector of component's parents
-
-\var void(*starpu_sched_component::add_child)(struct starpu_sched_component *component, struct starpu_sched_component *child)
-     add a child to component
-\var void(*starpu_sched_component::remove_child)(struct starpu_sched_component *component, struct starpu_sched_component *child)
-     remove a child from component
-\var void(*starpu_sched_component::add_parent)(struct starpu_sched_component *component, struct starpu_sched_component *parent)
-     todo
-\var void(*starpu_sched_component::remove_parent)(struct starpu_sched_component *component, struct starpu_sched_component *parent)
-     todo
-
-\var int (*starpu_sched_component::push_task)(struct starpu_sched_component *, struct starpu_task *)
-     push a task in the scheduler module. this function is called to
-     push a task on component subtree, this can either perform a
-     recursive call on a child or store the task in the component,
-     then it will be returned by a further pull_task call.
-     the caller must ensure that component is able to execute task.
-     This method must either return 0 if the task was properly stored or
-     passed over to a child component, or return a value different from 0 if the
-     task could not be consumed (e.g. the queue is full).
-\var struct starpu_task * (*starpu_sched_component::pull_task)(struct starpu_sched_component *component, struct starpu_sched_component *to)
-     pop a task from the scheduler module. this function is called by workers to get a task from their
-     parents. this function should first return a locally stored task
-     or perform a recursive call on the parents.
-     the task returned by this function should be executable by the caller
-
-\var int (*starpu_sched_component::can_push)(struct starpu_sched_component *component, struct starpu_sched_component *to)
-     This function is called by a component which implements a queue,
-     allowing it to signify to its parents that an empty slot is
-     available in its queue. This should return 1 if some tasks could be pushed
-     The basic implementation of this function
-     is a recursive call to its parents, the user has to specify a
-     personally-made function to catch those calls.
-\var int (*starpu_sched_component::can_pull)(struct starpu_sched_component *component)
-     This function allows a component to wake up a worker. It is
-     currently called by a component which implements a queue, to
-     signify to its children that a task has been pushed in its local
-     queue, and is available to be popped by a worker, for example.
-     This should return 1 if some container or worker could (or will) pull
-     some tasks.
-     The basic implementation of this function is a recursive call to
-     its children, until at least one worker has been woken up.
-
-\var double (*starpu_sched_component::estimated_load)(struct starpu_sched_component *component)
-	is a heuristic to compute the load of a scheduler module. Basically the number of tasks divided by the sum
-	of the relative speedups of the workers available in the context.
-	estimated_load(component) = sum(estimated_load(component_children)) + nb_local_tasks / average(relative_speedup(underlying_worker))
-\var starpu_sched_component::estimated_end
-	return the time when a worker will enter in starvation. This function is relevant only if the task->predicted
-	member has been set.
-
-\var void (*starpu_sched_component::deinit_data)(struct starpu_sched_component *component)
-	called by starpu_sched_component_destroy. Should free data allocated during creation
-\var void (*starpu_sched_component::notify_change_workers)(struct starpu_sched_component *component)
-	this function is called for each component when workers are added or removed from a context
-\var int starpu_sched_component::properties
-	todo
-\var hwloc_obj_t starpu_sched_component::obj
-	the hwloc object associated to the scheduler module. Points to the
-	part of the topology that is bound to this component, e.g. a numa
-	node for a ws component that would balance load between
-	underlying sockets
-
-\struct starpu_sched_tree
-\ingroup API_Modularized_Scheduler
-The actual scheduler
-\var struct starpu_sched_component *starpu_sched_tree::root
-	this is the entry module of the scheduler
-\var struct starpu_bitmap *starpu_sched_tree::workers
-	this is the set of workers available in this context, this value is used to mask workers in modules
-\var unsigned starpu_sched_tree::sched_ctx_id
-	the context id of the scheduler
-\var starpu_pthread_mutex_t starpu_sched_tree::lock
-	this lock is used to protect the scheduler, it is taken in
-	read mode pushing a task and in write mode for adding or
-	removing workers
-
-@name Scheduling Tree API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_tree *starpu_sched_tree_create(unsigned sched_ctx_id)
-\ingroup API_Modularized_Scheduler
-	 create an empty initialized starpu_sched_tree
-
-\fn void starpu_sched_tree_destroy(struct starpu_sched_tree *tree)
-\ingroup API_Modularized_Scheduler
-	 destroy tree and free all non shared component in it.
-
-\fn void starpu_sched_tree_update_workers(struct starpu_sched_tree *t)
-\ingroup API_Modularized_Scheduler
-	 recursively set all starpu_sched_component::workers, do not take into account shared parts (except workers).
-
-\fn void starpu_sched_tree_update_workers_in_ctx(struct starpu_sched_tree *t)
-\ingroup API_Modularized_Scheduler
-	 recursively set all starpu_sched_component::workers_in_ctx, do not take into account shared parts (except workers)
-
-\fn int starpu_sched_tree_push_task(struct starpu_task *task)
-\ingroup API_Modularized_Scheduler
-	 compatibility with starpu_sched_policy interface
-
-\fn struct starpu_task *starpu_sched_tree_pop_task(unsigned sched_ctx)
-\ingroup API_Modularized_Scheduler
-	 compatibility with starpu_sched_policy interface
-
-\fn void starpu_sched_tree_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
-\ingroup API_Modularized_Scheduler
-	 compatibility with starpu_sched_policy interface
-
-\fn void starpu_sched_tree_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
-\ingroup API_Modularized_Scheduler
-	 compatibility with starpu_sched_policy interface
-
-\fn void starpu_sched_component_connect(struct starpu_sched_component *parent, struct starpu_sched_component *child)
-\ingroup API_Modularized_Scheduler
-	 Attaches component \p child to parent \p parent. Some component may accept only one child, others accept several (e.g. MCT)
-
-@name Generic Scheduling Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_create(struct starpu_sched_tree *tree, const char *name)
-\ingroup API_Modularized_Scheduler
-	 allocate and initialize component field with defaults values :
-	.pop_task make recursive call on father
-	.estimated_load compute relative speedup and tasks in sub tree
-	.estimated_end return the minimum of recursive call on children
-	.add_child is starpu_sched_component_add_child
-	.remove_child is starpu_sched_component_remove_child
-	.notify_change_workers does nothing
-	.deinit_data does nothing
-
-\fn void starpu_sched_component_destroy(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 free data allocated by starpu_sched_component_create and call component->deinit_data(component)
-	 set to <c>NULL</c> the member starpu_sched_component::fathers[sched_ctx_id] of all child if its equal to \p component
-
-\fn void starpu_sched_component_destroy_rec(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 recursively destroy non shared parts of a \p component 's tree
-
-\fn int starpu_sched_component_can_execute_task(struct starpu_sched_component *component, struct starpu_task *task)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component can execute \p task, this function take into account the workers available in the scheduling context
-
-\fn int starpu_sched_component_execute_preds(struct starpu_sched_component *component, struct starpu_task *task, double *length)
-\ingroup API_Modularized_Scheduler
-	 return a non <c>NULL</c> value if \p component can execute \p task.
-	 write the execution prediction length for the best implementation of the best worker available and write this at \p length address.
-	 this result is more relevant if starpu_sched_component::is_homogeneous is non <c>NULL</c>.
-	 if a worker need to be calibrated for an implementation, nan is set to \p length.
-
-\fn double starpu_sched_component_transfer_length(struct starpu_sched_component *component, struct starpu_task *task)
-\ingroup API_Modularized_Scheduler
-	 return the average time to transfer \p task data to underlying \p component workers.
-
-@name Worker Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_worker_get(unsigned sched_ctx, int workerid)
-\ingroup API_Modularized_Scheduler
-	 return the struct starpu_sched_component corresponding to \p workerid. Undefined if \p workerid is not a valid workerid
-
-\fn struct starpu_sched_component *starpu_sched_component_parallel_worker_create(struct starpu_sched_tree *tree, unsigned nworkers, unsigned *workers)
-\ingroup API_Modularized_Scheduler
-	 Create a combined worker that pushes tasks in parallel to workers \p workers (size \p nworkers).
-
-\fn int starpu_sched_component_worker_get_workerid(struct starpu_sched_component *worker_component)
-\ingroup API_Modularized_Scheduler
-	 return the workerid of \p worker_component, undefined if starpu_sched_component_is_worker(worker_component) == 0
-
-\fn int starpu_sched_component_is_worker(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component is a worker component
-
-\fn int starpu_sched_component_is_simple_worker(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component is a simple worker component
-
-\fn int starpu_sched_component_is_combined_worker(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component is a combined worker component
-
-\fn void starpu_sched_component_worker_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
-\ingroup API_Modularized_Scheduler
-	 compatibility with starpu_sched_policy interface
-	 update predictions for workers
-
-\fn void starpu_sched_component_worker_post_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
-\ingroup API_Modularized_Scheduler
-	 compatibility with starpu_sched_policy interface
-
-@name Flow-control Fifo Component API
-\ingroup API_Modularized_Scheduler
-
-\fn int starpu_sched_component_can_push(struct starpu_sched_component * component, struct starpu_sched_component * to)
-\ingroup API_Modularized_Scheduler
-default function for the can_push component method, just calls can_push of parents until one of them returns non-zero
-
-\fn int starpu_sched_component_can_pull(struct starpu_sched_component * component)
-\ingroup API_Modularized_Scheduler
-default function for the can_pull component method, just calls can_pull of children until one of them returns non-zero
-
-\fn int starpu_sched_component_can_pull_all(struct starpu_sched_component * component)
-\ingroup API_Modularized_Scheduler
-function for the can_pull component method, calls can_pull of all children
-
-\fn double starpu_sched_component_estimated_load(struct starpu_sched_component * component);
-\ingroup API_Modularized_Scheduler
-default function for the estimated_load component method, just sums up the loads
-of the children of the component.
-
-\fn double starpu_sched_component_estimated_end_min(struct starpu_sched_component * component);
-\ingroup API_Modularized_Scheduler
-function that can be used for the estimated_end component method, which just computes the minimum completion time of the children.
-
-\fn double starpu_sched_component_estimated_end_min_add(struct starpu_sched_component * component, double exp_len);
-\ingroup API_Modularized_Scheduler
-function that can be used for the estimated_end component method, which computes
-the minimum completion time of the children, and adds to it an estimation of how
-existing queued work, plus the exp_len work, can be completed. This is typically
-used instead of starpu_sched_component_estimated_end_min when the component
-contains a queue of tasks, which thus needs to be added to the estimations.
-
-\fn double starpu_sched_component_estimated_end_average(struct starpu_sched_component * component);
-\ingroup API_Modularized_Scheduler
-default function for the estimated_end component method, which just computes the average completion time of the children.
-
-
-\struct starpu_sched_component_fifo_data
-\ingroup API_Modularized_Scheduler
-\var unsigned starpu_sched_component_fifo_data::ntasks_threshold
-todo
-\var double starpu_sched_component_fifo_data::exp_len_threshold
-todo
-
-\fn struct starpu_sched_component *starpu_sched_component_fifo_create(struct starpu_sched_tree *tree, struct starpu_sched_component_fifo_data *fifo_data)
-\ingroup API_Modularized_Scheduler
-	 Return a struct starpu_sched_component with a fifo. A stable sort is performed according to tasks priorities.
-	 A push_task call on this component does not perform recursive calls, underlying components will have to call pop_task to get it.
-	 starpu_sched_component::estimated_end function compute the estimated length by dividing the sequential length by the number of underlying workers.
-
-\fn int starpu_sched_component_is_fifo(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component is a fifo component
-
-@name Flow-control Prio Component API
-\ingroup API_Modularized_Scheduler
-
-\struct starpu_sched_component_prio_data
-\ingroup API_Modularized_Scheduler
-\var unsigned starpu_sched_component_prio_data::ntasks_threshold
-todo
-\var double starpu_sched_component_prio_data::exp_len_threshold
-todo
-
-\fn struct starpu_sched_component *starpu_sched_component_prio_create(struct starpu_sched_tree *tree, struct starpu_sched_component_prio_data *prio_data)
-\ingroup API_Modularized_Scheduler
-todo
-
-\fn int starpu_sched_component_is_prio(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-todo
-
-@name Resource-mapping Work-Stealing Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_work_stealing_create(struct starpu_sched_tree *tree, void *arg)
-\ingroup API_Modularized_Scheduler
-	 return a component that performs work stealing scheduling. Tasks are pushed in a round robin way. estimated_end returns the average of the expected length of fifos, starting at the average of the expected_end of its children. When a worker has to steal a task, it steals a task in a round robin way, and gets the last pushed task of the highest priority.
-
-\fn int starpu_sched_tree_work_stealing_push_task(struct starpu_task *task)
-\ingroup API_Modularized_Scheduler
-	 undefined if there is no work stealing component in the scheduler. If any, \p task is pushed in a default way if the caller is the application, and in the caller's fifo if it is a worker.
-
-\fn int starpu_sched_component_is_work_stealing(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component is a work stealing component
-
-@name Resource-mapping Random Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_random_create(struct starpu_sched_tree *tree, void *arg)
-\ingroup API_Modularized_Scheduler
-	 create a component that perform a random scheduling
-
-\fn int starpu_sched_component_is_random(struct starpu_sched_component *)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component is a random component
-
-@name Resource-mapping Eager Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_eager_create(struct starpu_sched_tree *tree, void *arg)
-\ingroup API_Modularized_Scheduler
-todo
-
-\fn int starpu_sched_component_is_eager(struct starpu_sched_component *)
-\ingroup API_Modularized_Scheduler
-todo
-
-@name Resource-mapping Eager-Calibration Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_eager_calibration_create(struct starpu_sched_tree *tree, void *arg)
-\ingroup API_Modularized_Scheduler
-todo
-
-\fn int starpu_sched_component_is_eager_calibration(struct starpu_sched_component *)
-\ingroup API_Modularized_Scheduler
-todo
-
-@name Resource-mapping MCT Component API
-\ingroup API_Modularized_Scheduler
-
-\struct starpu_sched_component_mct_data
-\ingroup API_Modularized_Scheduler
-\var double starpu_sched_component_mct_data::alpha
-todo
-\var double starpu_sched_component_mct_data::beta
-todo
-\var double starpu_sched_component_mct_data::_gamma
-todo
-\var double starpu_sched_component_mct_data::idle_power
-todo
-
-\fn struct starpu_sched_component *starpu_sched_component_mct_create(struct starpu_sched_tree *tree, struct starpu_sched_component_mct_data *mct_data)
-\ingroup API_Modularized_Scheduler
-create a component with mct_data parameters. the mct component doesn't
-do anything but pushing tasks on no_perf_model_component and
-calibrating_component
-
-\fn int starpu_sched_component_is_mct(struct starpu_sched_component *component);
-\ingroup API_Modularized_Scheduler
-todo
-
-@name Resource-mapping Heft Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_heft_create(struct starpu_sched_tree *tree, struct starpu_sched_component_mct_data *mct_data)
-\ingroup API_Modularized_Scheduler
-	 this component perform a heft scheduling
-
-\fn int starpu_sched_component_is_heft(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component is a heft component
-
-@name Special-purpose Best_Implementation Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_best_implementation_create(struct starpu_sched_tree *tree, void *arg)
-\ingroup API_Modularized_Scheduler
-	 Select the implementation that offer the shortest computation length for the first worker that can execute the task.
-	 Or an implementation that need to be calibrated.
-	 Also set starpu_task::predicted and starpu_task::predicted_transfer for memory component of the first suitable workerid.
-	 If starpu_sched_component::push method is called and starpu_sched_component::nchild > 1 the result is undefined.
-
-@name Special-purpose Perfmodel_Select Component API
-\ingroup API_Modularized_Scheduler
-
-\struct starpu_sched_component_perfmodel_select_data
-\ingroup API_Modularized_Scheduler
-\var struct starpu_sched_component *starpu_sched_component_perfmodel_select_data::calibrator_component
-todo
-\var struct starpu_sched_component *starpu_sched_component_perfmodel_select_data::no_perfmodel_component
-todo
-\var struct starpu_sched_component *starpu_sched_component_perfmodel_select_data::perfmodel_component
-todo
-
-\fn struct starpu_sched_component *starpu_sched_component_perfmodel_select_create(struct starpu_sched_tree *tree, struct starpu_sched_component_perfmodel_select_data *perfmodel_select_data)
-\ingroup API_Modularized_Scheduler
-todo
-
-\fn int starpu_sched_component_is_perfmodel_select(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-todo
-
-@name Recipe Component API
-\ingroup API_Modularized_Scheduler
-
-\struct starpu_sched_component_composed_recipe
-\ingroup API_Modularized_Scheduler
-	parameters for starpu_sched_component_composed_component_create
-
-\fn struct starpu_sched_component_composed_recipe *starpu_sched_component_composed_recipe_create(void)
-\ingroup API_Modularized_Scheduler
-	 return an empty recipe for a composed component, it should not be used without modification
-
-\fn struct starpu_sched_component_composed_recipe *starpu_sched_component_composed_recipe_create_singleton(struct starpu_sched_component *(*create_component)(struct starpu_sched_tree *tree, void *arg), void *arg)
-\ingroup API_Modularized_Scheduler
-	 return a recipe to build a composed component with a \p create_component
-
-\fn void starpu_sched_component_composed_recipe_add(struct starpu_sched_component_composed_recipe *recipe, struct starpu_sched_component *(*create_component)(struct starpu_sched_tree *tree, void *arg), void *arg)
-\ingroup API_Modularized_Scheduler
-	 add \p create_component under all previous components in recipe
-
-\fn void starpu_sched_component_composed_recipe_destroy(struct starpu_sched_component_composed_recipe *)
-\ingroup API_Modularized_Scheduler
-	 destroy composed_sched_component, this should be done after starpu_sched_component_composed_component_create was called
-
-\fn struct starpu_sched_component *starpu_sched_component_composed_component_create(struct starpu_sched_tree *tree, struct starpu_sched_component_composed_recipe *recipe)
-\ingroup API_Modularized_Scheduler
-	 create a component that behaves as if all components of the recipe were linked. Except that you can't use the starpu_sched_component_is_foo function.
-	 if the recipe contains a single create_foo arg_foo pair, create_foo(arg_foo) is returned instead of a composed component
-
-\struct starpu_sched_component_specs
-\ingroup API_Modularized_Scheduler
-	 Define how to build a scheduler according to topology. Each level (except for hwloc_machine_composed_sched_component) can be <c>NULL</c>, then
-	 the level is just skipped. Bugs everywhere, do not rely on.
-\var struct starpu_sched_component_composed_recipe *starpu_sched_specs::hwloc_machine_composed_sched_component
-     the composed component to put on the top of the scheduler
-     this member must not be <c>NULL</c> as it is the root of the topology
-\var struct starpu_sched_component_composed_recipe *starpu_sched_specs::hwloc_component_composed_sched_component
-     the composed component to put for each memory component
-\var struct starpu_sched_component_composed_recipe *starpu_sched_specs::hwloc_socket_composed_sched_component
-     the composed component to put for each socket
-\var struct starpu_sched_component_composed_recipe *starpu_sched_specs::hwloc_cache_composed_sched_component
-     the composed component to put for each cache
-\var struct starpu_sched_component_composed_recipe *(*starpu_sched_specs::worker_composed_sched_component)(enum starpu_worker_archtype archtype)
-     a function that return a starpu_sched_component_composed_recipe to put on top of a worker of type \p archtype.
-     <c>NULL</c> is a valid return value, then no component will be added on top
-\var starpu_sched_specs::mix_heterogeneous_workers
-     this flag is a dirty hack because of the poor expressivity of this interface. For example, if you want to build
-     a heft component with a fifo component per numa component, and you also have GPUs, if this flag is set, GPUs will share those fifos.
-     If this flag is not set, a new fifo will be built for each of them (if they have the same starpu_perf_arch and the same
-     numa component it will be shared). It indicates if heterogeneous workers should be brothers or cousins, for example, if a gpu and a cpu should share or not their numa node
-
-\fn struct starpu_sched_tree *starpu_sched_component_make_scheduler(unsigned sched_ctx_id, struct starpu_sched_component_specs s)
-\ingroup API_Modularized_Scheduler
-	 this function build a scheduler for \p sched_ctx_id according to \p s and the hwloc topology of the machine.
-
-\def STARPU_SCHED_SIMPLE_DECIDE_WORKERS
-\ingroup API_Modularized_Scheduler
-Request to create downstream queues per worker, i.e. the scheduling decision-making component will choose exactly which workers tasks should go to.
-
-\def STARPU_SCHED_SIMPLE_COMBINED_WORKERS
-\ingroup API_Modularized_Scheduler
-Request to not only choose between simple workers, but also choose between combined workers.
-
-\def STARPU_SCHED_SIMPLE_DECIDE_MEMNODES
-\ingroup API_Modularized_Scheduler
-Request to create downstream queues per memory nodes, i.e. the scheduling decision-making component will choose which memory node tasks will go to.
-
-\def STARPU_SCHED_SIMPLE_DECIDE_ARCHS
-\ingroup API_Modularized_Scheduler
-Request to create downstream queues per computation arch, i.e. the scheduling decision-making component will choose whether tasks go to CPUs, or CUDA, or OpenCL, etc.
-
-\def STARPU_SCHED_SIMPLE_PERFMODEL
-\ingroup API_Modularized_Scheduler
-Request to add a perfmodel selector above the scheduling decision-making component. That way, only tasks with a calibrated performance model will be given to the component, other tasks will go to an eager branch that will distribute tasks so that their performance models will get calibrated.
-
-In other words, this is needed when using a component which needs performance models for tasks.
-
-\def STARPU_SCHED_SIMPLE_FIFO_ABOVE
-\ingroup API_Modularized_Scheduler
-Request to create a fifo above the scheduling decision-making component, otherwise tasks will be pushed directly to the component.
-
-This is useful to store tasks if there is a fifo below which limits the number of tasks to be scheduled in advance. The scheduling decision-making component can also store tasks itself, in which case this flag is not useful.
-
-\def STARPU_SCHED_SIMPLE_FIFO_ABOVE_PRIO
-\ingroup API_Modularized_Scheduler
-Request that the fifo above be sorted by priorities
-
-\def STARPU_SCHED_SIMPLE_FIFOS_BELOW
-\ingroup API_Modularized_Scheduler
-Request to create fifos below the scheduling decision-making component, otherwise tasks will be pulled directly from workers.
-
-This is useful to be able to schedule a (tunable) small number of tasks in advance only.
-
-\def STARPU_SCHED_SIMPLE_FIFOS_BELOW_PRIO
-\ingroup API_Modularized_Scheduler
-Request that the fifos below be sorted by priorities
-
-\def STARPU_SCHED_SIMPLE_WS_BELOW
-\ingroup API_Modularized_Scheduler
-Request that work between workers using the same fifo below be distributed using a work stealing component.
-
-\def STARPU_SCHED_SIMPLE_IMPL
-\ingroup API_Modularized_Scheduler
-Request that a component be added just above workers, that chooses the best task implementation.
-
-\fn void starpu_sched_component_initialize_simple_scheduler(starpu_sched_component_create_t create_decision_component, void *data, unsigned flags, unsigned sched_ctx_id)
-\ingroup API_Modularized_Scheduler
-This creates a simple modular scheduler tree around a scheduling decision-making
-component \p component. The details of what should be built around \p component
-is described by \p flags. The different STARPU_SCHED_SIMPLE_DECIDE_* flags are
-mutually exclusive. \p data is passed to the \p create_decision_component
-function when creating the decision component.
-
-\fn int starpu_sched_component_push_task(struct starpu_sched_component *from, struct starpu_sched_component *to, struct starpu_task *task)
-\ingroup API_Modularized_Scheduler
-Push a task to a component. This is a helper for <c>component->push_task(component, task)</c> plus tracing.
-
-\fn struct starpu_task *starpu_sched_component_pull_task(struct starpu_sched_component *from, struct starpu_sched_component *to)
-\ingroup API_Modularized_Scheduler
-Pull a task from a component. This is a helper for <c>component->pull_task(component)</c> plus tracing.
-
-*/

+ 0 - 349
doc/doxygen/chapters/api/performance_model.doxy

@@ -1,349 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2011-2013,2016                           Inria
- * Copyright (C) 2010-2017                                CNRS
- * Copyright (C) 2009-2011,2013-2017                      Université de Bordeaux
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \defgroup API_Performance_Model Performance Model
-
-\enum starpu_perfmodel_type
-\ingroup API_Performance_Model
-TODO
-\var starpu_perfmodel_type::STARPU_PERFMODEL_INVALID
-    todo
-\var starpu_perfmodel_type::STARPU_PER_ARCH
-    Application-provided per-arch cost model function
-\var starpu_perfmodel_type::STARPU_COMMON
-    Application-provided common cost model function, with per-arch
-    factor
-\var starpu_perfmodel_type::STARPU_HISTORY_BASED
-    Automatic history-based cost model
-\var starpu_perfmodel_type::STARPU_REGRESSION_BASED
-    Automatic linear regression-based cost model  (alpha * size ^
-    beta)
-\var starpu_perfmodel_type::STARPU_NL_REGRESSION_BASED
-    Automatic non-linear regression-based cost model (a * size ^ b +
-    c)
-\var starpu_perfmodel_type::STARPU_MULTIPLE_REGRESSION_BASED
-    Automatic multiple linear regression-based cost model. Application
-    provides parameters, their combinations and exponents.
-
-\struct starpu_perfmodel_device
-todo
-\ingroup API_Performance_Model
-\var enum starpu_worker_archtype starpu_perfmodel_device::type
-    type of the device
-\var int starpu_perfmodel_device::devid
-    identifier of the precise device
-\var int starpu_perfmodel_device::ncore
-    number of executions in parallel, minus 1
-
-\struct starpu_perfmodel_arch
-todo
-\ingroup API_Performance_Model
-\var int starpu_perfmodel_arch::ndevices
-    number of the devices for the given arch
-\var struct starpu_perfmodel_device *starpu_perfmodel_arch::devices
-    list of the devices for the given arch
-
-\struct starpu_perfmodel
-Contain all information about a performance model. At least the
-type and symbol fields have to be filled when defining a performance
-model for a codelet. For compatibility, make sure to initialize the
-whole structure to zero, either by using explicit memset, or by
-letting the compiler implicitly do it in e.g. static storage case. If
-not provided, other fields have to be zero.
-\ingroup API_Performance_Model
-\var enum starpu_perfmodel_type starpu_perfmodel::type
-    type of performance model
-    <ul>
-    <li>
-    ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED,
-    ::STARPU_NL_REGRESSION_BASED: No other field needs to be
-    provided, this is purely history-based.
-    </li>
-    <li>
-    ::STARPU_MULTIPLE_REGRESSION_BASED: Need to provide fields
-    starpu_perfmodel::nparameters (number of different parameters),
-    starpu_perfmodel::ncombinations (number of parameters
-    combinations-tuples) and table starpu_perfmodel::combinations
-    which defines exponents of the equation. Function cl_perf_func
-    also needs to define how to extract parameters from the task. 
-    </li>
-    <li>
-    ::STARPU_PER_ARCH: either field
-    starpu_perfmodel::arch_cost_function has to be filled with a
-    function that returns the cost in micro-seconds on the arch given
-    as parameter, or field starpu_perfmodel::per_arch has to be filled
-    with functions which return the cost in micro-seconds.
-    </li>
-    <li>
-    ::STARPU_COMMON: field starpu_perfmodel::cost_function has to be
-    filled with a function that returns the cost in micro-seconds on a
-    CPU, timing on other archs will be determined by multiplying by an
-    arch-specific factor.
-    </li>
-    </ul>
-\var const char *starpu_perfmodel::symbol
-    symbol name for the performance model, which will be used as file
-    name to store the model. It must be set otherwise the model will
-    be ignored.
-\var double (*starpu_perfmodel::cost_function)(struct starpu_task *, unsigned nimpl)
-    Used by ::STARPU_COMMON. Take a task and implementation number,
-    and must return a task duration estimation in micro-seconds.
-\var double (*starpu_perfmodel::arch_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch* arch, unsigned nimpl)
-    Used by ::STARPU_PER_ARCH. Take a task, an arch and implementation
-    number, and must return a task duration estimation in
-    micro-seconds on that arch.
-\var size_t (*starpu_perfmodel::size_base)(struct starpu_task *, unsigned nimpl)
-    Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and
-    ::STARPU_NL_REGRESSION_BASED. If not <c>NULL</c>, take a task and
-    implementation number, and return the size to be used as index to
-    distinguish histories and as a base for regressions.
-\var uint32_t (*starpu_perfmodel::footprint)(struct starpu_task *)
-    Used by ::STARPU_HISTORY_BASED. If not <c>NULL</c>, take a task
-    and return the footprint to be used as index to distinguish
-    histories. The default is to use the starpu_task_data_footprint()
-    function.
-\var unsigned starpu_perfmodel::is_loaded
-\private
-    Whether the performance model is already loaded from the disk.
-\var unsigned starpu_perfmodel::benchmarking
-\private
-    todo
-\var unsigned starpu_perfmodel::is_init
-    todo
-\var starpu_perfmodel_state_t starpu_perfmodel::state
-\private
-    todo
-\var void (*starpu_perfmodel::parameters)(struct starpu_task * task, double *parameters);
-    todo
-\var const char ** starpu_perfmodel::parameters_names
-\private
-    Names of parameters used for multiple linear regression models (M,
-    N, K)
-\var unsigned starpu_perfmodel::nparameters
-\private
-    Number of parameters used for multiple linear regression models
-\var unsigned ** starpu_perfmodel::combinations
-\private
-    Table of combinations of parameters (and the exponents) used for
-    multiple linear regression models
-\var unsigned starpu_perfmodel::ncombinations
-\private
-    Number of combinations of parameters used for multiple linear
-    regression models
-
-\struct starpu_perfmodel_regression_model
-todo
-\ingroup API_Performance_Model
-\var double starpu_perfmodel_regression_model::sumlny
-    sum of ln(measured)
-\var double starpu_perfmodel_regression_model::sumlnx
-    sum of ln(size)
-\var double starpu_perfmodel_regression_model::sumlnx2
-    sum of ln(size)^2
-\var unsigned long starpu_perfmodel_regression_model::minx
-    minimum size
-\var unsigned long starpu_perfmodel_regression_model::maxx
-    maximum size
-\var double starpu_perfmodel_regression_model::sumlnxlny
-    sum of ln(size)*ln(measured)
-\var double starpu_perfmodel_regression_model::alpha
-    estimated = alpha * size ^ beta
-\var double starpu_perfmodel_regression_model::beta
-    estimated = alpha * size ^ beta
-\var unsigned starpu_perfmodel_regression_model::valid
-    whether the linear regression model is valid (i.e. enough measures)
-\var double starpu_perfmodel_regression_model::a
-    estimated = a * size ^ b + c
-\var double starpu_perfmodel_regression_model::b
-    estimated = a * size ^ b + c
-\var double starpu_perfmodel_regression_model::c
-    estimated = a * size ^ b + c
-\var unsigned starpu_perfmodel_regression_model::nl_valid
-    whether the non-linear regression model is valid (i.e. enough measures)
-\var unsigned starpu_perfmodel_regression_model::nsample
-    number of sample values for non-linear regression
-\var double starpu_perfmodel_regression_model::coeff[]
-    list of computed coefficients for multiple linear regression model
-\var double starpu_perfmodel_regression_model::ncoeff
-    number of coefficients for multiple linear regression model
-\var double starpu_perfmodel_regression_model::multi_valid
-    whether the multiple linear regression model is valid
-
-\struct starpu_perfmodel_per_arch
-contains information about the performance model of a given
-arch.
-\ingroup API_Performance_Model
-\var starpu_perfmodel_per_arch_cost_function starpu_perfmodel_per_arch::cost_function
-    Used by ::STARPU_PER_ARCH, must point to functions which take a
-    task, the target arch and implementation number (as a mere
-    convenience, since the array is already indexed by these), and
-    must return a task duration estimation in micro-seconds.
-\var starpu_perfmodel_per_arch_size_base starpu_perfmodel_per_arch::size_base
-    Same as in structure starpu_perfmodel, but per-arch, in case it
-    depends on the architecture-specific implementation.
-\var struct starpu_perfmodel_history_table *starpu_perfmodel_per_arch::history
-\private
-    The history of performance measurements.
-\var struct starpu_perfmodel_history_list *starpu_perfmodel_per_arch::list
-\private
-    Used by ::STARPU_HISTORY_BASED, ::STARPU_NL_REGRESSION_BASED and
-    ::STARPU_MULTIPLE_REGRESSION_BASED, records all execution history
-    measures.
-\var struct starpu_perfmodel_regression_model starpu_perfmodel_per_arch::regression
-\private
-    Used by ::STARPU_REGRESSION_BASED, ::STARPU_NL_REGRESSION_BASED
-    and ::STARPU_MULTIPLE_REGRESSION_BASED, contains the estimated
-    factors of the regression.
-
-\struct starpu_perfmodel_history_list
-todo
-\ingroup API_Performance_Model
-\var struct starpu_perfmodel_history_list *starpu_perfmodel_history_list::next
-    todo
-\var struct starpu_perfmodel_history_entry *starpu_perfmodel_history_list::entry
-    todo
-
-\struct starpu_perfmodel_history_entry
-todo
-\ingroup API_Performance_Model
-\var double starpu_perfmodel_history_entry::mean
-    mean_n = 1/n sum
-\var double starpu_perfmodel_history_entry::deviation
-    n dev_n = sum2 - 1/n (sum)^2
-\var double starpu_perfmodel_history_entry::sum
-    sum of samples (in µs)
-\var double starpu_perfmodel_history_entry::sum2
-    sum of samples^2
-\var unsigned starpu_perfmodel_history_entry::nsample
-    number of samples
-\var uint32_t starpu_perfmodel_history_entry::footprint
-    data footprint
-\var size_t starpu_perfmodel_history_entry::size
-    in bytes
-\var double starpu_perfmodel_history_entry::flops
-    Provided by the application
-
-\fn void starpu_perfmodel_init(struct starpu_perfmodel *model)
-\ingroup API_Performance_Model
-todo
-
-\fn void starpu_perfmodel_free_sampling_directories(void)
-\ingroup API_Performance_Model
-Free internal memory used for sampling directory
-management. It should only be called by an application which is not
-calling starpu_shutdown() as this function already calls it. See for
-example <c>tools/starpu_perfmodel_display.c</c>.
-
-\fn int starpu_perfmodel_load_file(const char *filename, struct starpu_perfmodel *model)
-\ingroup API_Performance_Model
-Load the performance model found in the file named \p filename. \p model has to be
-completely zero, and will be filled with the information stored in the given file.
-
-\fn int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model)
-\ingroup API_Performance_Model
-Load a given performance model. \p model has to be
-completely zero, and will be filled with the information stored in
-<c>$STARPU_HOME/.starpu</c>. The function is intended to be used by
-external tools that want to read the performance model files.
-
-\fn int starpu_perfmodel_unload_model(struct starpu_perfmodel *model)
-\ingroup API_Performance_Model
-Unload \p model which has been previously loaded
-through the function starpu_perfmodel_load_symbol()
-
-\fn void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, char *path, size_t maxlen, unsigned nimpl)
-\ingroup API_Performance_Model
-Return the path to the debugging information for the performance model.
-
-\fn char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype)
-\ingroup API_Performance_Model
-todo
-
-\fn void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch *arch, char *archname, size_t maxlen, unsigned nimpl)
-\ingroup API_Performance_Model
-Return the architecture name for \p arch
-
-\fn struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid, unsigned sched_ctx_id)
-\ingroup API_Performance_Model
-Return the architecture type of the worker \p workerid.
-
-\fn void starpu_perfmodel_initialize(void)
-\ingroup API_Performance_Model
-If starpu_init is not used, starpu_perfmodel_initialize should be used before calling starpu_perfmodel_* functions.
-
-\fn int starpu_perfmodel_list(FILE *output)
-\ingroup API_Performance_Model
-Print a list of all performance models on \p output
-
-\fn void starpu_perfmodel_directory(FILE *output)
-\ingroup API_Performance_Model
-Print the directory name storing performance models on \p output
-
-\fn void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
-\ingroup API_Performance_Model
-todo
-
-\fn int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output)
-\ingroup API_Performance_Model
-todo
-
-\fn int starpu_perfmodel_print_estimations(struct starpu_perfmodel *model, uint32_t footprint, FILE *output)
-\ingroup API_Performance_Model
-todo
-
-\fn void starpu_bus_print_bandwidth(FILE *f)
-\ingroup API_Performance_Model
-Print a matrix of bus bandwidths on \p f.
-
-\fn void starpu_bus_print_affinity(FILE *f)
-\ingroup API_Performance_Model
-Print the affinity devices on \p f.
-
-\fn void starpu_bus_print_filenames(FILE *f)
-\ingroup API_Performance_Model
-Print on \p f the name of the files containing the matrix of bus bandwidths, the affinity devices and the latency.
-
-\fn void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double measured);
-\ingroup API_Performance_Model
-Feed the performance model \p model with an explicit
-measurement \p measured (in µs), in addition to measurements done by StarPU
-itself. This can be useful when the application already has an
-existing set of measurements done in good conditions, that StarPU
-could benefit from instead of doing on-line measurements. An example
-of use can be seen in \ref PerformanceModelExample.
-
-\fn double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
-\ingroup API_Performance_Model
-Return the bandwidth of data transfer between two memory nodes
-
-\fn double starpu_transfer_latency(unsigned src_node, unsigned dst_node)
-\ingroup API_Performance_Model
-Return the latency of data transfer between two memory nodes
-
-\fn double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size)
-\ingroup API_Performance_Model
-Return the estimated time to transfer a given size between two memory nodes.
-
-\fn double starpu_perfmodel_history_based_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, uint32_t footprint)
-\ingroup API_Performance_Model
-Return the estimated time of a task with the given model and the given footprint.
-
-\var starpu_perfmodel_nop
-Performance model which just always returns 1µs.
-
-*/

+ 0 - 202
doc/doxygen/chapters/api/profiling.doxy

@@ -1,202 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010-2015,2017                           CNRS
- * Copyright (C) 2009-2011,2014,2016,2018-2019            Université de Bordeaux
- * Copyright (C) 2011,2012                                Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \defgroup API_Profiling Profiling
-
-\struct starpu_profiling_task_info
-\ingroup API_Profiling
-This structure contains information about the execution of a
-task. It is accessible from the field starpu_task::profiling_info if
-profiling was enabled.
-\var struct timespec starpu_profiling_task_info::submit_time
-    Date of task submission (relative to the initialization of StarPU).
-
-\var struct timespec starpu_profiling_task_info::push_start_time
-    Time when the task was submitted to the scheduler.
-
-\var struct timespec starpu_profiling_task_info::push_end_time
-    Time when the scheduler finished with the task submission.
-
-\var struct timespec starpu_profiling_task_info::pop_start_time
-    Time when the scheduler started to be requested for a task, and eventually gave that task.
-
-\var struct timespec starpu_profiling_task_info::pop_end_time
-    Time when the scheduler finished providing the task for execution.
-
-\var struct timespec starpu_profiling_task_info::acquire_data_start_time
-    Time when the worker started fetching input data.
-
-\var struct timespec starpu_profiling_task_info::acquire_data_end_time
-    Time when the worker finished fetching input data.
-
-\var struct timespec starpu_profiling_task_info::start_time
-    Date of task execution beginning (relative to the initialization of StarPU).
-
-\var struct timespec starpu_profiling_task_info::end_time
-    Date of task execution termination (relative to the initialization of StarPU).
-
-\var struct timespec starpu_profiling_task_info::release_data_start_time
-    Time when the worker started releasing data.
-
-\var struct timespec starpu_profiling_task_info::release_data_end_time
-    Time when the worker finished releasing data.
-
-\var struct timespec starpu_profiling_task_info::callback_start_time
-    Time when the worker started the application callback for the task.
-
-\var struct timespec starpu_profiling_task_info::callback_end_time
-    Time when the worker finished the application callback for the task.
-
-\var int starpu_profiling_task_info::workerid
-    Identifier of the worker which has executed the task.
-
-\var uint64_t starpu_profiling_task_info::used_cycles
-    Number of cycles used by the task, only available in the MoviSim
-
-\var uint64_t starpu_profiling_task_info::stall_cycles
-    Number of cycles stalled within the task, only available in the MoviSim
-
-\var double starpu_profiling_task_info::energy_consumed
-Energy consumed by the task, in Joules
-
-\struct starpu_profiling_worker_info
-This structure contains the profiling information associated to
-a worker. The timing is provided since the previous call to
-starpu_profiling_worker_get_info()
-\ingroup API_Profiling
-\var struct timespec starpu_profiling_worker_info::start_time
-        Starting date for the reported profiling measurements.
-\var struct timespec starpu_profiling_worker_info::total_time
-        Duration of the profiling measurement interval.
-\var struct timespec starpu_profiling_worker_info::executing_time
-        Time spent by the worker to execute tasks during the profiling measurement interval.
-\var struct timespec starpu_profiling_worker_info::sleeping_time
-        Time spent idling by the worker during the profiling measurement interval.
-\var int starpu_profiling_worker_info::executed_tasks
-        Number of tasks executed by the worker during the profiling measurement interval.
-\var uint64_t starpu_profiling_worker_info::used_cycles
-        Number of cycles used by the worker, only available in the MoviSim
-\var uint64_t starpu_profiling_worker_info::stall_cycles
-        Number of cycles stalled within the worker, only available in the MoviSim
-\var double starpu_profiling_worker_info::energy_consumed
-        Energy consumed by the worker, in Joules
-
-\struct starpu_profiling_bus_info
-todo
-\ingroup API_Profiling
-\var struct timespec starpu_profiling_bus_info::start_time
-        Time of bus profiling startup.
-\var struct timespec starpu_profiling_bus_info::total_time
-        Total time of bus profiling.
-\var int long long starpu_profiling_bus_info::transferred_bytes
-        Number of bytes transferred during profiling.
-\var int starpu_profiling_bus_info::transfer_count
-        Number of transfers during profiling.
-
-\typedef STARPU_PROFILING_DISABLE
-\ingroup API_Profiling
-Used when calling the function starpu_profiling_status_set() to disable profiling.
-
-\typedef STARPU_PROFILING_ENABLE
-\ingroup API_Profiling
-Used when calling the function starpu_profiling_status_set() to enable profiling.
-
-\fn int starpu_profiling_status_set(int status)
-\ingroup API_Profiling
-Set the profiling status. Profiling is activated
-by passing \ref STARPU_PROFILING_ENABLE in \p status. Passing
-\ref STARPU_PROFILING_DISABLE disables profiling. Calling this function
-resets all profiling measurements. When profiling is enabled, the
-field starpu_task::profiling_info points to a valid structure
-starpu_profiling_task_info containing information about the execution
-of the task. Negative return values indicate an error, otherwise the
-previous status is returned.
-
-\fn int starpu_profiling_status_get(void)
-\ingroup API_Profiling
-Return the current profiling status or a negative value in case
-there was an error.
-
-\fn void starpu_profiling_init(void)
-\ingroup API_Profiling
-Reset performance counters and enable profiling if the
-environment variable \ref STARPU_PROFILING is set to a positive value.
-
-\fn void starpu_profiling_set_id(int new_id)
-\ingroup API_Profiling
-Set the ID used for profiling trace filename. HAS to be called before starpu_init().
-
-\fn int starpu_profiling_worker_get_info(int workerid, struct starpu_profiling_worker_info *worker_info)
-\ingroup API_Profiling
-Get the profiling info associated to the worker identified by
-\p workerid, and reset the profiling measurements. If the argument \p
-worker_info is <c>NULL</c>, only reset the counters associated to worker
-\p workerid. Upon successful completion, this function returns 0.
-Otherwise, a negative value is returned.
-
-\fn int starpu_bus_get_profiling_info(int busid, struct starpu_profiling_bus_info *bus_info)
-\ingroup API_Profiling
-
-todo
-
-See _starpu_profiling_bus_helper_display_summary in src/profiling/profiling_helpers.c for a usage example.
-Note that calling starpu_bus_get_profiling_info resets the counters to zero.
-
-\fn int starpu_bus_get_count(void)
-\ingroup API_Profiling
-Return the number of buses in the machine
-
-\fn int starpu_bus_get_id(int src, int dst)
-\ingroup API_Profiling
-Return the identifier of the bus between \p src and \p dst
-
-\fn int starpu_bus_get_src(int busid)
-\ingroup API_Profiling
-Return the source point of bus \p busid
-
-\fn int starpu_bus_get_dst(int busid)
-\ingroup API_Profiling
-Return the destination point of bus \p busid
-
-\fn double starpu_timing_timespec_delay_us(struct timespec *start, struct timespec *end)
-\ingroup API_Profiling
-Return the time elapsed between \p start and \p end in microseconds.
-
-\fn double starpu_timing_timespec_to_us(struct timespec *ts)
-\ingroup API_Profiling
-Convert the given timespec \p ts into microseconds
-
-\fn void starpu_profiling_bus_helper_display_summary(void)
-\ingroup API_Profiling
-Display statistics about the bus on \c stderr if the environment
-variable \ref STARPU_BUS_STATS is defined. The function is called
-automatically by starpu_shutdown().
-
-\fn void starpu_profiling_worker_helper_display_summary(void)
-\ingroup API_Profiling
-Display statistics about the workers on \c stderr if the
-environment variable \ref STARPU_WORKER_STATS is defined. The function is
-called automatically by starpu_shutdown().
-
-\fn void starpu_data_display_memory_stats()
-\ingroup API_Profiling
-Display statistics about the current data handles registered
-within StarPU. StarPU must have been configured with the configure
-option \ref enable-memory-stats "--enable-memory-stats" (see \ref MemoryFeedback).
-
-*/

+ 2 - 18
doc/doxygen/chapters/api/scc_extensions.doxy

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2010-2013,2015,2017,2019                      CNRS
  * Copyright (C) 2009-2011,2014                           Université de Bordeaux
  * Copyright (C) 2009-2011,2014                           Université de Bordeaux
  * Copyright (C) 2011,2012                                Inria
  * Copyright (C) 2011,2012                                Inria
  *
  *
@@ -16,7 +16,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
-/*! \defgroup API_SCC_Extensions SCC Extensions
+/*! \ingroup API_SCC_Extensions
 
 
 \def STARPU_USE_SCC
 \def STARPU_USE_SCC
 \ingroup API_SCC_Extensions
 \ingroup API_SCC_Extensions
@@ -28,20 +28,4 @@ It should be used in your code to detect the availability of SCC.
 Define the maximum number of SCC devices that are
 Define the maximum number of SCC devices that are
 supported by StarPU.
 supported by StarPU.
 
 
-\typedef starpu_scc_func_symbol_t
-\ingroup API_SCC_Extensions
-Type for SCC function symbols
-
-\fn int starpu_scc_register_kernel(starpu_scc_func_symbol_t *symbol, const char *func_name)
-\ingroup API_SCC_Extensions
-Initiate a lookup on each SCC device to find the address of the
-function named \p func_name, store them in the global array kernels
-and return the index in the array through \p symbol.
-
-\fn starpu_scc_kernel_t starpu_scc_get_kernel(starpu_scc_func_symbol_t symbol)
-\ingroup API_SCC_Extensions
-If success, return the pointer to the function defined by \p symbol on
-the device linked to the called device. This can for instance be used
-in a starpu_scc_func_symbol_t implementation.
-
 */
 */

+ 5 - 6
include/starpu_bitmap.h

@@ -19,11 +19,10 @@
 #ifndef __STARPU_BITMAP_H__
 #ifndef __STARPU_BITMAP_H__
 #define __STARPU_BITMAP_H__
 #define __STARPU_BITMAP_H__
 
 
-/** @defgroup API_Bitmap Bitmap
-
-    @brief This is the interface for the bitmap utilities provided by StarPU.
-
-    @{
+/**
+   @defgroup API_Bitmap Bitmap
+   @brief This is the interface for the bitmap utilities provided by StarPU.
+   @{
  */
  */
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
@@ -33,7 +32,7 @@ extern "C"
 
 
 /** create an empty starpu_bitmap */
 /** create an empty starpu_bitmap */
 struct starpu_bitmap *starpu_bitmap_create(void) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_bitmap *starpu_bitmap_create(void) STARPU_ATTRIBUTE_MALLOC;
-/** free \b */
+/** free \p b */
 void starpu_bitmap_destroy(struct starpu_bitmap *b);
 void starpu_bitmap_destroy(struct starpu_bitmap *b);
 
 
 /** set bit \p e in \p b */
 /** set bit \p e in \p b */

+ 31 - 20
include/starpu_bound.h

@@ -18,12 +18,11 @@
 #ifndef __STARPU_BOUND_H__
 #ifndef __STARPU_BOUND_H__
 #define __STARPU_BOUND_H__
 #define __STARPU_BOUND_H__
 
 
-/** @defgroup API_Theoretical_Lower_Bound_on_Execution_Time Theoretical Lower Bound on Execution Time
-
-    @brief Compute theoretical upper computation efficiency bound corresponding to some actual execution.
-
-    @{
- */
+/**
+   @defgroup API_Theoretical_Lower_Bound_on_Execution_Time Theoretical Lower Bound on Execution Time
+   @brief Compute theoretical upper computation efficiency bound corresponding to some actual execution.
+   @{
+*/
 
 
 #include <stdio.h>
 #include <stdio.h>
 
 
@@ -32,34 +31,46 @@ extern "C"
 {
 {
 #endif
 #endif
 
 
-/** Start recording tasks (resets stats). \p deps tells whether dependencies should be recorded too (this is quite expensive) */
+/**
+   Start recording tasks (resets stats). \p deps tells whether
+   dependencies should be recorded too (this is quite expensive)
+*/
 void starpu_bound_start(int deps, int prio);
 void starpu_bound_start(int deps, int prio);
-/** Stop recording tasks */
+
+/**
+   Stop recording tasks
+*/
 void starpu_bound_stop(void);
 void starpu_bound_stop(void);
 
 
-/** Emit the DAG that was recorded on \p output. */
+/**
+   Emit the DAG that was recorded on \p output.
+*/
 void starpu_bound_print_dot(FILE *output);
 void starpu_bound_print_dot(FILE *output);
 
 
-/** Get theoretical upper bound (in ms) (needs glpk support
-    detected by configure script). It returns 0 if some performance models
-    are not calibrated.
+/**
+   Get theoretical upper bound (in ms) (needs glpk support detected by
+   configure script). It returns 0 if some performance models are not
+   calibrated.
 */
 */
 void starpu_bound_compute(double *res, double *integer_res, int integer);
 void starpu_bound_compute(double *res, double *integer_res, int integer);
 
 
-/** Emit the Linear Programming system on \p output for the recorded
-    tasks, in the lp format
+/**
+   Emit the Linear Programming system on \p output for the recorded
+   tasks, in the lp format
 */
 */
 void starpu_bound_print_lp(FILE *output);
 void starpu_bound_print_lp(FILE *output);
 
 
-/** Emit the Linear Programming system on \p output for the recorded
-    tasks, in the mps format
+/**
+   Emit the Linear Programming system on \p output for the recorded
+   tasks, in the mps format
 */
 */
 void starpu_bound_print_mps(FILE *output);
 void starpu_bound_print_mps(FILE *output);
 
 
-/** Emit on \p output the statistics of actual execution vs theoretical upper bound.
-    \p integer permits to choose between integer solving (which takes a
-    long time but is correct), and relaxed solving (which provides an
-    approximate solution).
+/**
+   Emit on \p output the statistics of actual execution vs theoretical
+   upper bound. \p integer permits to choose between integer solving
+   (which takes a long time but is correct), and relaxed solving
+   (which provides an approximate solution).
 */
 */
 void starpu_bound_print(FILE *output, int integer);
 void starpu_bound_print(FILE *output, int integer);
 
 

+ 3 - 3
include/starpu_clusters.h

@@ -19,9 +19,9 @@
 #ifndef __STARPU_CLUSTERS_UTIL_H__
 #ifndef __STARPU_CLUSTERS_UTIL_H__
 #define __STARPU_CLUSTERS_UTIL_H__
 #define __STARPU_CLUSTERS_UTIL_H__
 
 
-/** @defgroup API_Clustering_Machine Clustering Machine
-
-    @{
+/**
+   @defgroup API_Clustering_Machine Clustering Machine
+   @{
  */
  */
 
 
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC

+ 8 - 7
include/starpu_cublas.h

@@ -18,9 +18,9 @@
 #ifndef __STARPU_CUBLAS_H__
 #ifndef __STARPU_CUBLAS_H__
 #define __STARPU_CUBLAS_H__
 #define __STARPU_CUBLAS_H__
 
 
-/** @ingroup API_CUDA_Extensions
-
-    @{
+/**
+   @ingroup API_CUDA_Extensions
+   @{
  */
  */
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
@@ -38,10 +38,11 @@ extern "C"
 void starpu_cublas_init(void);
 void starpu_cublas_init(void);
 
 
 /**
 /**
-   Set the proper CUBLAS stream for CUBLAS v1. This must be called from the CUDA
-   codelet before calling CUBLAS v1 kernels, so that they are queued on the proper
-   CUDA stream. When using one thread per CUDA worker, this function does not
-   do anything since the CUBLAS stream does not change, and is set once by
+   Set the proper CUBLAS stream for CUBLAS v1. This must be called
+   from the CUDA codelet before calling CUBLAS v1 kernels, so that
+   they are queued on the proper CUDA stream. When using one thread
+   per CUDA worker, this function does not do anything since the
+   CUBLAS stream does not change, and is set once by
    starpu_cublas_init().
    starpu_cublas_init().
 */
 */
 void starpu_cublas_set_stream(void);
 void starpu_cublas_set_stream(void);

+ 5 - 5
include/starpu_cublas_v2.h

@@ -18,9 +18,9 @@
 #ifndef __STARPU_CUBLAS_V2_H__
 #ifndef __STARPU_CUBLAS_V2_H__
 #define __STARPU_CUBLAS_V2_H__
 #define __STARPU_CUBLAS_V2_H__
 
 
-/** @ingroup API_CUDA_Extensions
-
-    @{
+/**
+   @ingroup API_CUDA_Extensions
+   @{
  */
  */
 
 
 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
@@ -33,8 +33,8 @@ extern "C"
 #endif
 #endif
 
 
 /**
 /**
-   Return the CUSPARSE handle to be used to queue CUSPARSE
-   kernels. It is properly initialized and configured for multistream by
+   Return the CUBLAS handle to be used to queue CUBLAS kernels. It
+   is properly initialized and configured for multistream by
    starpu_cublas_init().
    starpu_cublas_init().
 */
 */
 cublasHandle_t starpu_cublas_get_local_handle(void);
 cublasHandle_t starpu_cublas_get_local_handle(void);

+ 38 - 27
include/starpu_cuda.h

@@ -19,9 +19,9 @@
 #ifndef __STARPU_CUDA_H__
 #ifndef __STARPU_CUDA_H__
 #define __STARPU_CUDA_H__
 #define __STARPU_CUDA_H__
 
 
-/** @defgroup API_CUDA_Extensions CUDA Extensions
-
-    @{
+/**
+   @defgroup API_CUDA_Extensions CUDA Extensions
+   @{
  */
  */
 
 
 #include <starpu_config.h>
 #include <starpu_config.h>
@@ -36,49 +36,60 @@ extern "C"
 {
 {
 #endif
 #endif
 
 
-/** Report a CUBLAS error. */
+/**
+   Report a CUBLAS error.
+*/
 void starpu_cublas_report_error(const char *func, const char *file, int line, int status);
 void starpu_cublas_report_error(const char *func, const char *file, int line, int status);
 
 
-/** Calls starpu_cublas_report_error(), passing the current function, file and line position.*/
+/**
+   Call starpu_cublas_report_error(), passing the current function, file and line position.
+*/
 #define STARPU_CUBLAS_REPORT_ERROR(status) starpu_cublas_report_error(__starpu_func__, __FILE__, __LINE__, status)
 #define STARPU_CUBLAS_REPORT_ERROR(status) starpu_cublas_report_error(__starpu_func__, __FILE__, __LINE__, status)
 
 
-/** Report a CUDA error. */
+/**
+   Report a CUDA error.
+*/
 void starpu_cuda_report_error(const char *func, const char *file, int line, cudaError_t status);
 void starpu_cuda_report_error(const char *func, const char *file, int line, cudaError_t status);
 
 
-/** Calls starpu_cuda_report_error(), passing the current function, file and line position.*/
+/**
+   Call starpu_cuda_report_error(), passing the current function, file and line position.
+*/
 #define STARPU_CUDA_REPORT_ERROR(status) starpu_cuda_report_error(__starpu_func__, __FILE__, __LINE__, status)
 #define STARPU_CUDA_REPORT_ERROR(status) starpu_cuda_report_error(__starpu_func__, __FILE__, __LINE__, status)
 
 
 /**
 /**
-    Return the current worker’s CUDA stream. StarPU
-    provides a stream for every CUDA device controlled by StarPU. This
-    function is only provided for convenience so that programmers can
-    easily use asynchronous operations within codelets without having to
-    create a stream by hand. Note that the application is not forced to
-    use the stream provided by starpu_cuda_get_local_stream() and may also
-    create its own streams. Synchronizing with <c>cudaThreadSynchronize()</c> is
-    allowed, but will reduce the likelihood of having all transfers
-    overlapped.
+   Return the current worker’s CUDA stream. StarPU provides a stream
+   for every CUDA device controlled by StarPU. This function is only
+   provided for convenience so that programmers can easily use
+   asynchronous operations within codelets without having to create a
+   stream by hand. Note that the application is not forced to use the
+   stream provided by starpu_cuda_get_local_stream() and may also
+   create its own streams. Synchronizing with
+   <c>cudaThreadSynchronize()</c> is allowed, but will reduce the
+   likelihood of having all transfers overlapped.
 */
 */
 cudaStream_t starpu_cuda_get_local_stream(void);
 cudaStream_t starpu_cuda_get_local_stream(void);
 
 
-/** Return a pointer to device properties for worker \p workerid (assumed to be a CUDA worker). */
+/**
+   Return a pointer to device properties for worker \p workerid
+   (assumed to be a CUDA worker).
+*/
 const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid);
 const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid);
 
 
 /**
 /**
-    Copy \p ssize bytes from the pointer \p src_ptr on \p src_node
-    to the pointer \p dst_ptr on \p dst_node. The function first tries to
-    copy the data asynchronous (unless \p stream is <c>NULL</c>). If the
-    asynchronous copy fails or if \p stream is <c>NULL</c>, it copies the
-    data synchronously. The function returns <c>-EAGAIN</c> if the
-    asynchronous launch was successfull. It returns 0 if the synchronous
-    copy was successful, or fails otherwise.
+   Copy \p ssize bytes from the pointer \p src_ptr on \p src_node
+   to the pointer \p dst_ptr on \p dst_node. The function first tries to
+   copy the data asynchronously (unless \p stream is <c>NULL</c>). If the
+   asynchronous copy fails or if \p stream is <c>NULL</c>, it copies the
+   data synchronously. The function returns <c>-EAGAIN</c> if the
+   asynchronous launch was successful. It returns 0 if the synchronous
+   copy was successful, or fails otherwise.
 */
 */
 int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind);
 int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind);
 
 
 /**
 /**
-    Calls <c>cudaSetDevice(\p devid)</c> or <c>cudaGLSetGLDevice(\p devid)</c>,
-    according to whether \p devid is among the field
-    starpu_conf::cuda_opengl_interoperability.
+   Call <c>cudaSetDevice(\p devid)</c> or <c>cudaGLSetGLDevice(\p devid)</c>,
+   according to whether \p devid is among the field
+   starpu_conf::cuda_opengl_interoperability.
 */
 */
 void starpu_cuda_set_device(unsigned devid);
 void starpu_cuda_set_device(unsigned devid);
 
 

+ 6 - 6
include/starpu_cusparse.h

@@ -18,10 +18,10 @@
 #ifndef __STARPU_CUSPARSE_H__
 #ifndef __STARPU_CUSPARSE_H__
 #define __STARPU_CUSPARSE_H__
 #define __STARPU_CUSPARSE_H__
 
 
-/** @ingroup API_CUDA_Extensions
-
-    @{
- */
+/**
+   @ingroup API_CUDA_Extensions
+   @{
+*/
 
 
 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
 #include <cusparse.h>
 #include <cusparse.h>
@@ -40,14 +40,14 @@ extern "C"
 void starpu_cusparse_init(void);
 void starpu_cusparse_init(void);
 
 
 /**
 /**
-   Synchronously deinitialize the CUSPARSE library on
+   @brief Synchronously deinitialize the CUSPARSE library on
    every CUDA device.
    every CUDA device.
 */
 */
 void starpu_cusparse_shutdown(void);
 void starpu_cusparse_shutdown(void);
 
 
 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
 /**
 /**
-   Return the CUSPARSE handle to be used to queue CUSPARSE
+   @brief Return the CUSPARSE handle to be used to queue CUSPARSE
    kernels. It is properly initialized and configured for multistream by
    kernels. It is properly initialized and configured for multistream by
    starpu_cusparse_init().
    starpu_cusparse_init().
 */
 */

+ 11 - 12
include/starpu_data.h

@@ -19,14 +19,13 @@
 #ifndef __STARPU_DATA_H__
 #ifndef __STARPU_DATA_H__
 #define __STARPU_DATA_H__
 #define __STARPU_DATA_H__
 
 
-/** @defgroup API_Data_Management Data Management
-
-    @brief Data management facilities provided by StarPU. We show how
-    to use existing data interfaces in \ref API_Data_Interfaces, but
-    developers can design their own data interfaces if required.
-
-    @{
- */
+/**
+   @defgroup API_Data_Management Data Management
+   @brief Data management facilities provided by StarPU. We show how
+   to use existing data interfaces in \ref API_Data_Interfaces, but
+   developers can design their own data interfaces if required.
+   @{
+*/
 
 
 #include <starpu.h>
 #include <starpu.h>
 
 
@@ -52,12 +51,12 @@ struct _starpu_data_state;
 typedef struct _starpu_data_state* starpu_data_handle_t;
 typedef struct _starpu_data_state* starpu_data_handle_t;
 
 
 /**
 /**
-    Describe a StarPU data access mode
+   Describe a StarPU data access mode
 
 
-    Note: when adding a flag here, update
-    _starpu_detect_implicit_data_deps_with_handle
+   Note: when adding a flag here, update
+   _starpu_detect_implicit_data_deps_with_handle
 
 
-    Note: other STARPU_* values in include/starpu_task_util.h
+   Note: other STARPU_* values in include/starpu_task_util.h
  */
  */
 enum starpu_data_access_mode
 enum starpu_data_access_mode
 {
 {

+ 89 - 81
include/starpu_data_filters.h

@@ -21,10 +21,10 @@
 #ifndef __STARPU_DATA_FILTERS_H__
 #ifndef __STARPU_DATA_FILTERS_H__
 #define __STARPU_DATA_FILTERS_H__
 #define __STARPU_DATA_FILTERS_H__
 
 
-/** @defgroup API_Data_Partition Data Partition
-
-    @{
- */
+/**
+   @defgroup API_Data_Partition Data Partition
+   @{
+*/
 
 
 #include <starpu.h>
 #include <starpu.h>
 #include <stdarg.h>
 #include <stdarg.h>
@@ -36,7 +36,9 @@ extern "C"
 
 
 struct starpu_data_interface_ops;
 struct starpu_data_interface_ops;
 
 
-/** Describe a data partitioning operation, to be given to starpu_data_partition() */
+/**
+   Describe a data partitioning operation, to be given to starpu_data_partition()
+*/
 struct starpu_data_filter
 struct starpu_data_filter
 {
 {
 	/**
 	/**
@@ -99,37 +101,38 @@ struct starpu_data_filter
 	void *filter_arg_ptr;
 	void *filter_arg_ptr;
 };
 };
 
 
-/** @name Basic API
- *
- * @{
- */
+/**
+   @name Basic API
+   @{
+*/
 
 
 /**
 /**
-    Request the partitioning of \p initial_handle into several subdata
-    according to the filter \p f.
-    Here an example of how to use the function.
-    \code{.c}
-    struct starpu_data_filter f =
-    {
-      .filter_func = starpu_matrix_filter_block,
-      .nchildren = nslicesx
-    };
-    starpu_data_partition(A_handle, &f);
+   Request the partitioning of \p initial_handle into several subdata
+   according to the filter \p f.
+
+   Here is an example of how to use the function.
+   \code{.c}
+   struct starpu_data_filter f =
+   {
+     .filter_func = starpu_matrix_filter_block,
+     .nchildren = nslicesx
+   };
+   starpu_data_partition(A_handle, &f);
     \endcode
     \endcode
 */
 */
 void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_data_filter *f);
 void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_data_filter *f);
 
 
 /**
 /**
-   Unapply the filter which has been applied to \p root_data, thus
-   unpartitioning the data. The pieces of data are collected back into
-   one big piece in the \p gathering_node (usually ::STARPU_MAIN_RAM).
-   Tasks working on the partitioned data will be waited for
-   by starpu_data_unpartition().
+  Unapply the filter which has been applied to \p root_data, thus
+  unpartitioning the data. The pieces of data are collected back into
+  one big piece in the \p gathering_node (usually ::STARPU_MAIN_RAM).
+  Tasks working on the partitioned data will be waited for
+  by starpu_data_unpartition().
 
 
-   Here an example of how to use the function.
-   \code{.c}
-   starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
-   \endcode
+  Here is an example of how to use the function.
+  \code{.c}
+  starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
+  \endcode
 */
 */
 void starpu_data_unpartition(starpu_data_handle_t root_data, unsigned gathering_node);
 void starpu_data_unpartition(starpu_data_handle_t root_data, unsigned gathering_node);
 
 
@@ -182,10 +185,10 @@ void starpu_data_vmap_filters(starpu_data_handle_t root_data, unsigned nfilters,
 
 
 /** @} */
 /** @} */
 
 
-/** @name Asynchronous API
- *
- * @{
- */
+/**
+   @name Asynchronous API
+   @{
+*/
 
 
 /**
 /**
    Plan to partition \p initial_handle into several subdata according to
    Plan to partition \p initial_handle into several subdata according to
@@ -303,44 +306,47 @@ void starpu_data_partition_not_automatic(starpu_data_handle_t handle);
 
 
 /** @} */
 /** @} */
 
 
-/** @name Predefined BCSR Filter Functions
- * Predefined partitioning functions for BCSR data. Examples on how to
- * use them are shown in \ref PartitioningData.
- * @{
- */
+/**
+   @name Predefined BCSR Filter Functions
+   Predefined partitioning functions for BCSR data. Examples on how to
+   use them are shown in \ref PartitioningData.
+   @{
+*/
 
 
 /**
 /**
    Partition a block-sparse matrix into dense matrices.
    Partition a block-sparse matrix into dense matrices.
- */
+*/
 void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 
 /** @} */
 /** @} */
 
 
-/** @name Predefined CSR Filter Functions
- * Predefined partitioning functions for CSR data. Examples on how to
- * use them are shown in \ref PartitioningData.
- * @{
- */
+/**
+   @name Predefined CSR Filter Functions
+   Predefined partitioning functions for CSR data. Examples on how to
+   use them are shown in \ref PartitioningData.
+   @{
+*/
 
 
 /**
 /**
    Partition a block-sparse matrix into vertical block-sparse matrices.
    Partition a block-sparse matrix into vertical block-sparse matrices.
- */
+*/
 void starpu_csr_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_csr_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 
 /** @} */
 /** @} */
 
 
-/** @name Predefined Matrix Filter Functions
- * Predefined partitioning functions for matrix
- * data. Examples on how to use them are shown in \ref
- * PartitioningData.
- * @{
- */
+/**
+   @name Predefined Matrix Filter Functions
+   Predefined partitioning functions for matrix
+   data. Examples on how to use them are shown in \ref
+   PartitioningData.
+   @{
+*/
 
 
 /**
 /**
    Partition a dense Matrix along the x dimension, thus getting (x/\p
    Partition a dense Matrix along the x dimension, thus getting (x/\p
    nparts ,y) matrices. If \p nparts does not divide x, the last
    nparts ,y) matrices. If \p nparts does not divide x, the last
    submatrix contains the remainder.
    submatrix contains the remainder.
- */
+*/
 void starpu_matrix_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_matrix_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 
 /**
 /**
@@ -353,14 +359,14 @@ void starpu_matrix_filter_block(void *father_interface, void *child_interface, s
    only be used for read-only access, as no coherency is enforced for the
    only be used for read-only access, as no coherency is enforced for the
    shadowed parts. A usage example is available in
    shadowed parts. A usage example is available in
    examples/filters/shadow2d.c
    examples/filters/shadow2d.c
- */
+*/
 void starpu_matrix_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_matrix_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 
 /**
 /**
    Partition a dense Matrix along the y dimension, thus getting
    Partition a dense Matrix along the y dimension, thus getting
    (x,y/\p nparts) matrices. If \p nparts does not divide y, the last
    (x,y/\p nparts) matrices. If \p nparts does not divide y, the last
    submatrix contains the remainder.
    submatrix contains the remainder.
- */
+*/
 void starpu_matrix_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_matrix_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 
 /**
 /**
@@ -377,18 +383,19 @@ void starpu_matrix_filter_vertical_block_shadow(void *father_interface, void *ch
 
 
 /** @} */
 /** @} */
 
 
-/** @name Predefined Vector Filter Functions
- * Predefined partitioning functions for vector
- * data. Examples on how to use them are shown in \ref
- * PartitioningData.
- * @{
- */
+/**
+   @name Predefined Vector Filter Functions
+   Predefined partitioning functions for vector
+   data. Examples on how to use them are shown in \ref
+   PartitioningData.
+   @{
+*/
 
 
 /**
 /**
    Return in \p child_interface the \p id th element of the vector
    Return in \p child_interface the \p id th element of the vector
    represented by \p father_interface once partitioned in \p nparts chunks of
    represented by \p father_interface once partitioned in \p nparts chunks of
    equal size.
    equal size.
- */
+*/
 void starpu_vector_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_vector_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 
 /**
 /**
@@ -411,39 +418,40 @@ void starpu_vector_filter_block_shadow(void *father_interface, void *child_inter
    <c>filter_arg_ptr</c> field must point to an array of \p nparts long
    <c>filter_arg_ptr</c> field must point to an array of \p nparts long
    elements, each of which specifies the number of elements in each chunk
    elements, each of which specifies the number of elements in each chunk
    of the partition.
    of the partition.
- */
+*/
 void starpu_vector_filter_list_long(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_vector_filter_list_long(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 
 /**
 /**
-   Return in \p child_interface the \p id th element of the vector
-   represented by \p father_interface once partitioned into \p nparts chunks
-   according to the <c>filter_arg_ptr</c> field of \p f. The
-   <c>filter_arg_ptr</c> field must point to an array of \p nparts uint32_t
-   elements, each of which specifies the number of elements in each chunk
-   of the partition.
- */
+  Return in \p child_interface the \p id th element of the vector
+  represented by \p father_interface once partitioned into \p nparts chunks
+  according to the <c>filter_arg_ptr</c> field of \p f. The
+  <c>filter_arg_ptr</c> field must point to an array of \p nparts uint32_t
+  elements, each of which specifies the number of elements in each chunk
+  of the partition.
+*/
 void starpu_vector_filter_list(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_vector_filter_list(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 
 /**
 /**
    Return in \p child_interface the \p id th element of the vector
    Return in \p child_interface the \p id th element of the vector
    represented by \p father_interface once partitioned in <c>2</c> chunks of
    represented by \p father_interface once partitioned in <c>2</c> chunks of
    equal size, ignoring nparts. Thus, \p id must be <c>0</c> or <c>1</c>.
    equal size, ignoring nparts. Thus, \p id must be <c>0</c> or <c>1</c>.
- */
+*/
 void starpu_vector_filter_divide_in_2(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_vector_filter_divide_in_2(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 
 /** @} */
 /** @} */
 
 
-/** @name Predefined Block Filter Functions
- * Predefined partitioning functions for block data. Examples on how
- * to use them are shown in \ref PartitioningData. An example is
- * available in \c examples/filters/shadow3d.c
- * @{
- */
+/**
+   @name Predefined Block Filter Functions
+   Predefined partitioning functions for block data. Examples on how
+   to use them are shown in \ref PartitioningData. An example is
+   available in \c examples/filters/shadow3d.c
+   @{
+*/
 
 
 /**
 /**
-   Partition a block along the X dimension, thus getting
-   (x/\p nparts ,y,z) 3D matrices. If \p nparts does not divide x, the last
-   submatrix contains the remainder.
+  Partition a block along the X dimension, thus getting
+  (x/\p nparts ,y,z) 3D matrices. If \p nparts does not divide x, the last
+  submatrix contains the remainder.
  */
  */
 void starpu_block_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_block_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 
@@ -475,14 +483,14 @@ void starpu_block_filter_vertical_block(void *father_interface, void *child_inte
    <b>IMPORTANT</b>:
    <b>IMPORTANT</b>:
    This can only be used for read-only access, as no coherency is
    This can only be used for read-only access, as no coherency is
    enforced for the shadowed parts.
    enforced for the shadowed parts.
- */
+*/
 void starpu_block_filter_vertical_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_block_filter_vertical_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 
 /**
 /**
    Partition a block along the Z dimension, thus getting
    Partition a block along the Z dimension, thus getting
    (x,y,z/\p nparts) blocks. If \p nparts does not divide z, the last
    (x,y,z/\p nparts) blocks. If \p nparts does not divide z, the last
    submatrix contains the remainder.
    submatrix contains the remainder.
- */
+*/
 void starpu_block_filter_depth_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_block_filter_depth_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 
 /**
 /**
@@ -494,7 +502,7 @@ void starpu_block_filter_depth_block(void *father_interface, void *child_interfa
    <b>IMPORTANT</b>:
    <b>IMPORTANT</b>:
    This can only be used for read-only access, as no coherency is
    This can only be used for read-only access, as no coherency is
    enforced for the shadowed parts.
    enforced for the shadowed parts.
- */
+*/
 void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 
 /** @} */
 /** @} */

+ 96 - 79
include/starpu_data_interfaces.h

@@ -19,53 +19,53 @@
 #ifndef __STARPU_DATA_INTERFACES_H__
 #ifndef __STARPU_DATA_INTERFACES_H__
 #define __STARPU_DATA_INTERFACES_H__
 #define __STARPU_DATA_INTERFACES_H__
 
 
-/** @defgroup API_Data_Interfaces Data Interfaces
-
-    @brief Data management is done at a high-level in StarPU: rather than
-    accessing a mere list of contiguous buffers, the tasks may manipulate
-    data that are described by a high-level construct which we call data
-    interface.
-
-    An example of data interface is the "vector" interface which describes
-    a contiguous data array on a spefic memory node. This interface is a
-    simple structure containing the number of elements in the array, the
-    size of the elements, and the address of the array in the appropriate
-    address space (this address may be invalid if there is no valid copy
-    of the array in the memory node). More informations on the data
-    interfaces provided by StarPU are given in \ref API_Data_Interfaces.
-
-    When a piece of data managed by StarPU is used by a task, the task
-    implementation is given a pointer to an interface describing a valid
-    copy of the data that is accessible from the current processing unit.
-
-    Every worker is associated to a memory node which is a logical
-    abstraction of the address space from which the processing unit gets
-    its data. For instance, the memory node associated to the different
-    CPU workers represents main memory (RAM), the memory node associated
-    to a GPU is DRAM embedded on the device. Every memory node is
-    identified by a logical index which is accessible from the
-    function starpu_worker_get_memory_node(). When registering a piece of
-    data to StarPU, the specified memory node indicates where the piece of
-    data initially resides (we also call this memory node the home node of
-    a piece of data).
-
-    In the case of NUMA systems, functions starpu_memory_nodes_numa_devid_to_id()
-    and starpu_memory_nodes_numa_id_to_devid() can be used to convert from NUMA node
-    numbers as seen by the Operating System and NUMA node numbers as seen by StarPU.
-
-    There are several ways to register a memory region so that it can be
-    managed by StarPU. StarPU provides data interfaces for vectors, 2D
-    matrices, 3D matrices as well as BCSR and CSR sparse matrices.
-
-    Each data interface is provided with a set of field access functions.
-    The ones using a <c>void *</c> parameter aimed to be used in codelet
-    implementations (see for example the code in
-    \ref VectorScalingUsingStarPUAPI).
-
-    Applications can provide their own interface as shown in \ref DefiningANewDataInterface.
-
-    @{
- */
+/**
+   @defgroup API_Data_Interfaces Data Interfaces
+   @brief Data management is done at a high-level in StarPU: rather than
+   accessing a mere list of contiguous buffers, the tasks may manipulate
+   data that are described by a high-level construct which we call data
+   interface.
+
+   An example of data interface is the "vector" interface which describes
+   a contiguous data array on a specific memory node. This interface is a
+   simple structure containing the number of elements in the array, the
+   size of the elements, and the address of the array in the appropriate
+   address space (this address may be invalid if there is no valid copy
+   of the array in the memory node). More information on the data
+   interfaces provided by StarPU are given in \ref API_Data_Interfaces.
+
+   When a piece of data managed by StarPU is used by a task, the task
+   implementation is given a pointer to an interface describing a valid
+   copy of the data that is accessible from the current processing unit.
+
+   Every worker is associated to a memory node which is a logical
+   abstraction of the address space from which the processing unit gets
+   its data. For instance, the memory node associated to the different
+   CPU workers represents main memory (RAM), the memory node associated
+   to a GPU is DRAM embedded on the device. Every memory node is
+   identified by a logical index which is accessible from the
+   function starpu_worker_get_memory_node(). When registering a piece of
+   data to StarPU, the specified memory node indicates where the piece of
+   data initially resides (we also call this memory node the home node of
+   a piece of data).
+
+   In the case of NUMA systems, functions starpu_memory_nodes_numa_devid_to_id()
+   and starpu_memory_nodes_numa_id_to_devid() can be used to convert between NUMA node
+   numbers as seen by the Operating System and NUMA node numbers as seen by StarPU.
+
+   There are several ways to register a memory region so that it can be
+   managed by StarPU. StarPU provides data interfaces for vectors, 2D
+   matrices, 3D matrices as well as BCSR and CSR sparse matrices.
+
+   Each data interface is provided with a set of field access functions.
+   The ones using a <c>void *</c> parameter are meant to be used in codelet
+   implementations (see for example the code in
+   \ref VectorScalingUsingStarPUAPI).
+
+   Applications can provide their own interface as shown in \ref DefiningANewDataInterface.
+
+   @{
+*/
 
 
 #include <starpu.h>
 #include <starpu.h>
 
 
@@ -538,9 +538,11 @@ struct starpu_data_interface_ops
 	char *name;
 	char *name;
 };
 };
 
 
-/** @name Basic API
-    @{
-    */
+/**
+   @name Basic API
+   @{
+*/
+
 /**
 /**
    Register a piece of data into the handle located at the
    Register a piece of data into the handle located at the
    \p handleptr address. The \p data_interface buffer contains the initial
    \p handleptr address. The \p data_interface buffer contains the initial
@@ -706,13 +708,16 @@ void starpu_malloc_on_node_set_default_flags(unsigned node, int flags);
 
 
 /** @} */
 /** @} */
 
 
-/** @name Accessing Matrix Data Interfaces
-    @{
- */
+/**
+   @name Accessing Matrix Data Interfaces
+   @{
+*/
 
 
 extern struct starpu_data_interface_ops starpu_interface_matrix_ops;
 extern struct starpu_data_interface_ops starpu_interface_matrix_ops;
 
 
-/** Matrix interface for dense matrices */
+/**
+   Matrix interface for dense matrices
+*/
 struct starpu_matrix_interface
 struct starpu_matrix_interface
 {
 {
 	enum starpu_data_interface_id id; /**< Identifier of the interface */
 	enum starpu_data_interface_id id; /**< Identifier of the interface */
@@ -875,13 +880,16 @@ size_t starpu_matrix_get_allocsize(starpu_data_handle_t handle);
 
 
 /** @} */
 /** @} */
 
 
-/** @name Accessing COO Data Interfaces
-    @{
- */
+/**
+   @name Accessing COO Data Interfaces
+   @{
+*/
 
 
 extern struct starpu_data_interface_ops starpu_interface_coo_ops;
 extern struct starpu_data_interface_ops starpu_interface_coo_ops;
 
 
-/** COO Matrices */
+/**
+   COO Matrices
+*/
 struct starpu_coo_interface
 struct starpu_coo_interface
 {
 {
 	enum starpu_data_interface_id id; /**< identifier of the interface */
 	enum starpu_data_interface_id id; /**< identifier of the interface */
@@ -964,15 +972,18 @@ void starpu_coo_data_register(starpu_data_handle_t *handleptr, int home_node, ui
 
 
 /** @} */
 /** @} */
 
 
-/** @name Block Data Interface
-    @{
- */
+/**
+   @name Block Data Interface
+   @{
+*/
 
 
 extern struct starpu_data_interface_ops starpu_interface_block_ops;
 extern struct starpu_data_interface_ops starpu_interface_block_ops;
 
 
 /* TODO: rename to 3dmatrix? */
 /* TODO: rename to 3dmatrix? */
 /* TODO: add allocsize support */
 /* TODO: add allocsize support */
-/** Block interface for 3D dense blocks */
+/**
+   Block interface for 3D dense blocks
+*/
 struct starpu_block_interface
 struct starpu_block_interface
 {
 {
 	enum starpu_data_interface_id id; /**< identifier of the interface */
 	enum starpu_data_interface_id id; /**< identifier of the interface */
@@ -1115,9 +1126,10 @@ designated by \p interface.
 
 
 /** @} */
 /** @} */
 
 
-/** @name Vector Data Interface
-    @{
- */
+/**
+   @name Vector Data Interface
+   @{
+*/
 
 
 extern struct starpu_data_interface_ops starpu_interface_vector_ops;
 extern struct starpu_data_interface_ops starpu_interface_vector_ops;
 
 
@@ -1241,9 +1253,10 @@ uintptr_t starpu_vector_get_local_ptr(starpu_data_handle_t handle);
 
 
 /** @} */
 /** @} */
 
 
-/** @name Variable Data Interface
-    @{
- */
+/**
+   @name Variable Data Interface
+   @{
+*/
 
 
 extern struct starpu_data_interface_ops starpu_interface_variable_ops;
 extern struct starpu_data_interface_ops starpu_interface_variable_ops;
 
 
@@ -1322,9 +1335,10 @@ uintptr_t starpu_variable_get_local_ptr(starpu_data_handle_t handle);
 
 
 /** @} */
 /** @} */
 
 
-/** @name Void Data Interface
-    @{
- */
+/**
+   @name Void Data Interface
+   @{
+*/
 
 
 extern struct starpu_data_interface_ops starpu_interface_void_ops;
 extern struct starpu_data_interface_ops starpu_interface_void_ops;
 
 
@@ -1340,8 +1354,9 @@ void starpu_void_data_register(starpu_data_handle_t *handle);
 
 
 /** @} */
 /** @} */
 
 
-/** @name CSR Data Interface
-    @{
+/**
+   @name CSR Data Interface
+   @{
  */
  */
 
 
 extern struct starpu_data_interface_ops starpu_interface_csr_ops;
 extern struct starpu_data_interface_ops starpu_interface_csr_ops;
@@ -1473,9 +1488,10 @@ size_t starpu_csr_get_elemsize(starpu_data_handle_t handle);
 
 
 /** @} */
 /** @} */
 
 
-/** @name BCSR Data Interface
-    @{
- */
+/**
+   @name BCSR Data Interface
+   @{
+*/
 
 
 extern struct starpu_data_interface_ops starpu_interface_bcsr_ops;
 extern struct starpu_data_interface_ops starpu_interface_bcsr_ops;
 
 
@@ -1677,9 +1693,10 @@ size_t starpu_bcsr_get_elemsize(starpu_data_handle_t handle);
 
 
 /** @} */
 /** @} */
 
 
-/** @name Multiformat Data Interface
-    @{
- */
+/**
+   @name Multiformat Data Interface
+   @{
+*/
 
 
 /**
 /**
    Multiformat operations
    Multiformat operations

+ 7 - 4
include/starpu_disk.h

@@ -20,14 +20,17 @@
 #ifndef __STARPU_DISK_H__
 #ifndef __STARPU_DISK_H__
 #define __STARPU_DISK_H__
 #define __STARPU_DISK_H__
 
 
-/** @defgroup API_Out_Of_Core Out Of Core
-    @{
- */
+/**
+   @defgroup API_Out_Of_Core Out Of Core
+   @{
+*/
 
 
 #include <sys/types.h>
 #include <sys/types.h>
 #include <starpu_config.h>
 #include <starpu_config.h>
 
 
-/** Set of functions to manipulate datas on disk. */
+/**
+   Set of functions to manipulate data on disk.
+*/
 struct starpu_disk_ops
 struct starpu_disk_ops
 {
 {
 	/**
 	/**

+ 7 - 5
include/starpu_driver.h

@@ -18,10 +18,10 @@
 #ifndef __STARPU_DRIVER_H__
 #ifndef __STARPU_DRIVER_H__
 #define __STARPU_DRIVER_H__
 #define __STARPU_DRIVER_H__
 
 
-/** @defgroup API_Running_Drivers Running Drivers
- *
- * @{
- */
+/**
+   @defgroup API_Running_Drivers Running Drivers
+   @{
+*/
 
 
 #include <starpu_config.h>
 #include <starpu_config.h>
 #if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
 #if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
@@ -33,7 +33,9 @@ extern "C"
 {
 {
 #endif
 #endif
 
 
-/** structure for a driver */
+/**
+   structure for a driver
+*/
 struct starpu_driver
 struct starpu_driver
 {
 {
 	/**
 	/**

+ 4 - 4
include/starpu_expert.h

@@ -18,10 +18,10 @@
 #ifndef __STARPU_EXPERT_H__
 #ifndef __STARPU_EXPERT_H__
 #define __STARPU_EXPERT_H__
 #define __STARPU_EXPERT_H__
 
 
-/** @defgroup API_Expert_Mode Expert Mode
- *
- * @{
- */
+/**
+   @defgroup API_Expert_Mode Expert Mode
+   @{
+*/
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
 extern "C"
 extern "C"

+ 4 - 4
include/starpu_fxt.h

@@ -21,10 +21,10 @@
 #ifndef __STARPU_FXT_H__
 #ifndef __STARPU_FXT_H__
 #define __STARPU_FXT_H__
 #define __STARPU_FXT_H__
 
 
-/** @defgroup API_FxT_Support FxT Support
- *
- * @{
- */
+/**
+   @defgroup API_FxT_Support FxT Support
+   @{
+*/
 
 
 #include <starpu_perfmodel.h>
 #include <starpu_perfmodel.h>
 
 

+ 4 - 4
include/starpu_hash.h

@@ -19,10 +19,10 @@
 #ifndef __STARPU_HASH_H__
 #ifndef __STARPU_HASH_H__
 #define __STARPU_HASH_H__
 #define __STARPU_HASH_H__
 
 
-/** @ingroup API_Data_Interfaces
- *
- * @{
- */
+/**
+   @ingroup API_Data_Interfaces
+   @{
+*/
 
 
 #include <stdint.h>
 #include <stdint.h>
 #include <stddef.h>
 #include <stddef.h>

+ 4 - 4
include/starpu_mic.h

@@ -19,10 +19,10 @@
 #ifndef __STARPU_MIC_H__
 #ifndef __STARPU_MIC_H__
 #define __STARPU_MIC_H__
 #define __STARPU_MIC_H__
 
 
-/** @defgroup API_MIC_Extensions MIC Extensions
- *
- * @{
- */
+/**
+   @defgroup API_MIC_Extensions MIC Extensions
+   @{
+*/
 
 
 #include <starpu_config.h>
 #include <starpu_config.h>
 
 

+ 4 - 4
include/starpu_mpi_ms.h

@@ -18,10 +18,10 @@
 #ifndef __STARPU_MPI_MS_H__
 #ifndef __STARPU_MPI_MS_H__
 #define __STARPU_MPI_MS_H__
 #define __STARPU_MPI_MS_H__
 
 
-/** @defgroup API_Master_Slave Master Slave Extension
- *
- * @{
- */
+/**
+   @defgroup API_Master_Slave Master Slave Extension
+   @{
+*/
 
 
 #include <starpu_config.h>
 #include <starpu_config.h>
 
 

+ 32 - 27
include/starpu_opencl.h

@@ -19,10 +19,10 @@
 #ifndef __STARPU_OPENCL_H__
 #ifndef __STARPU_OPENCL_H__
 #define __STARPU_OPENCL_H__
 #define __STARPU_OPENCL_H__
 
 
-/** @defgroup API_OpenCL_Extensions OpenCL Extensions
- *
- * @{
- */
+/**
+   @defgroup API_OpenCL_Extensions OpenCL Extensions
+   @{
+*/
 
 
 #include <starpu_config.h>
 #include <starpu_config.h>
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
@@ -51,9 +51,10 @@ struct starpu_opencl_program
 	cl_program programs[STARPU_MAXOPENCLDEVS];
 	cl_program programs[STARPU_MAXOPENCLDEVS];
 };
 };
 
 
-/** @name Writing OpenCL kernels
-    @{
- */
+/**
+   @name Writing OpenCL kernels
+   @{
+*/
 
 
 /**
 /**
    Return the OpenCL context of the device designated by \p devid
    Return the OpenCL context of the device designated by \p devid
@@ -105,17 +106,18 @@ int starpu_opencl_set_kernel_args(cl_int *err, cl_kernel *kernel, ...);
 
 
 /** @} */
 /** @} */
 
 
-/** @name Compiling OpenCL kernels
-    Source codes for OpenCL kernels can be stored in a file or in a
-    string. StarPU provides functions to build the program executable for
-    each available OpenCL device as a cl_program object. This program
-    executable can then be loaded within a specific queue as explained in
-    the next section. These are only helpers, Applications can also fill a
-    starpu_opencl_program array by hand for more advanced use (e.g.
-    different programs on the different OpenCL devices, for relocation
-    purpose for instance).
-    @{
- */
+/**
+   @name Compiling OpenCL kernels
+   Source codes for OpenCL kernels can be stored in a file or in a
+   string. StarPU provides functions to build the program executable for
+   each available OpenCL device as a cl_program object. This program
+   executable can then be loaded within a specific queue as explained in
+   the next section. These are only helpers, Applications can also fill a
+   starpu_opencl_program array by hand for more advanced use (e.g.
+   different programs on the different OpenCL devices, for relocation
+   purpose for instance).
+   @{
+*/
 
 
 /**
 /**
    Store the contents of the file \p source_file_name in the buffer
    Store the contents of the file \p source_file_name in the buffer
@@ -182,9 +184,10 @@ int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs);
 
 
 /** @} */
 /** @} */
 
 
-/** @name Loading OpenCL kernels
-    @{
- */
+/**
+   @name Loading OpenCL kernels
+   @{
+*/
 
 
 /**
 /**
    Create a kernel \p kernel for device \p devid, on its computation
    Create a kernel \p kernel for device \p devid, on its computation
@@ -200,9 +203,10 @@ int starpu_opencl_release_kernel(cl_kernel kernel);
 
 
 /** @} */
 /** @} */
 
 
-/** @name OpenCL Statistics
-    @{
- */
+/**
+   @name OpenCL Statistics
+   @{
+*/
 
 
 /**
 /**
    Collect statistics on a kernel execution.
    Collect statistics on a kernel execution.
@@ -215,9 +219,10 @@ int starpu_opencl_collect_stats(cl_event event);
 
 
 /** @} */
 /** @} */
 
 
-/** @name OpenCL Utilities
-    @{
- */
+/**
+   @name OpenCL Utilities
+   @{
+*/
 
 
 /**
 /**
    Return the error message in English corresponding to \p status, an OpenCL
    Return the error message in English corresponding to \p status, an OpenCL

+ 28 - 21
include/starpu_openmp.h

@@ -18,10 +18,11 @@
 #ifndef __STARPU_OPENMP_H__
 #ifndef __STARPU_OPENMP_H__
 #define __STARPU_OPENMP_H__
 #define __STARPU_OPENMP_H__
 
 
-/** @defgroup API_OpenMP_Runtime_Support OpenMP Runtime Support
-    @brief This section describes the interface provided for implementing OpenMP runtimes on top of StarPU.
-    @{
- */
+/**
+   @defgroup API_OpenMP_Runtime_Support OpenMP Runtime Support
+   @brief This section describes the interface provided for implementing OpenMP runtimes on top of StarPU.
+   @{
+*/
 
 
 #include <starpu_config.h>
 #include <starpu_config.h>
 
 
@@ -213,8 +214,9 @@ extern "C"
 #define __STARPU_OMP_NOTHROW __attribute__((__nothrow__))
 #define __STARPU_OMP_NOTHROW __attribute__((__nothrow__))
 #endif
 #endif
 
 
-/** @name Initialisation
-    @{
+/**
+   @name Initialisation
+   @{
 */
 */
 
 
 /**
 /**
@@ -228,9 +230,10 @@ extern void starpu_omp_shutdown(void) __STARPU_OMP_NOTHROW;
 
 
 /** @} */
 /** @} */
 
 
-/** @name Parallel
-    \anchor ORS_Parallel
-    @{
+/**
+   @name Parallel
+   \anchor ORS_Parallel
+   @{
 */
 */
 
 
 /**
 /**
@@ -266,9 +269,10 @@ extern int starpu_omp_master_inline(void) __STARPU_OMP_NOTHROW;
 
 
 /** @} */
 /** @} */
 
 
-/** @name Synchronization
-    \anchor ORS_Synchronization
-    @{
+/**
+   @name Synchronization
+   \anchor ORS_Synchronization
+   @{
 */
 */
 
 
 /**
 /**
@@ -317,9 +321,10 @@ extern void starpu_omp_critical_inline_end(const char *name) __STARPU_OMP_NOTHRO
 
 
 /** @} */
 /** @} */
 
 
-/** @name Worksharing
-    \anchor ORS_Worksharing
-    @{
+/**
+   @name Worksharing
+   \anchor ORS_Worksharing
+   @{
 */
 */
 
 
 /**
 /**
@@ -542,9 +547,10 @@ extern void starpu_omp_sections_combined(unsigned long long nb_sections, void (*
 
 
 /** @} */
 /** @} */
 
 
-/** @name Task
-    \anchor ORS_Task
-    @{
+/**
+   @name Task
+   \anchor ORS_Task
+   @{
 */
 */
 
 
 /**
 /**
@@ -604,9 +610,10 @@ extern void starpu_omp_taskloop_inline_end(const struct starpu_omp_task_region_a
 
 
 /** @} */
 /** @} */
 
 
-/** @name API
-    \anchor ORS_API
-    @{
+/**
+   @name API
+   \anchor ORS_API
+   @{
 */
 */
 
 
 /**
 /**

+ 264 - 46
include/starpu_perfmodel.h

@@ -21,10 +21,10 @@
 #ifndef __STARPU_PERFMODEL_H__
 #ifndef __STARPU_PERFMODEL_H__
 #define __STARPU_PERFMODEL_H__
 #define __STARPU_PERFMODEL_H__
 
 
-/** @defgroup
- *
- * @{
- */
+/**
+   @defgroup API_Performance_Model Performance Model
+   @{
+*/
 
 
 #include <starpu.h>
 #include <starpu.h>
 #include <stdio.h>
 #include <stdio.h>
@@ -39,31 +39,37 @@ struct starpu_data_descr;
 
 
 #define STARPU_NARCH STARPU_ANY_WORKER
 #define STARPU_NARCH STARPU_ANY_WORKER
 
 
+/**
+   todo
+*/
 struct starpu_perfmodel_device
 struct starpu_perfmodel_device
 {
 {
-	enum starpu_worker_archtype type;
-	int devid;
-	int ncores;
+	enum starpu_worker_archtype type; /**< type of the device */
+	int devid;                        /**< identifier of the precise device */
+	int ncores;                       /**< number of executions in parallel, minus 1 */
 };
 };
 
 
+/**
+   todo
+*/
 struct starpu_perfmodel_arch
 struct starpu_perfmodel_arch
 {
 {
-	int ndevices;
-	struct starpu_perfmodel_device *devices;
+	int ndevices;                            /**< number of the devices for the given arch */
+	struct starpu_perfmodel_device *devices; /**< list of the devices for the given arch */
 };
 };
 
 
 
 
 struct starpu_perfmodel_history_entry
 struct starpu_perfmodel_history_entry
 {
 {
-	double mean;
-	double deviation;
-	double sum;
-	double sum2;
-	unsigned nsample;
+	double mean;        /**< mean_n = 1/n sum */
+	double deviation;   /**< n dev_n = sum2 - 1/n (sum)^2 */
+	double sum;         /**< sum of samples (in µs) */
+	double sum2;        /**< sum of samples^2 */
+	unsigned nsample;   /**< number of samples */
 	unsigned nerror;
 	unsigned nerror;
-	uint32_t footprint;
-	size_t size;
-	double flops;
+	uint32_t footprint; /**< data footprint */
+	size_t size;        /**< in bytes */
+	double flops;       /**< Provided by the application */
 
 
 	double duration;
 	double duration;
 	starpu_tag_t tag;
 	starpu_tag_t tag;
@@ -76,30 +82,35 @@ struct starpu_perfmodel_history_list
 	struct starpu_perfmodel_history_entry *entry;
 	struct starpu_perfmodel_history_entry *entry;
 };
 };
 
 
+/**
+   todo
+*/
 struct starpu_perfmodel_regression_model
 struct starpu_perfmodel_regression_model
 {
 {
-	double sumlny;
+	double sumlny;          /**< sum of ln(measured) */
 
 
-	double sumlnx;
-	double sumlnx2;
+	double sumlnx;          /**< sum of ln(size) */
+	double sumlnx2;         /**< sum of ln(size)^2 */
 
 
-	unsigned long minx;
-	unsigned long maxx;
+	unsigned long minx;     /**< minimum size */
+	unsigned long maxx;     /**< maximum size */
 
 
-	double sumlnxlny;
+	double sumlnxlny;       /**< sum of ln(size)*ln(measured) */
 
 
-	double alpha;
-	double beta;
-	unsigned valid;
+	double alpha;           /**< estimated = alpha * size ^ beta */
+	double beta;            /**< estimated = alpha * size ^ beta */
+	unsigned valid;         /**< whether the linear regression model is valid (i.e. enough measures) */
 
 
-	double a, b, c;
-	unsigned nl_valid;
+	double a;               /**< estimated = a size ^b + c */
+	double b;               /**< estimated = a size ^b + c */
+	double c;               /**< estimated = a size ^b + c */
+	unsigned nl_valid;      /**< whether the non-linear regression model is valid (i.e. enough measures) */
 
 
-	unsigned nsample;
+	unsigned nsample;       /**< number of sample values for non-linear regression */
 
 
-	double *coeff;
-	unsigned ncoeff;
-	unsigned multi_valid;
+	double *coeff;          /**< list of computed coefficients for multiple linear regression model */
+	unsigned ncoeff;        /**< number of coefficients for multiple linear regression model */
+	unsigned multi_valid;   /**< whether the multiple linear regression model is valid */
 };
 };
 
 
 struct starpu_perfmodel_history_table;
 struct starpu_perfmodel_history_table;
@@ -109,66 +120,224 @@ struct starpu_perfmodel_history_table;
 typedef double (*starpu_perfmodel_per_arch_cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 typedef double (*starpu_perfmodel_per_arch_cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 typedef size_t (*starpu_perfmodel_per_arch_size_base)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 typedef size_t (*starpu_perfmodel_per_arch_size_base)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 
 
+/**
+   information about the performance model of a given arch.
+*/
 struct starpu_perfmodel_per_arch
 struct starpu_perfmodel_per_arch
 {
 {
+	/**
+	   Used by ::STARPU_PER_ARCH, must point to functions which take a
+	   task, the target arch and implementation number (as a mere
+	   convenience, since the array is already indexed by these), and
+	   must return a task duration estimation in micro-seconds.
+	*/
 	starpu_perfmodel_per_arch_cost_function cost_function;
 	starpu_perfmodel_per_arch_cost_function cost_function;
+	/**
+	   Same as in structure starpu_perfmodel, but per-arch, in case it
+	   depends on the architecture-specific implementation.
+	*/
 	starpu_perfmodel_per_arch_size_base size_base;
 	starpu_perfmodel_per_arch_size_base size_base;
 
 
+	/**
+	   \private
+	   The history of performance measurements.
+	*/
 	struct starpu_perfmodel_history_table *history;
 	struct starpu_perfmodel_history_table *history;
+	/**
+	   \private
+	   Used by ::STARPU_HISTORY_BASED, ::STARPU_NL_REGRESSION_BASED and
+	   ::STARPU_MULTIPLE_REGRESSION_BASED, records all execution history
+	   measures.
+	*/
 	struct starpu_perfmodel_history_list *list;
 	struct starpu_perfmodel_history_list *list;
+	/**
+	   \private
+	   Used by ::STARPU_REGRESSION_BASED, ::STARPU_NL_REGRESSION_BASED
+	   and ::STARPU_MULTIPLE_REGRESSION_BASED, contains the estimated
+	   factors of the regression.
+	*/
 	struct starpu_perfmodel_regression_model regression;
 	struct starpu_perfmodel_regression_model regression;
 
 
 	char debug_path[256];
 	char debug_path[256];
 };
 };
 
 
+/**
+   todo
+*/
 enum starpu_perfmodel_type
 enum starpu_perfmodel_type
 {
 {
         STARPU_PERFMODEL_INVALID=0,
         STARPU_PERFMODEL_INVALID=0,
-	STARPU_PER_ARCH,
-	STARPU_COMMON,
-	STARPU_HISTORY_BASED,
-	STARPU_REGRESSION_BASED,
-	STARPU_NL_REGRESSION_BASED,
-	STARPU_MULTIPLE_REGRESSION_BASED
+	STARPU_PER_ARCH,                  /**< Application-provided per-arch cost model function */
+	STARPU_COMMON,                    /**< Application-provided common cost model function, with per-arch factor */
+	STARPU_HISTORY_BASED,             /**< Automatic history-based cost model */
+	STARPU_REGRESSION_BASED,          /**< Automatic linear regression-based cost model  (alpha * size ^ beta) */
+	STARPU_NL_REGRESSION_BASED,       /**< Automatic non-linear regression-based cost model (a * size ^ b + c) */
+	STARPU_MULTIPLE_REGRESSION_BASED  /**< Automatic multiple linear regression-based cost model. Application
+					     provides parameters, their combinations and exponents. */
 };
 };
 
 
 struct _starpu_perfmodel_state;
 struct _starpu_perfmodel_state;
 typedef struct _starpu_perfmodel_state* starpu_perfmodel_state_t;
 typedef struct _starpu_perfmodel_state* starpu_perfmodel_state_t;
 
 
+/**
+   Contain all information about a performance model. At least the
+   type and symbol fields have to be filled when defining a performance
+   model for a codelet. For compatibility, make sure to initialize the
+   whole structure to zero, either by using explicit memset, or by
+   letting the compiler implicitly do it in e.g. static storage case. If
+   not provided, other fields have to be zero.
+*/
 struct starpu_perfmodel
 struct starpu_perfmodel
 {
 {
+	/**
+	   type of performance model
+	   <ul>
+	   <li>
+	   ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED,
+	   ::STARPU_NL_REGRESSION_BASED: No other field needs to be
+	   provided, this is purely history-based.
+	   </li>
+	   <li>
+	   ::STARPU_MULTIPLE_REGRESSION_BASED: Need to provide fields
+	   starpu_perfmodel::nparameters (number of different parameters),
+	   starpu_perfmodel::ncombinations (number of parameters
+	   combinations-tuples) and table starpu_perfmodel::combinations
+	   which defines exponents of the equation. Function cl_perf_func
+	   also needs to define how to extract parameters from the task.
+	   </li>
+	   <li>
+	   ::STARPU_PER_ARCH: either field
+	   starpu_perfmodel::arch_cost_function has to be filled with a
+	   function that returns the cost in micro-seconds on the arch given
+	   as parameter, or field starpu_perfmodel::per_arch has to be filled
+	   with functions which return the cost in micro-seconds.
+	   </li>
+	   <li>
+	   ::STARPU_COMMON: field starpu_perfmodel::cost_function has to be
+	   filled with a function that returns the cost in micro-seconds on a
+	   CPU, timing on other archs will be determined by multiplying by an
+	   arch-specific factor.
+	   </li>
+	   </ul>
+	*/
 	enum starpu_perfmodel_type type;
 	enum starpu_perfmodel_type type;
 
 
+	/**
+	   Used by ::STARPU_COMMON. Take a task and implementation number,
+	   and must return a task duration estimation in micro-seconds.
+	*/
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
+	/**
+	   Used by ::STARPU_PER_ARCH. Take a task, an arch and implementation
+	   number, and must return a task duration estimation in
+	   micro-seconds on that arch.
+	*/
 	double (*arch_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch * arch, unsigned nimpl);
 	double (*arch_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch * arch, unsigned nimpl);
 
 
+	/**
+	   Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and
+	   ::STARPU_NL_REGRESSION_BASED. If not <c>NULL</c>, take a task and
+	   implementation number, and return the size to be used as index to
+	   distinguish histories and as a base for regressions.
+	*/
 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
+	/**
+	   Used by ::STARPU_HISTORY_BASED. If not <c>NULL</c>, take a task
+	   and return the footprint to be used as index to distinguish
+	   histories. The default is to use the starpu_task_data_footprint()
+	   function.
+	*/
 	uint32_t (*footprint)(struct starpu_task *);
 	uint32_t (*footprint)(struct starpu_task *);
 
 
+	/**
+	   symbol name for the performance model, which will be used as file
+	   name to store the model. It must be set otherwise the model will
+	   be ignored.
+	*/
 	const char *symbol;
 	const char *symbol;
 
 
+	/**
+	   \private
+	   Whether the performance model is already loaded from the disk.
+	*/
 	unsigned is_loaded;
 	unsigned is_loaded;
+	/**
+	   \private
+	*/
 	unsigned benchmarking;
 	unsigned benchmarking;
+	/**
+	   \private
+	*/
 	unsigned is_init;
 	unsigned is_init;
 
 
 	void (*parameters)(struct starpu_task * task, double *parameters);
 	void (*parameters)(struct starpu_task * task, double *parameters);
+	/**
+	   \private
+	   Names of parameters used for multiple linear regression models (M,
+	   N, K)
+	*/
 	const char **parameters_names;
 	const char **parameters_names;
+	/**
+	   \private
+	   Number of parameters used for multiple linear regression models
+	*/
 	unsigned nparameters;
 	unsigned nparameters;
+	/**
+	   \private
+	   Table of combinations of parameters (and the exponents) used for
+	   multiple linear regression models
+	*/
 	unsigned **combinations;
 	unsigned **combinations;
+	/**
+	   \private
+	   Number of combinations of parameters used for multiple linear
+	   regression models
+	*/
 	unsigned ncombinations;
 	unsigned ncombinations;
-
+	/**
+	   \private
+	*/
 	starpu_perfmodel_state_t state;
 	starpu_perfmodel_state_t state;
 };
 };
 
 
 void starpu_perfmodel_init(struct starpu_perfmodel *model);
 void starpu_perfmodel_init(struct starpu_perfmodel *model);
+
+/**
+   Load the performance model found in the file named \p filename. \p model has to be
+   completely zero, and will be filled with the information stored in the given file.
+*/
 int starpu_perfmodel_load_file(const char *filename, struct starpu_perfmodel *model);
 int starpu_perfmodel_load_file(const char *filename, struct starpu_perfmodel *model);
+
+/**
+   Load a given performance model. \p model has to be
+   completely zero, and will be filled with the information stored in
+   <c>$STARPU_HOME/.starpu</c>. The function is intended to be used by
+   external tools that want to read the performance model files.
+*/
+
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model);
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model);
+
+/**
+   Unload \p model which has been previously loaded
+   through the function starpu_perfmodel_load_symbol()
+*/
 int starpu_perfmodel_unload_model(struct starpu_perfmodel *model);
 int starpu_perfmodel_unload_model(struct starpu_perfmodel *model);
+
 void starpu_perfmodel_get_model_path(const char *symbol, char *path, size_t maxlen);
 void starpu_perfmodel_get_model_path(const char *symbol, char *path, size_t maxlen);
 
 
+/**
+   Free internal memory used for sampling directory
+   management. It should only be called by an application which is not
+   calling starpu_shutdown(), as starpu_shutdown() already calls it. See for
+   example <c>tools/starpu_perfmodel_display.c</c>.
+*/
 void starpu_perfmodel_free_sampling_directories(void);
 void starpu_perfmodel_free_sampling_directories(void);
 
 
+/**
+   Return the architecture type of the worker \p workerid.
+*/
 struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid, unsigned sched_ctx_id);
 struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid, unsigned sched_ctx_id);
+
 int starpu_perfmodel_get_narch_combs();
 int starpu_perfmodel_get_narch_combs();
 int starpu_perfmodel_arch_comb_add(int ndevices, struct starpu_perfmodel_device* devices);
 int starpu_perfmodel_arch_comb_add(int ndevices, struct starpu_perfmodel_device* devices);
 int starpu_perfmodel_arch_comb_get(int ndevices, struct starpu_perfmodel_device *devices);
 int starpu_perfmodel_arch_comb_get(int ndevices, struct starpu_perfmodel_device *devices);
@@ -180,39 +349,88 @@ struct starpu_perfmodel_per_arch *starpu_perfmodel_get_model_per_devices(struct
 int starpu_perfmodel_set_per_devices_cost_function(struct starpu_perfmodel *model, int impl, starpu_perfmodel_per_arch_cost_function func, ...);
 int starpu_perfmodel_set_per_devices_cost_function(struct starpu_perfmodel *model, int impl, starpu_perfmodel_per_arch_cost_function func, ...);
 int starpu_perfmodel_set_per_devices_size_base(struct starpu_perfmodel *model, int impl, starpu_perfmodel_per_arch_size_base func, ...);
 int starpu_perfmodel_set_per_devices_size_base(struct starpu_perfmodel *model, int impl, starpu_perfmodel_per_arch_size_base func, ...);
 
 
+/**
+   Return the path to the debugging information for the performance model.
+*/
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, char *path, size_t maxlen, unsigned nimpl);
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, char *path, size_t maxlen, unsigned nimpl);
+
 char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype);
 char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype);
+
+/**
+   Return the architecture name for \p arch
+*/
 void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch *arch, char *archname, size_t maxlen, unsigned nimpl);
 void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch *arch, char *archname, size_t maxlen, unsigned nimpl);
 
 
+/**
+   Return the estimated time of a task with the given model and the given footprint.
+*/
 double starpu_perfmodel_history_based_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, uint32_t footprint);
 double starpu_perfmodel_history_based_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, uint32_t footprint);
+
+/**
+   If starpu_init() is not used, starpu_perfmodel_initialize() should be called before calling starpu_perfmodel_* functions.
+*/
 void starpu_perfmodel_initialize(void);
 void starpu_perfmodel_initialize(void);
+
+/**
+   Print a list of all performance models on \p output
+*/
 int starpu_perfmodel_list(FILE *output);
 int starpu_perfmodel_list(FILE *output);
+
 void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
 void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
 int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output);
 int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output);
 int starpu_perfmodel_print_estimations(struct starpu_perfmodel *model, uint32_t footprint, FILE *output);
 int starpu_perfmodel_print_estimations(struct starpu_perfmodel *model, uint32_t footprint, FILE *output);
 
 
 int starpu_perfmodel_list_combs(FILE *output, struct starpu_perfmodel *model);
 int starpu_perfmodel_list_combs(FILE *output, struct starpu_perfmodel *model);
 
 
+/**
+   Feed the performance model \p model with an explicit
+   measurement \p measured (in µs), in addition to measurements done by StarPU
+   itself. This can be useful when the application already has an
+   existing set of measurements done in good conditions, that StarPU
+   could benefit from instead of doing on-line measurements. An example
+   of use can be seen in \ref PerformanceModelExample.
+*/
 void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double measured);
 void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double measured);
+
+/**
+   Print the directory name storing performance models on \p output
+*/
 void starpu_perfmodel_directory(FILE *output);
 void starpu_perfmodel_directory(FILE *output);
 
 
+/**
+   Print a matrix of bus bandwidths on \p f.
+*/
 void starpu_bus_print_bandwidth(FILE *f);
 void starpu_bus_print_bandwidth(FILE *f);
+
+/**
+   Print the affinity devices on \p f.
+*/
 void starpu_bus_print_affinity(FILE *f);
 void starpu_bus_print_affinity(FILE *f);
+
+/**
+   Print on \p f the name of the files containing the matrix of bus bandwidths, the affinity devices and the latency.
+*/
 void starpu_bus_print_filenames(FILE *f);
 void starpu_bus_print_filenames(FILE *f);
 
 
+/**
+   Return the bandwidth of data transfer between two memory nodes
+*/
 double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node);
 double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node);
+
+/**
+   Return the latency of data transfer between two memory nodes
+*/
 double starpu_transfer_latency(unsigned src_node, unsigned dst_node);
 double starpu_transfer_latency(unsigned src_node, unsigned dst_node);
-double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size);
 
 
-extern struct starpu_perfmodel starpu_perfmodel_nop;
+/**
+   Return the estimated time to transfer a given size between two memory nodes.
+*/
+double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size);
 
 
 /**
 /**
-   Display statistics about the current data handles registered
-   within StarPU. StarPU must have been configured with the configure
-   option \ref enable-memory-stats "--enable-memory-stats" (see \ref
-   MemoryFeedback).
+   Performance model which always returns 1µs.
 */
 */
-void starpu_data_display_memory_stats();
+extern struct starpu_perfmodel starpu_perfmodel_nop;
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }

+ 125 - 4
include/starpu_profiling.h

@@ -19,10 +19,10 @@
 #ifndef __STARPU_PROFILING_H__
 #ifndef __STARPU_PROFILING_H__
 #define __STARPU_PROFILING_H__
 #define __STARPU_PROFILING_H__
 
 
-/** @defgroup
- *
- * @{
- */
+/**
+   @defgroup API_Profiling Profiling
+   @{
+*/
 
 
 #include <starpu.h>
 #include <starpu.h>
 #include <errno.h>
 #include <errno.h>
@@ -33,48 +33,89 @@ extern "C"
 {
 {
 #endif
 #endif
 
 
+/**
+   Used when calling the function starpu_profiling_status_set() to disable profiling.
+*/
 #define STARPU_PROFILING_DISABLE	0
 #define STARPU_PROFILING_DISABLE	0
+/**
+   Used when calling the function starpu_profiling_status_set() to enable profiling.
+*/
 #define STARPU_PROFILING_ENABLE		1
 #define STARPU_PROFILING_ENABLE		1
 
 
+/**
+   Information about the execution of a task. It is accessible from
+   the field starpu_task::profiling_info if profiling was enabled.
+ */
 struct starpu_profiling_task_info
 struct starpu_profiling_task_info
 {
 {
+	/** Date of task submission (relative to the initialization of StarPU). */
 	struct timespec submit_time;
 	struct timespec submit_time;
 
 
+	/** Time when the task was submitted to the scheduler. */
 	struct timespec push_start_time;
 	struct timespec push_start_time;
+	/** Time when the scheduler finished with the task submission. */
 	struct timespec push_end_time;
 	struct timespec push_end_time;
+	/** Time when the scheduler started to be requested for a task, and eventually gave that task. */
 	struct timespec pop_start_time;
 	struct timespec pop_start_time;
+	/** Time when the scheduler finished providing the task for execution. */
 	struct timespec pop_end_time;
 	struct timespec pop_end_time;
 
 
+	/** Time when the worker started fetching input data. */
 	struct timespec acquire_data_start_time;
 	struct timespec acquire_data_start_time;
+	/** Time when the worker finished fetching input data. */
 	struct timespec acquire_data_end_time;
 	struct timespec acquire_data_end_time;
 
 
+	/** Date of task execution beginning (relative to the initialization of StarPU). */
 	struct timespec start_time;
 	struct timespec start_time;
+	/** Date of task execution termination (relative to the initialization of StarPU). */
 	struct timespec end_time;
 	struct timespec end_time;
 
 
+	/** Time when the worker started releasing data. */
 	struct timespec release_data_start_time;
 	struct timespec release_data_start_time;
+	/** Time when the worker finished releasing data. */
 	struct timespec release_data_end_time;
 	struct timespec release_data_end_time;
 
 
+	/** Time when the worker started the application callback for the task. */
 	struct timespec callback_start_time;
 	struct timespec callback_start_time;
+	/** Time when the worker finished the application callback for the task. */
 	struct timespec callback_end_time;
 	struct timespec callback_end_time;
 
 
 	/* TODO add expected length, expected start/end ? */
 	/* TODO add expected length, expected start/end ? */
+
+	/** Identifier of the worker which has executed the task. */
 	int workerid;
 	int workerid;
 
 
+	/** Number of cycles used by the task, only available in the MoviSim */
 	uint64_t used_cycles;
 	uint64_t used_cycles;
+	/** Number of cycles stalled within the task, only available in the MoviSim */
 	uint64_t stall_cycles;
 	uint64_t stall_cycles;
+	/** Energy consumed by the task, in Joules */
 	double energy_consumed;
 	double energy_consumed;
 };
 };
 
 
+/**
+   Profiling information associated to a worker. The timing is
+   provided since the previous call to
+   starpu_profiling_worker_get_info()
+*/
 struct starpu_profiling_worker_info
 struct starpu_profiling_worker_info
 {
 {
+	/** Starting date for the reported profiling measurements. */
 	struct timespec start_time;
 	struct timespec start_time;
+	/** Duration of the profiling measurement interval. */
 	struct timespec total_time;
 	struct timespec total_time;
+	/** Time spent by the worker to execute tasks during the profiling measurement interval. */
 	struct timespec executing_time;
 	struct timespec executing_time;
+	/** Time spent idling by the worker during the profiling measurement interval. */
 	struct timespec sleeping_time;
 	struct timespec sleeping_time;
+	/** Number of tasks executed by the worker during the profiling measurement interval. */
 	int executed_tasks;
 	int executed_tasks;
 
 
+	/** Number of cycles used by the worker, only available in the MoviSim */
 	uint64_t used_cycles;
 	uint64_t used_cycles;
+	/** Number of cycles stalled within the worker, only available in the MoviSim */
 	uint64_t stall_cycles;
 	uint64_t stall_cycles;
+	/** Energy consumed by the worker, in Joules */
 	double energy_consumed;
 	double energy_consumed;
 
 
 	double flops;
 	double flops;
@@ -82,15 +123,43 @@ struct starpu_profiling_worker_info
 
 
 struct starpu_profiling_bus_info
 struct starpu_profiling_bus_info
 {
 {
+	/** Time of bus profiling startup. */
 	struct timespec start_time;
 	struct timespec start_time;
+	/** Total time of bus profiling. */
 	struct timespec total_time;
 	struct timespec total_time;
+	/** Number of bytes transferred during profiling. */
 	int long long transferred_bytes;
 	int long long transferred_bytes;
+	/** Number of transfers during profiling. */
 	int transfer_count;
 	int transfer_count;
 };
 };
 
 
+/**
+   Reset performance counters and enable profiling if the
+   environment variable \ref STARPU_PROFILING is set to a positive value.
+*/
 void starpu_profiling_init(void);
 void starpu_profiling_init(void);
+
+/**
+   Set the ID used for profiling trace filename. Has to be called before starpu_init().
+*/
 void starpu_profiling_set_id(int new_id);
 void starpu_profiling_set_id(int new_id);
+
+/**
+   Set the profiling status. Profiling is activated
+   by passing \ref STARPU_PROFILING_ENABLE in \p status. Passing
+   \ref STARPU_PROFILING_DISABLE disables profiling. Calling this function
+   resets all profiling measurements. When profiling is enabled, the
+   field starpu_task::profiling_info points to a valid structure
+   starpu_profiling_task_info containing information about the execution
+   of the task. Negative return values indicate an error, otherwise the
+   previous status is returned.
+*/
 int starpu_profiling_status_set(int status);
 int starpu_profiling_status_set(int status);
+
+/**
+   Return the current profiling status or a negative value in case
+   there was an error.
+*/
 int starpu_profiling_status_get(void);
 int starpu_profiling_status_get(void);
 
 
 #ifdef BUILDING_STARPU
 #ifdef BUILDING_STARPU
@@ -107,17 +176,43 @@ extern int _starpu_profiling;
 #endif
 #endif
 #endif
 #endif
 
 
+/**
+   Get the profiling info associated to the worker identified by
+   \p workerid, and reset the profiling measurements. If the argument \p
+   worker_info is <c>NULL</c>, only reset the counters associated to worker
+   \p workerid. Upon successful completion, this function returns 0.
+   Otherwise, a negative value is returned.
+*/
 int starpu_profiling_worker_get_info(int workerid, struct starpu_profiling_worker_info *worker_info);
 int starpu_profiling_worker_get_info(int workerid, struct starpu_profiling_worker_info *worker_info);
 
 
+/**
+   Return the number of buses in the machine
+*/
 int starpu_bus_get_count(void);
 int starpu_bus_get_count(void);
+
+/**
+   Return the identifier of the bus between \p src and \p dst
+*/
 int starpu_bus_get_id(int src, int dst);
 int starpu_bus_get_id(int src, int dst);
+
+/**
+   Return the source point of bus \p busid
+*/
 int starpu_bus_get_src(int busid);
 int starpu_bus_get_src(int busid);
+
+/**
+   Return the destination point of bus \p busid
+*/
 int starpu_bus_get_dst(int busid);
 int starpu_bus_get_dst(int busid);
 void starpu_bus_set_direct(int busid, int direct);
 void starpu_bus_set_direct(int busid, int direct);
 int starpu_bus_get_direct(int busid);
 int starpu_bus_get_direct(int busid);
 void starpu_bus_set_ngpus(int busid, int ngpus);
 void starpu_bus_set_ngpus(int busid, int ngpus);
 int starpu_bus_get_ngpus(int busid);
 int starpu_bus_get_ngpus(int busid);
 
 
+/**
+   See _starpu_profiling_bus_helper_display_summary in src/profiling/profiling_helpers.c for a usage example.
+   Note that calling starpu_bus_get_profiling_info() resets the counters to zero.
+*/
 int starpu_bus_get_profiling_info(int busid, struct starpu_profiling_bus_info *bus_info);
 int starpu_bus_get_profiling_info(int busid, struct starpu_profiling_bus_info *bus_info);
 
 
 /* Some helper functions to manipulate profiling API output */
 /* Some helper functions to manipulate profiling API output */
@@ -177,12 +272,38 @@ static __starpu_inline void starpu_timespec_sub(const struct timespec *a,
 #define starpu_timespec_cmp(a, b, CMP)                          \
 #define starpu_timespec_cmp(a, b, CMP)                          \
 	(((a)->tv_sec == (b)->tv_sec) ? ((a)->tv_nsec CMP (b)->tv_nsec) : ((a)->tv_sec CMP (b)->tv_sec))
 	(((a)->tv_sec == (b)->tv_sec) ? ((a)->tv_nsec CMP (b)->tv_nsec) : ((a)->tv_sec CMP (b)->tv_sec))
 
 
+/**
+   Return the time elapsed between \p start and \p end in microseconds.
+*/
 double starpu_timing_timespec_delay_us(struct timespec *start, struct timespec *end);
 double starpu_timing_timespec_delay_us(struct timespec *start, struct timespec *end);
+
+/**
+   Convert the given timespec \p ts into microseconds
+*/
 double starpu_timing_timespec_to_us(struct timespec *ts);
 double starpu_timing_timespec_to_us(struct timespec *ts);
 
 
+/**
+   Display statistics about the bus on \c stderr if the environment
+   variable \ref STARPU_BUS_STATS is defined. The function is called
+   automatically by starpu_shutdown().
+*/
 void starpu_profiling_bus_helper_display_summary(void);
 void starpu_profiling_bus_helper_display_summary(void);
+
+/**
+   Display statistics about the workers on \c stderr if the
+   environment variable \ref STARPU_WORKER_STATS is defined. The function is
+   called automatically by starpu_shutdown().
+*/
 void starpu_profiling_worker_helper_display_summary(void);
 void starpu_profiling_worker_helper_display_summary(void);
 
 
+/**
+   Display statistics about the current data handles registered
+   within StarPU. StarPU must have been configured with the configure
+   option \ref enable-memory-stats "--enable-memory-stats" (see \ref
+   MemoryFeedback).
+*/
+void starpu_data_display_memory_stats();
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 3 - 3
include/starpu_rand.h

@@ -19,9 +19,9 @@
 #ifndef __STARPU_RAND_H__
 #ifndef __STARPU_RAND_H__
 #define __STARPU_RAND_H__
 #define __STARPU_RAND_H__
 
 
-/** @defgroup
- *
- * @{
+/**
+   @defgroup API_Random_Functions Random Functions
+   @{
  */
  */
 
 
 #include <stdlib.h>
 #include <stdlib.h>

+ 17 - 4
include/starpu_scc.h

@@ -19,19 +19,32 @@
 #ifndef __STARPU_SCC_H__
 #ifndef __STARPU_SCC_H__
 #define __STARPU_SCC_H__
 #define __STARPU_SCC_H__
 
 
-/** @defgroup
- *
- * @{
- */
+/**
+   @defgroup API_SCC_Extensions SCC Extensions
+   @{
+*/
 
 
 #include <starpu_config.h>
 #include <starpu_config.h>
 
 
 #ifdef STARPU_USE_SCC
 #ifdef STARPU_USE_SCC
 
 
+/**
+   Type for SCC function symbols
+*/
 typedef void *starpu_scc_func_symbol_t;
 typedef void *starpu_scc_func_symbol_t;
 
 
+/**
+   Initiate a lookup on each SCC device to find the address of the
+   function named \p func_name, store them in the global array kernels
+   and return the index in the array through \p symbol.
+*/
 int starpu_scc_register_kernel(starpu_scc_func_symbol_t *symbol, const char *func_name);
 int starpu_scc_register_kernel(starpu_scc_func_symbol_t *symbol, const char *func_name);
 
 
+/**
+   If success, return the pointer to the function defined by \p symbol on
+   the device linked to the called device. This can for instance be used
+   in a starpu_scc_func_symbol_t implementation.
+*/
 starpu_scc_kernel_t starpu_scc_get_kernel(starpu_scc_func_symbol_t symbol);
 starpu_scc_kernel_t starpu_scc_get_kernel(starpu_scc_func_symbol_t symbol);
 
 
 #endif /* STARPU_USE_SCC */
 #endif /* STARPU_USE_SCC */

+ 506 - 10
include/starpu_sched_component.h

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2017                                     Arthur Chevalier
  * Copyright (C) 2017                                     Arthur Chevalier
  * Copyright (C) 2013,2014,2017                           Inria
  * Copyright (C) 2013,2014,2017                           Inria
- * Copyright (C) 2014,2015,2017,2019                           CNRS
+ * Copyright (C) 2014,2015,2017,2019                      CNRS
  * Copyright (C) 2014-2019                                Université de Bordeaux
  * Copyright (C) 2014-2019                                Université de Bordeaux
  * Copyright (C) 2013                                     Simon Archipoff
  * Copyright (C) 2013                                     Simon Archipoff
  *
  *
@@ -21,10 +21,10 @@
 #ifndef __STARPU_SCHED_COMPONENT_H__
 #ifndef __STARPU_SCHED_COMPONENT_H__
 #define __STARPU_SCHED_COMPONENT_H__
 #define __STARPU_SCHED_COMPONENT_H__
 
 
-/** @defgroup
- *
- * @{
- */
+/**
+   @defgroup API_Modularized_Scheduler Modularized Scheduler Interface
+   @{
+*/
 
 
 #include <starpu.h>
 #include <starpu.h>
 
 
@@ -37,106 +37,379 @@ extern "C"
 {
 {
 #endif
 #endif
 
 
+/**
+   flags for starpu_sched_component::properties
+*/
 enum starpu_sched_component_properties
 enum starpu_sched_component_properties
 {
 {
+	/** indicate that all workers have the same starpu_worker_archtype */
 	STARPU_SCHED_COMPONENT_HOMOGENEOUS = (1<<0),
 	STARPU_SCHED_COMPONENT_HOMOGENEOUS = (1<<0),
+	/** indicate that all workers have the same memory component */
 	STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE = (1<<1)
 	STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE = (1<<1)
 };
 };
 
 
+/**
+   indicate if component is homogeneous
+*/
 #define STARPU_SCHED_COMPONENT_IS_HOMOGENEOUS(component) ((component)->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS)
 #define STARPU_SCHED_COMPONENT_IS_HOMOGENEOUS(component) ((component)->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS)
+
+/**
+   indicate if all workers have the same memory component
+*/
 #define STARPU_SCHED_COMPONENT_IS_SINGLE_MEMORY_NODE(component) ((component)->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE)
 #define STARPU_SCHED_COMPONENT_IS_SINGLE_MEMORY_NODE(component) ((component)->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE)
 
 
+/**
+   Structure for a scheduler module.  A scheduler is a
+   tree-like structure of them, some parts of scheduler can be shared by
+   several contexts to perform some local optimisations, so, for all
+   components, a list of parents is defined by \c sched_ctx_id. They
+   embed their specialised methods in a pseudo object-style, so calls are
+   like <c>component->push_task(component,task)</c>
+*/
 struct starpu_sched_component
 struct starpu_sched_component
 {
 {
+	/** The tree containing the component*/
 	struct starpu_sched_tree *tree;
 	struct starpu_sched_tree *tree;
+	/** set of underlying workers */
 	struct starpu_bitmap *workers;
 	struct starpu_bitmap *workers;
+	/**
+	   subset of starpu_sched_component::workers that is currently available in the context
+	   The push method should take this value into account, it is set with:
+	   component->workers UNION tree->workers UNION
+	   component->child[i]->workers_in_ctx iff exist x such as component->children[i]->parents[x] == component
+	*/
 	struct starpu_bitmap *workers_in_ctx;
 	struct starpu_bitmap *workers_in_ctx;
+	/** private data */
 	void *data;
 	void *data;
 	char *name;
 	char *name;
+	/** number of component's children */
 	unsigned nchildren;
 	unsigned nchildren;
+	/** vector of component's children */
 	struct starpu_sched_component **children;
 	struct starpu_sched_component **children;
+	/** number of component's parents */
 	unsigned nparents;
 	unsigned nparents;
+	/** vector of component's parents */
 	struct starpu_sched_component **parents;
 	struct starpu_sched_component **parents;
 
 
+	/** add a child to component */
 	void (*add_child)(struct starpu_sched_component *component, struct starpu_sched_component *child);
 	void (*add_child)(struct starpu_sched_component *component, struct starpu_sched_component *child);
+	/** remove a child from component */
 	void (*remove_child)(struct starpu_sched_component *component, struct starpu_sched_component *child);
 	void (*remove_child)(struct starpu_sched_component *component, struct starpu_sched_component *child);
 	void (*add_parent)(struct starpu_sched_component *component, struct starpu_sched_component *parent);
 	void (*add_parent)(struct starpu_sched_component *component, struct starpu_sched_component *parent);
 	void (*remove_parent)(struct starpu_sched_component *component, struct starpu_sched_component *parent);
 	void (*remove_parent)(struct starpu_sched_component *component, struct starpu_sched_component *parent);
 
 
+	/**
+	   push a task in the scheduler module. this function is called to
+	   push a task on component subtree, this can either perform a
+	   recursive call on a child or store the task in the component,
+	   then it will be returned by a further pull_task call.
+	   the caller must ensure that component is able to execute task.
+	   This method must either return 0 if the task was properly stored or
+	   passed over to a child component, or return a value different from 0 if the
+	   task could not be consumed (e.g. the queue is full).
+	*/
 	int (*push_task)(struct starpu_sched_component *, struct starpu_task *);
 	int (*push_task)(struct starpu_sched_component *, struct starpu_task *);
+
+	/**
+	   pop a task from the scheduler module. this function is called by workers to get a task from their
+	   parents. this function should first return a locally stored task
+	   or perform a recursive call on the parents.
+	   the task returned by this function should be executable by the caller
+	*/
 	struct starpu_task *(*pull_task)(struct starpu_sched_component *from, struct starpu_sched_component *to);
 	struct starpu_task *(*pull_task)(struct starpu_sched_component *from, struct starpu_sched_component *to);
 
 
+	/**
+	   This function is called by a component which implements a queue,
+	   allowing it to signify to its parents that an empty slot is
+	   available in its queue. This should return 1 if some tasks could be pushed
+	   The basic implementation of this function
+	   is a recursive call to its parents, the user has to specify a
+	   personally-made function to catch those calls.
+	*/
 	int (*can_push)(struct starpu_sched_component *from, struct starpu_sched_component *to);
 	int (*can_push)(struct starpu_sched_component *from, struct starpu_sched_component *to);
+
+	/**
+	   This function allows a component to wake up a worker. It is
+	   currently called by components which implement a queue, to
+	   signify to their children that a task has been pushed in their local
+	   queue, and is available to be popped by a worker, for example.
+	   This should return 1 if some container or worker could (or will) pull
+	   some tasks.
+	   The basic implementation of this function is a recursive call to
+	   its children, until at least one worker has been woken up.
+	*/
 	int (*can_pull)(struct starpu_sched_component *component);
 	int (*can_pull)(struct starpu_sched_component *component);
 
 
 	int (*notify)(struct starpu_sched_component* component, int message_ID, void* arg);
 	int (*notify)(struct starpu_sched_component* component, int message_ID, void* arg);
 
 
+	/**
+	   heuristic to compute load of scheduler module. Basically the number of tasks divided by the sum
+	   of relative speedups of workers available in context.
+	   estimated_load(component) = sum(estimated_load(component_children)) + nb_local_tasks / average(relative_speedup(underlying_worker))
+	*/
 	double (*estimated_load)(struct starpu_sched_component *component);
 	double (*estimated_load)(struct starpu_sched_component *component);
+	/**
+	   return the time when a worker will enter in starvation. This function is relevant only if the task->predicted
+	   member has been set.
+	*/
 	double (*estimated_end)(struct starpu_sched_component *component);
 	double (*estimated_end)(struct starpu_sched_component *component);
 
 
+	/**
+	   called by starpu_sched_component_destroy. Should free data allocated during creation
+	*/
 	void (*deinit_data)(struct starpu_sched_component *component);
 	void (*deinit_data)(struct starpu_sched_component *component);
+
+	/**
+	   this function is called for each component when workers are added or removed from a context
+	*/
 	void (*notify_change_workers)(struct starpu_sched_component *component);
 	void (*notify_change_workers)(struct starpu_sched_component *component);
 	int properties;
 	int properties;
 
 
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
+	/**
+	   the hwloc object associated to scheduler module. points to the
+	   part of the topology that is bound to this component, e.g. a NUMA
+	   node for a ws component that would balance load between
+	   underlying sockets
+	*/
 	hwloc_obj_t obj;
 	hwloc_obj_t obj;
 #else
 #else
 	void *obj;
 	void *obj;
 #endif
 #endif
 };
 };
 
 
+/**
+   The actual scheduler
+*/
 struct starpu_sched_tree
 struct starpu_sched_tree
 {
 {
+	/**
+	   entry module of the scheduler
+	*/
 	struct starpu_sched_component *root;
 	struct starpu_sched_component *root;
+	/**
+	   set of workers available in this context, this value is used to mask workers in modules
+	*/
 	struct starpu_bitmap *workers;
 	struct starpu_bitmap *workers;
+	/**
+	   context id of the scheduler
+	*/
 	unsigned sched_ctx_id;
 	unsigned sched_ctx_id;
+	/**
+	   lock used to protect the scheduler, it is taken in read mode pushing a task and in write mode for adding or
+	   removing workers
+	*/
 	starpu_pthread_mutex_t lock;
 	starpu_pthread_mutex_t lock;
 };
 };
 
 
+void starpu_initialize_prio_center_policy(unsigned sched_ctx_id);
+
+/**
+   @name Scheduling Tree API
+   @{
+*/
+
+/**
+   create an empty initialized starpu_sched_tree
+*/
 struct starpu_sched_tree *starpu_sched_tree_create(unsigned sched_ctx_id) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_tree *starpu_sched_tree_create(unsigned sched_ctx_id) STARPU_ATTRIBUTE_MALLOC;
+/**
+   destroy tree and free all non-shared components in it.
+*/
 void starpu_sched_tree_destroy(struct starpu_sched_tree *tree);
 void starpu_sched_tree_destroy(struct starpu_sched_tree *tree);
 struct starpu_sched_tree *starpu_sched_tree_get(unsigned sched_ctx_id);
 struct starpu_sched_tree *starpu_sched_tree_get(unsigned sched_ctx_id);
+/**
+   recursively set all starpu_sched_component::workers, do not take into account shared parts (except workers).
+*/
 void starpu_sched_tree_update_workers(struct starpu_sched_tree *t);
 void starpu_sched_tree_update_workers(struct starpu_sched_tree *t);
+/**
+   recursively set all starpu_sched_component::workers_in_ctx, do not take into account shared parts (except workers)
+*/
 void starpu_sched_tree_update_workers_in_ctx(struct starpu_sched_tree *t);
 void starpu_sched_tree_update_workers_in_ctx(struct starpu_sched_tree *t);
+/**
+   compatibility with starpu_sched_policy interface
+*/
 int starpu_sched_tree_push_task(struct starpu_task *task);
 int starpu_sched_tree_push_task(struct starpu_task *task);
-int starpu_sched_component_push_task(struct starpu_sched_component *from, struct starpu_sched_component *to, struct starpu_task *task);
+/**
+   compatibility with starpu_sched_policy interface
+*/
 struct starpu_task *starpu_sched_tree_pop_task(unsigned sched_ctx);
 struct starpu_task *starpu_sched_tree_pop_task(unsigned sched_ctx);
+
+/**
+   Push a task to a component. This is a helper for <c>component->push_task(component, task)</c> plus tracing.
+*/
+int starpu_sched_component_push_task(struct starpu_sched_component *from, struct starpu_sched_component *to, struct starpu_task *task);
+
+/**
+   Pull a task from a component. This is a helper for <c>component->pull_task(component)</c> plus tracing.
+*/
 struct starpu_task *starpu_sched_component_pull_task(struct starpu_sched_component *from, struct starpu_sched_component *to);
 struct starpu_task *starpu_sched_component_pull_task(struct starpu_sched_component *from, struct starpu_sched_component *to);
+
 struct starpu_task* starpu_sched_component_pump_to(struct starpu_sched_component *component, struct starpu_sched_component *to, int* success);
 struct starpu_task* starpu_sched_component_pump_to(struct starpu_sched_component *component, struct starpu_sched_component *to, int* success);
 struct starpu_task* starpu_sched_component_pump_downstream(struct starpu_sched_component *component, int* success);
 struct starpu_task* starpu_sched_component_pump_downstream(struct starpu_sched_component *component, int* success);
 int starpu_sched_component_send_can_push_to_parents(struct starpu_sched_component * component);
 int starpu_sched_component_send_can_push_to_parents(struct starpu_sched_component * component);
-
+/**
+   compatibility with starpu_sched_policy interface
+*/
 void starpu_sched_tree_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 void starpu_sched_tree_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
+/**
+   compatibility with starpu_sched_policy interface
+*/
 void starpu_sched_tree_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 void starpu_sched_tree_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 
 
+/**
+   Attach component \p child to parent \p parent. Some component may accept only one child, others accept several (e.g. MCT)
+*/
+void starpu_sched_component_connect(struct starpu_sched_component *parent, struct starpu_sched_component *child);
+
+/** @} */
+
+/**
+   @name Generic Scheduling Component API
+   @{
+*/
+
 typedef struct starpu_sched_component * (*starpu_sched_component_create_t)(struct starpu_sched_tree *tree, void *data);
 typedef struct starpu_sched_component * (*starpu_sched_component_create_t)(struct starpu_sched_tree *tree, void *data);
+
+/**
+   allocate and initialize component fields with default values:
+   .pop_task make recursive call on father
+   .estimated_load compute relative speedup and tasks in sub tree
+   .estimated_end return the minimum of recursive call on children
+   .add_child is starpu_sched_component_add_child
+   .remove_child is starpu_sched_component_remove_child
+   .notify_change_workers does nothing
+   .deinit_data does nothing
+*/
 struct starpu_sched_component *starpu_sched_component_create(struct starpu_sched_tree *tree, const char *name) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_component *starpu_sched_component_create(struct starpu_sched_tree *tree, const char *name) STARPU_ATTRIBUTE_MALLOC;
-void starpu_sched_component_add_child(struct starpu_sched_component* component, struct starpu_sched_component * child);
+
+/**
+   free data allocated by starpu_sched_component_create and call component->deinit_data(component)
+   set to <c>NULL</c> the member starpu_sched_component::fathers[sched_ctx_id] of all children if it is equal to \p component
+*/
+
 void starpu_sched_component_destroy(struct starpu_sched_component *component);
 void starpu_sched_component_destroy(struct starpu_sched_component *component);
+/**
+   recursively destroy non shared parts of a \p component 's tree
+*/
 void starpu_sched_component_destroy_rec(struct starpu_sched_component *component);
 void starpu_sched_component_destroy_rec(struct starpu_sched_component *component);
+
+void starpu_sched_component_add_child(struct starpu_sched_component* component, struct starpu_sched_component * child);
+
+/**
+   return true iff \p component can execute \p task, this function takes into account the workers available in the scheduling context
+*/
 int starpu_sched_component_can_execute_task(struct starpu_sched_component *component, struct starpu_task *task);
 int starpu_sched_component_can_execute_task(struct starpu_sched_component *component, struct starpu_task *task);
+
+/**
+   return a non <c>NULL</c> value if \p component can execute \p task.
+   write the execution prediction length for the best implementation of the best worker available and write this at \p length address.
+   this result is more relevant if starpu_sched_component::is_homogeneous is non <c>NULL</c>.
+   if a worker needs to be calibrated for an implementation, nan is set to \p length.
+*/
 int STARPU_WARN_UNUSED_RESULT starpu_sched_component_execute_preds(struct starpu_sched_component *component, struct starpu_task *task, double *length);
 int STARPU_WARN_UNUSED_RESULT starpu_sched_component_execute_preds(struct starpu_sched_component *component, struct starpu_task *task, double *length);
+
+/**
+   return the average time to transfer \p task data to underlying \p component workers.
+*/
 double starpu_sched_component_transfer_length(struct starpu_sched_component *component, struct starpu_task *task);
 double starpu_sched_component_transfer_length(struct starpu_sched_component *component, struct starpu_task *task);
+
 void starpu_sched_component_prefetch_on_node(struct starpu_sched_component *component, struct starpu_task *task);
 void starpu_sched_component_prefetch_on_node(struct starpu_sched_component *component, struct starpu_task *task);
 
 
-void starpu_sched_component_connect(struct starpu_sched_component *parent, struct starpu_sched_component *child);
+/** @} */
+
+/**
+   @name Worker Component API
+   @{
+*/
 
 
+/**
+   return the struct starpu_sched_component corresponding to \p workerid. Undefined if \p workerid is not a valid workerid
+*/
 struct starpu_sched_component *starpu_sched_component_worker_get(unsigned sched_ctx, int workerid);
 struct starpu_sched_component *starpu_sched_component_worker_get(unsigned sched_ctx, int workerid);
 struct starpu_sched_component *starpu_sched_component_worker_new(unsigned sched_ctx, int workerid);
 struct starpu_sched_component *starpu_sched_component_worker_new(unsigned sched_ctx, int workerid);
+
+/**
+   Create a combined worker that pushes tasks in parallel to workers \p workers (size \p nworkers).
+*/
 struct starpu_sched_component *starpu_sched_component_parallel_worker_create(struct starpu_sched_tree *tree, unsigned nworkers, unsigned *workers);
 struct starpu_sched_component *starpu_sched_component_parallel_worker_create(struct starpu_sched_tree *tree, unsigned nworkers, unsigned *workers);
+
+/**
+   return the workerid of \p worker_component, undefined if starpu_sched_component_is_worker(worker_component) == 0
+*/
 int starpu_sched_component_worker_get_workerid(struct starpu_sched_component *worker_component);
 int starpu_sched_component_worker_get_workerid(struct starpu_sched_component *worker_component);
+
+/**
+   return true iff \p component is a worker component
+*/
 int starpu_sched_component_is_worker(struct starpu_sched_component *component);
 int starpu_sched_component_is_worker(struct starpu_sched_component *component);
+
+/**
+   return true iff \p component is a simple worker component
+*/
 int starpu_sched_component_is_simple_worker(struct starpu_sched_component *component);
 int starpu_sched_component_is_simple_worker(struct starpu_sched_component *component);
+
+/**
+   return true iff \p component is a combined worker component
+*/
 int starpu_sched_component_is_combined_worker(struct starpu_sched_component *component);
 int starpu_sched_component_is_combined_worker(struct starpu_sched_component *component);
+
+/**
+   compatibility with starpu_sched_policy interface
+   update predictions for workers
+*/
 void starpu_sched_component_worker_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id);
 void starpu_sched_component_worker_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id);
+
+/**
+   compatibility with starpu_sched_policy interface
+*/
 void starpu_sched_component_worker_post_exec_hook(struct starpu_task *task, unsigned sched_ctx_id);
 void starpu_sched_component_worker_post_exec_hook(struct starpu_task *task, unsigned sched_ctx_id);
 
 
+/** @} */
+
+/**
+   @name Flow-control Fifo Component API
+   @{
+*/
+
+/**
+   default function for the can_push component method, just call can_push of parents until one of them returns non-zero
+*/
 int starpu_sched_component_can_push(struct starpu_sched_component * component, struct starpu_sched_component * to);
 int starpu_sched_component_can_push(struct starpu_sched_component * component, struct starpu_sched_component * to);
+
+/**
+   default function for the can_pull component method, just call can_pull of children until one of them returns non-zero
+*/
 int starpu_sched_component_can_pull(struct starpu_sched_component * component);
 int starpu_sched_component_can_pull(struct starpu_sched_component * component);
+
+/**
+   function for the can_pull component method, call can_pull of all children
+*/
 int starpu_sched_component_can_pull_all(struct starpu_sched_component * component);
 int starpu_sched_component_can_pull_all(struct starpu_sched_component * component);
+
+/**
+   default function for the estimated_load component method, just sum up the loads
+   of the children of the component.
+*/
 double starpu_sched_component_estimated_load(struct starpu_sched_component * component);
 double starpu_sched_component_estimated_load(struct starpu_sched_component * component);
+
+/**
+   function that can be used for the estimated_end component method, compute the minimum completion time of the children.
+*/
 double starpu_sched_component_estimated_end_min(struct starpu_sched_component * component);
 double starpu_sched_component_estimated_end_min(struct starpu_sched_component * component);
+
+/**
+   function that can be used for the estimated_end component method, compute
+   the minimum completion time of the children, and add to it an estimation of how
+   existing queued work, plus the exp_len work, can be completed. This is typically
+   used instead of starpu_sched_component_estimated_end_min when the component
+   contains a queue of tasks, which thus needs to be added to the estimations.
+*/
 double starpu_sched_component_estimated_end_min_add(struct starpu_sched_component * component, double exp_len);
 double starpu_sched_component_estimated_end_min_add(struct starpu_sched_component * component, double exp_len);
+
+/**
+   default function for the estimated_end component method, compute the average completion time of the children.
+*/
 double starpu_sched_component_estimated_end_average(struct starpu_sched_component * component);
 double starpu_sched_component_estimated_end_average(struct starpu_sched_component * component);
 
 
 struct starpu_sched_component_fifo_data
 struct starpu_sched_component_fifo_data
@@ -145,9 +418,25 @@ struct starpu_sched_component_fifo_data
 	double exp_len_threshold;
 	double exp_len_threshold;
 };
 };
 
 
+/**
+   Return a struct starpu_sched_component with a fifo. A stable sort is performed according to tasks priorities.
+   A push_task call on this component does not perform recursive calls, underlying components will have to call pop_task to get it.
+   The starpu_sched_component::estimated_end function computes the estimated length by dividing the sequential length by the number of underlying workers.
+*/
 struct starpu_sched_component *starpu_sched_component_fifo_create(struct starpu_sched_tree *tree, struct starpu_sched_component_fifo_data *fifo_data) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_component *starpu_sched_component_fifo_create(struct starpu_sched_tree *tree, struct starpu_sched_component_fifo_data *fifo_data) STARPU_ATTRIBUTE_MALLOC;
+
+/**
+   return true iff \p component is a fifo component
+*/
 int starpu_sched_component_is_fifo(struct starpu_sched_component *component);
 int starpu_sched_component_is_fifo(struct starpu_sched_component *component);
 
 
+/** @} */
+
+/**
+   @name Flow-control Prio Component API
+   @{
+*/
+
 struct starpu_sched_component_prio_data
 struct starpu_sched_component_prio_data
 {
 {
 	unsigned ntasks_threshold;
 	unsigned ntasks_threshold;
@@ -156,19 +445,70 @@ struct starpu_sched_component_prio_data
 struct starpu_sched_component *starpu_sched_component_prio_create(struct starpu_sched_tree *tree, struct starpu_sched_component_prio_data *prio_data) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_component *starpu_sched_component_prio_create(struct starpu_sched_tree *tree, struct starpu_sched_component_prio_data *prio_data) STARPU_ATTRIBUTE_MALLOC;
 int starpu_sched_component_is_prio(struct starpu_sched_component *component);
 int starpu_sched_component_is_prio(struct starpu_sched_component *component);
 
 
+/** @} */
+
+/**
+   @name Resource-mapping Work-Stealing Component API
+   @{
+*/
+
+/**
+   Return a component that performs work-stealing scheduling. Tasks are pushed in a round-robin way. estimated_end returns the average of the expected length of the fifos, starting at the average of the expected_end of its children. When a worker has to steal a task, it steals a task in a round-robin way, and gets the last pushed task of the highest priority.
+*/
 struct starpu_sched_component *starpu_sched_component_work_stealing_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_component *starpu_sched_component_work_stealing_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
+
+/**
+   return true iff \p component is a work stealing component
+ */
 int starpu_sched_component_is_work_stealing(struct starpu_sched_component *component);
 int starpu_sched_component_is_work_stealing(struct starpu_sched_component *component);
+
+/**
+   The behavior is undefined if there is no work stealing component in the scheduler. If there is one, \p task is pushed in a default way if the caller is the application, and in the caller's fifo if it is a worker.
+*/
 int starpu_sched_tree_work_stealing_push_task(struct starpu_task *task);
 int starpu_sched_tree_work_stealing_push_task(struct starpu_task *task);
 
 
+/** @} */
+
+/**
+   @name Resource-mapping Random Component API
+   @{
+*/
+
+/**
+   create a component that performs random scheduling
+*/
 struct starpu_sched_component *starpu_sched_component_random_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_component *starpu_sched_component_random_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
+
+/**
+   return true iff \p component is a random component
+*/
 int starpu_sched_component_is_random(struct starpu_sched_component *);
 int starpu_sched_component_is_random(struct starpu_sched_component *);
 
 
+/** @} */
+
+/**
+   @name Resource-mapping Eager Component API
+   @{
+*/
+
 struct starpu_sched_component *starpu_sched_component_eager_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_component *starpu_sched_component_eager_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
 int starpu_sched_component_is_eager(struct starpu_sched_component *);
 int starpu_sched_component_is_eager(struct starpu_sched_component *);
 
 
+/** @} */
+
+/**
+   @name Resource-mapping Eager-Calibration Component API
+   @{
+*/
+
 struct starpu_sched_component *starpu_sched_component_eager_calibration_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_component *starpu_sched_component_eager_calibration_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
 int starpu_sched_component_is_eager_calibration(struct starpu_sched_component *);
 int starpu_sched_component_is_eager_calibration(struct starpu_sched_component *);
 
 
+/** @} */
+
+/**
+   @name Resource-mapping MCT Component API
+   @{
+*/
+
 struct starpu_sched_component_mct_data
 struct starpu_sched_component_mct_data
 {
 {
 	double alpha;
 	double alpha;
@@ -176,14 +516,48 @@ struct starpu_sched_component_mct_data
 	double _gamma;
 	double _gamma;
 	double idle_power;
 	double idle_power;
 };
 };
+
+/**
+   create a component with mct_data parameters. The mct component does not
+   do anything but push tasks on no_perf_model_component and
+   calibrating_component
+*/
 struct starpu_sched_component *starpu_sched_component_mct_create(struct starpu_sched_tree *tree, struct starpu_sched_component_mct_data *mct_data) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_component *starpu_sched_component_mct_create(struct starpu_sched_tree *tree, struct starpu_sched_component_mct_data *mct_data) STARPU_ATTRIBUTE_MALLOC;
+
 int starpu_sched_component_is_mct(struct starpu_sched_component *component);
 int starpu_sched_component_is_mct(struct starpu_sched_component *component);
 
 
+/** @} */
+
+/**
+   @name Resource-mapping Heft Component API
+   @{
+*/
+
 struct starpu_sched_component *starpu_sched_component_heft_create(struct starpu_sched_tree *tree, struct starpu_sched_component_mct_data *mct_data) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_component *starpu_sched_component_heft_create(struct starpu_sched_tree *tree, struct starpu_sched_component_mct_data *mct_data) STARPU_ATTRIBUTE_MALLOC;
 int starpu_sched_component_is_heft(struct starpu_sched_component *component);
 int starpu_sched_component_is_heft(struct starpu_sched_component *component);
 
 
+/** @} */
+
+/**
+   @name Special-purpose Best_Implementation Component API
+   @{
+*/
+
+/**
+   Select the implementation that offers the shortest computation length for the first worker that can execute the task,
+   or an implementation that needs to be calibrated.
+   Also set starpu_task::predicted and starpu_task::predicted_transfer for the memory component of the first suitable workerid.
+   If the starpu_sched_component::push method is called and starpu_sched_component::nchild > 1, the result is undefined.
+*/
 struct starpu_sched_component *starpu_sched_component_best_implementation_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_component *starpu_sched_component_best_implementation_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
 
 
+/** @} */
+
+/**
+   @name Special-purpose Perfmodel_Select Component API
+   @{
+*/
+
 struct starpu_sched_component_perfmodel_select_data
 struct starpu_sched_component_perfmodel_select_data
 {
 {
 	struct starpu_sched_component *calibrator_component;
 	struct starpu_sched_component *calibrator_component;
@@ -193,46 +567,168 @@ struct starpu_sched_component_perfmodel_select_data
 struct starpu_sched_component *starpu_sched_component_perfmodel_select_create(struct starpu_sched_tree *tree, struct starpu_sched_component_perfmodel_select_data *perfmodel_select_data) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_component *starpu_sched_component_perfmodel_select_create(struct starpu_sched_tree *tree, struct starpu_sched_component_perfmodel_select_data *perfmodel_select_data) STARPU_ATTRIBUTE_MALLOC;
 int starpu_sched_component_is_perfmodel_select(struct starpu_sched_component *component);
 int starpu_sched_component_is_perfmodel_select(struct starpu_sched_component *component);
 
 
-void starpu_initialize_prio_center_policy(unsigned sched_ctx_id);
+/** @} */
 
 
+/**
+   @name Recipe Component API
+   @{
+*/
+
+/**
+   parameters for starpu_sched_component_composed_component_create
+*/
 struct starpu_sched_component_composed_recipe;
 struct starpu_sched_component_composed_recipe;
+
+/**
+   return an empty recipe for a composed component, it should not be used without modification
+*/
 struct starpu_sched_component_composed_recipe *starpu_sched_component_composed_recipe_create(void) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_component_composed_recipe *starpu_sched_component_composed_recipe_create(void) STARPU_ATTRIBUTE_MALLOC;
+
+/**
+   return a recipe to build a composed component with a \p create_component
+*/
 struct starpu_sched_component_composed_recipe *starpu_sched_component_composed_recipe_create_singleton(struct starpu_sched_component *(*create_component)(struct starpu_sched_tree *tree, void *arg), void *arg) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_component_composed_recipe *starpu_sched_component_composed_recipe_create_singleton(struct starpu_sched_component *(*create_component)(struct starpu_sched_tree *tree, void *arg), void *arg) STARPU_ATTRIBUTE_MALLOC;
+
+/**
+   add \p create_component under all previous components in recipe
+*/
 void starpu_sched_component_composed_recipe_add(struct starpu_sched_component_composed_recipe *recipe, struct starpu_sched_component *(*create_component)(struct starpu_sched_tree *tree, void *arg), void *arg);
 void starpu_sched_component_composed_recipe_add(struct starpu_sched_component_composed_recipe *recipe, struct starpu_sched_component *(*create_component)(struct starpu_sched_tree *tree, void *arg), void *arg);
+
+/**
+   destroy composed_sched_component, this should be done after starpu_sched_component_composed_component_create was called
+*/
 void starpu_sched_component_composed_recipe_destroy(struct starpu_sched_component_composed_recipe *);
 void starpu_sched_component_composed_recipe_destroy(struct starpu_sched_component_composed_recipe *);
+
+/**
+   Create a component that behaves as if all the components of the recipe were linked, except that you cannot use the starpu_sched_component_is_foo functions on it.
+   If the recipe contains a single create_foo arg_foo pair, create_foo(arg_foo) is returned instead of a composed component.
+*/
 struct starpu_sched_component *starpu_sched_component_composed_component_create(struct starpu_sched_tree *tree, struct starpu_sched_component_composed_recipe *recipe) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_component *starpu_sched_component_composed_component_create(struct starpu_sched_tree *tree, struct starpu_sched_component_composed_recipe *recipe) STARPU_ATTRIBUTE_MALLOC;
 
 
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
+/**
+   Define how to build a scheduler according to the topology. Each level (except for hwloc_machine_composed_sched_component) can be <c>NULL</c>, in which case
+   the level is just skipped. Bugs everywhere, do not rely on it.
+*/
 struct starpu_sched_component_specs
 struct starpu_sched_component_specs
 {
 {
+	/**
+	   the composed component to put on the top of the scheduler
+	   this member must not be <c>NULL</c> as it is the root of the topology
+	*/
 	struct starpu_sched_component_composed_recipe *hwloc_machine_composed_sched_component;
 	struct starpu_sched_component_composed_recipe *hwloc_machine_composed_sched_component;
+	/**
+	   the composed component to put for each memory component
+	*/
 	struct starpu_sched_component_composed_recipe *hwloc_component_composed_sched_component;
 	struct starpu_sched_component_composed_recipe *hwloc_component_composed_sched_component;
+	/**
+	   the composed component to put for each socket
+	*/
 	struct starpu_sched_component_composed_recipe *hwloc_socket_composed_sched_component;
 	struct starpu_sched_component_composed_recipe *hwloc_socket_composed_sched_component;
+	/**
+	   the composed component to put for each cache
+	*/
 	struct starpu_sched_component_composed_recipe *hwloc_cache_composed_sched_component;
 	struct starpu_sched_component_composed_recipe *hwloc_cache_composed_sched_component;
 
 
+	/**
+	   a function that returns a starpu_sched_component_composed_recipe to put on top of a worker of type \p archtype.
+	   <c>NULL</c> is a valid return value, then no component will be added on top
+	*/
 	struct starpu_sched_component_composed_recipe *(*worker_composed_sched_component)(enum starpu_worker_archtype archtype);
 	struct starpu_sched_component_composed_recipe *(*worker_composed_sched_component)(enum starpu_worker_archtype archtype);
+	/**
+	   This flag is a dirty hack because of the poor expressivity of this interface. As an example, if you want to build
+	   a heft component with a fifo component per numa component, and you also have GPUs, if this flag is set, GPUs will share those fifos.
+	   If this flag is not set, a new fifo will be built for each of them (if they have the same starpu_perf_arch and the same
+	   numa component it will be shared). It indicates whether heterogeneous workers should be brothers or cousins, for example whether a gpu and a cpu should share their numa node or not.
+	*/
 	int mix_heterogeneous_workers;
 	int mix_heterogeneous_workers;
 };
 };
 
 
+
+/**
+   build a scheduler for \p sched_ctx_id according to \p s and the hwloc topology of the machine.
+*/
 struct starpu_sched_tree *starpu_sched_component_make_scheduler(unsigned sched_ctx_id, struct starpu_sched_component_specs s);
 struct starpu_sched_tree *starpu_sched_component_make_scheduler(unsigned sched_ctx_id, struct starpu_sched_component_specs s);
 #endif /* STARPU_HAVE_HWLOC */
 #endif /* STARPU_HAVE_HWLOC */
 
 
+/** @} */
+
+/**
+   @name Basic API
+   @{
+*/
+
 #define STARPU_SCHED_SIMPLE_DECIDE_MASK		(3<<0)
 #define STARPU_SCHED_SIMPLE_DECIDE_MASK		(3<<0)
+
+/**
+   Request to create downstream queues per worker, i.e. the scheduling decision-making component will choose exactly which workers tasks should go to.
+*/
 #define STARPU_SCHED_SIMPLE_DECIDE_WORKERS	(1<<0)
 #define STARPU_SCHED_SIMPLE_DECIDE_WORKERS	(1<<0)
+
+/**
+   Request to create downstream queues per memory nodes, i.e. the scheduling decision-making component will choose which memory node tasks will go to.
+*/
 #define STARPU_SCHED_SIMPLE_DECIDE_MEMNODES	(2<<0)
 #define STARPU_SCHED_SIMPLE_DECIDE_MEMNODES	(2<<0)
+
+/**
+   Request to create downstream queues per computation arch, i.e. the scheduling decision-making component will choose whether tasks go to CPUs, or CUDA, or OpenCL, etc.
+*/
 #define STARPU_SCHED_SIMPLE_DECIDE_ARCHS	(3<<0)
 #define STARPU_SCHED_SIMPLE_DECIDE_ARCHS	(3<<0)
 
 
+/**
+   Request to add a perfmodel selector above the scheduling decision-making component. That way, only tasks with a calibrated performance model will be given to the component, other tasks will go to an eager branch that will distribute tasks so that their performance models will get calibrated.
+   In other words, this is needed when using a component which needs performance models for tasks.
+*/
 #define STARPU_SCHED_SIMPLE_PERFMODEL		(1<<4)
 #define STARPU_SCHED_SIMPLE_PERFMODEL		(1<<4)
+
+/**
+   Request that a component be added just above workers, that chooses the best task implementation.
+*/
 #define STARPU_SCHED_SIMPLE_IMPL		(1<<5)
 #define STARPU_SCHED_SIMPLE_IMPL		(1<<5)
+
+/**
+   Request to create a fifo above the scheduling decision-making component, otherwise tasks will be pushed directly to the component.
+
+   This is useful to store tasks if there is a fifo below which limits the number of tasks to be scheduled in advance. The scheduling decision-making component can also store tasks itself, in which case this flag is not useful.
+*/
 #define STARPU_SCHED_SIMPLE_FIFO_ABOVE		(1<<6)
 #define STARPU_SCHED_SIMPLE_FIFO_ABOVE		(1<<6)
+
+/**
+   Request that the fifo above be sorted by priorities
+*/
 #define STARPU_SCHED_SIMPLE_FIFO_ABOVE_PRIO	(1<<7)
 #define STARPU_SCHED_SIMPLE_FIFO_ABOVE_PRIO	(1<<7)
+
+/**
+   Request to create fifos below the scheduling decision-making component, otherwise tasks will be pulled directly from workers.
+
+   This is useful to be able to schedule a (tunable) small number of tasks in advance only.
+*/
 #define STARPU_SCHED_SIMPLE_FIFOS_BELOW		(1<<8)
 #define STARPU_SCHED_SIMPLE_FIFOS_BELOW		(1<<8)
+
+/**
+   Request that the fifos below be sorted by priorities
+*/
 #define STARPU_SCHED_SIMPLE_FIFOS_BELOW_PRIO	(1<<9)
 #define STARPU_SCHED_SIMPLE_FIFOS_BELOW_PRIO	(1<<9)
+
+/**
+   Request that work between workers using the same fifo below be distributed using a work stealing component.
+*/
 #define STARPU_SCHED_SIMPLE_WS_BELOW		(1<<10)
 #define STARPU_SCHED_SIMPLE_WS_BELOW		(1<<10)
+
+/**
+   Request to not only choose between simple workers, but also choose between combined workers.
+*/
 #define STARPU_SCHED_SIMPLE_COMBINED_WORKERS	(1<<11)
 #define STARPU_SCHED_SIMPLE_COMBINED_WORKERS	(1<<11)
 
 
+/**
+   Create a simple modular scheduler tree around a scheduling decision-making
+   component \p component. The details of what should be built around \p component
+   are described by \p flags. The different STARPU_SCHED_SIMPLE_DECIDE_* flags are
+   mutually exclusive. \p data is passed to the \p create_decision_component
+   function when creating the decision component.
+*/
 void starpu_sched_component_initialize_simple_scheduler(starpu_sched_component_create_t create_decision_component, void *data, unsigned flags, unsigned sched_ctx_id);
 void starpu_sched_component_initialize_simple_scheduler(starpu_sched_component_create_t create_decision_component, void *data, unsigned flags, unsigned sched_ctx_id);
 
 
+/** @} */
+
 #define STARPU_COMPONENT_MUTEX_LOCK(m) \
 #define STARPU_COMPONENT_MUTEX_LOCK(m) \
 do \
 do \
 { \
 { \