Quellcode durchsuchen

move documentation from independent doxygen files into public .h files

Nathalie Furmento vor 6 Jahren
Ursprung
Commit
6862bbddbf

+ 0 - 3
doc/doxygen/Makefile.am

@@ -116,8 +116,6 @@ chapters =	\
 	chapters/api/mic_extensions.doxy \
 	chapters/api/scc_extensions.doxy \
 	chapters/api/parallel_tasks.doxy \
-	chapters/api/performance_model.doxy \
-	chapters/api/profiling.doxy \
 	chapters/api/scheduling_contexts.doxy \
 	chapters/api/scheduling_policy.doxy \
 	chapters/api/standard_memory_library.doxy \
@@ -131,7 +129,6 @@ chapters =	\
 	chapters/api/toolbox.doxy \
 	chapters/api/sc_hypervisor/sc_hypervisor.doxy \
 	chapters/api/sc_hypervisor/sc_hypervisor_usage.doxy \
-	chapters/api/modularized_scheduler.doxy \
 	chapters/api/interoperability.doxy
 
 images = 	\

+ 0 - 560
doc/doxygen/chapters/api/modularized_scheduler.doxy

@@ -1,560 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2013,2014                                Inria
- * Copyright (C) 2013-2018                                CNRS
- * Copyright (C) 2009-2011,2014,2015,2017,2018-2019       Université de Bordeaux
- * Copyright (C) 2013                                     Simon Archipoff
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \defgroup API_Modularized_Scheduler Modularized Scheduler Interface
-
-\enum starpu_sched_component_properties
-\ingroup API_Modularized_Scheduler
-flags for starpu_sched_component::properties
-\var starpu_sched_component_properties::STARPU_SCHED_COMPONENT_HOMOGENEOUS
-     indicate that all workers have the same starpu_worker_archtype
-\var starpu_sched_component_properties::STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE
-     indicate that all workers have the same memory component
-
-\def STARPU_SCHED_COMPONENT_IS_HOMOGENEOUS
-\ingroup API_Modularized_Scheduler
-indicate if component is homogeneous
-\def STARPU_SCHED_COMPONENT_IS_SINGLE_MEMORY_NODE
-\ingroup API_Modularized_Scheduler
-indicate if all workers have the same memory component
-
-\struct starpu_sched_component
-\ingroup API_Modularized_Scheduler
-This structure represents a scheduler module.  A scheduler is a
-tree-like structure of them, some parts of a scheduler can be shared by
-several contexts to perform some local optimisations, so, for all
-components, a list of parents is defined by \c sched_ctx_id. They
-embed their specialised methods in a pseudo object-style, so calls are
-like <c>component->push_task(component,task)</c>
-
-\var struct starpu_sched_tree *starpu_sched_component::tree
-     The tree containing the component
-\var struct starpu_bitmap *starpu_sched_component::workers
-     this member contains the set of underlying workers
-\var starpu_sched_component::workers_in_ctx
-     this member contains the subset of starpu_sched_component::workers that is currently available in the context.
-     The push method should take this member into account.
-     this member is set with :
-     component->workers UNION tree->workers UNION
-     component->children[i]->workers_in_ctx iff there exists x such that component->children[i]->parents[x] == component
-\var void *starpu_sched_component::data
-     private data
-\var int starpu_sched_component::nchildren
-     the number of component's children
-\var struct starpu_sched_component **starpu_sched_component::children
-     the vector of component's children
-\var int starpu_sched_component::nparents
-     the number of component's parents
-\var struct starpu_sched_component **starpu_sched_component::parents
-     the vector of component's parents
-
-\var void(*starpu_sched_component::add_child)(struct starpu_sched_component *component, struct starpu_sched_component *child)
-     add a child to component
-\var void(*starpu_sched_component::remove_child)(struct starpu_sched_component *component, struct starpu_sched_component *child)
-     remove a child from component
-\var void(*starpu_sched_component::add_parent)(struct starpu_sched_component *component, struct starpu_sched_component *parent)
-     todo
-\var void(*starpu_sched_component::remove_parent)(struct starpu_sched_component *component, struct starpu_sched_component *parent)
-     todo
-
-\var int (*starpu_sched_component::push_task)(struct starpu_sched_component *, struct starpu_task *)
-     push a task in the scheduler module. this function is called to
-     push a task on component subtree, this can either perform a
-     recursive call on a child or store the task in the component,
-     then it will be returned by a further pull_task call.
-     the caller must ensure that component is able to execute task.
-     This method must either return 0 if the task was properly stored or
-     passed over to a child component, or return a value different from 0 if the
-     task could not be consumed (e.g. the queue is full).
-\var struct starpu_task * (*starpu_sched_component::pull_task)(struct starpu_sched_component *component, struct starpu_sched_component *to)
-     pop a task from the scheduler module. this function is called by workers to get a task from their
-     parents. this function should first return a locally stored task
-     or perform a recursive call on the parents.
-     the task returned by this function should be executable by the caller
-
-\var int (*starpu_sched_component::can_push)(struct starpu_sched_component *component, struct starpu_sched_component *to)
-     This function is called by a component which implements a queue,
-     allowing it to signify to its parents that an empty slot is
-     available in its queue. This should return 1 if some tasks could be pushed
-     The basic implementation of this function
-     is a recursive call to its parents, the user has to specify a
-     personally-made function to catch those calls.
-\var int (*starpu_sched_component::can_pull)(struct starpu_sched_component *component)
-     This function allows a component to wake up a worker. It is
-     currently called by a component which implements a queue, to
-     signify to its children that a task has been pushed in its local
-     queue, and is available to be popped by a worker, for example.
-     This should return 1 if some container or worker could (or will) pull
-     some tasks.
-     The basic implementation of this function is a recursive call to
-     its children, until at least one worker has been woken up.
-
-\var double (*starpu_sched_component::estimated_load)(struct starpu_sched_component *component)
-	is a heuristic to compute the load of a scheduler module. Basically the number of tasks divided by the sum
-	of relative speedups of workers available in context.
-	estimated_load(component) = sum(estimated_load(component_children)) + nb_local_tasks / average(relative_speedup(underlying_worker))
-\var starpu_sched_component::estimated_end
-	return the time when a worker will enter in starvation. This function is relevant only if the task->predicted
-	member has been set.
-
-\var void (*starpu_sched_component::deinit_data)(struct starpu_sched_component *component)
-	called by starpu_sched_component_destroy. Should free data allocated during creation
-\var void (*starpu_sched_component::notify_change_workers)(struct starpu_sched_component *component)
-	this function is called for each component when workers are added or removed from a context
-\var int starpu_sched_component::properties
-	todo
-\var hwloc_obj_t starpu_sched_component::obj
-	the hwloc object associated to the scheduler module. points to the
-	part of the topology that is bound to this component, e.g. a numa
-	node for a ws component that would balance load between
-	underlying sockets
-
-\struct starpu_sched_tree
-\ingroup API_Modularized_Scheduler
-The actual scheduler
-\var struct starpu_sched_component *starpu_sched_tree::root
-	this is the entry module of the scheduler
-\var struct starpu_bitmap *starpu_sched_tree::workers
-	this is the set of workers available in this context, this value is used to mask workers in modules
-\var unsigned starpu_sched_tree::sched_ctx_id
-	the context id of the scheduler
-\var starpu_pthread_mutex_t starpu_sched_tree::lock
-	this lock is used to protect the scheduler, it is taken in
-	read mode pushing a task and in write mode for adding or
-	removing workers
-
-@name Scheduling Tree API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_tree *starpu_sched_tree_create(unsigned sched_ctx_id)
-\ingroup API_Modularized_Scheduler
-	 create an empty initialized starpu_sched_tree
-
-\fn void starpu_sched_tree_destroy(struct starpu_sched_tree *tree)
-\ingroup API_Modularized_Scheduler
-	 destroy tree and free all non shared component in it.
-
-\fn void starpu_sched_tree_update_workers(struct starpu_sched_tree *t)
-\ingroup API_Modularized_Scheduler
-	 recursively set all starpu_sched_component::workers, do not take into account shared parts (except workers).
-
-\fn void starpu_sched_tree_update_workers_in_ctx(struct starpu_sched_tree *t)
-\ingroup API_Modularized_Scheduler
-	 recursively set all starpu_sched_component::workers_in_ctx, do not take into account shared parts (except workers)
-
-\fn int starpu_sched_tree_push_task(struct starpu_task *task)
-\ingroup API_Modularized_Scheduler
-	 compatibility with starpu_sched_policy interface
-
-\fn struct starpu_task *starpu_sched_tree_pop_task(unsigned sched_ctx)
-\ingroup API_Modularized_Scheduler
-	 compatibility with starpu_sched_policy interface
-
-\fn void starpu_sched_tree_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
-\ingroup API_Modularized_Scheduler
-	 compatibility with starpu_sched_policy interface
-
-\fn void starpu_sched_tree_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
-\ingroup API_Modularized_Scheduler
-	 compatibility with starpu_sched_policy interface
-
-\fn void starpu_sched_component_connect(struct starpu_sched_component *parent, struct starpu_sched_component *child)
-\ingroup API_Modularized_Scheduler
-	 Attaches component \p child to parent \p parent. Some component may accept only one child, others accept several (e.g. MCT)
-
-@name Generic Scheduling Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_create(struct starpu_sched_tree *tree, const char *name)
-\ingroup API_Modularized_Scheduler
-	 allocate and initialize component fields with default values :
-	.pop_task make recursive call on father
-	.estimated_load compute relative speedup and tasks in sub tree
-	.estimated_end return the minimum of recursive call on children
-	.add_child is starpu_sched_component_add_child
-	.remove_child is starpu_sched_component_remove_child
-	.notify_change_workers does nothing
-	.deinit_data does nothing
-
-\fn void starpu_sched_component_destroy(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 free data allocated by starpu_sched_component_create and call component->deinit_data(component)
-	 set to <c>NULL</c> the member starpu_sched_component::fathers[sched_ctx_id] of all children if it is equal to \p component
-
-\fn void starpu_sched_component_destroy_rec(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 recursively destroy non shared parts of a \p component 's tree
-
-\fn int starpu_sched_component_can_execute_task(struct starpu_sched_component *component, struct starpu_task *task)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component can execute \p task, this function takes into account the workers available in the scheduling context
-
-\fn int starpu_sched_component_execute_preds(struct starpu_sched_component *component, struct starpu_task *task, double *length)
-\ingroup API_Modularized_Scheduler
-	 return a non <c>NULL</c> value if \p component can execute \p task.
-	 write the execution prediction length for the best implementation of the best worker available and write this at \p length address.
-	 this result is more relevant if starpu_sched_component::is_homogeneous is non <c>NULL</c>.
-	 if a worker need to be calibrated for an implementation, nan is set to \p length.
-
-\fn double starpu_sched_component_transfer_length(struct starpu_sched_component *component, struct starpu_task *task)
-\ingroup API_Modularized_Scheduler
-	 return the average time to transfer \p task data to underlying \p component workers.
-
-@name Worker Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_worker_get(unsigned sched_ctx, int workerid)
-\ingroup API_Modularized_Scheduler
-	 return the struct starpu_sched_component corresponding to \p workerid. Undefined if \p workerid is not a valid workerid
-
-\fn struct starpu_sched_component *starpu_sched_component_parallel_worker_create(struct starpu_sched_tree *tree, unsigned nworkers, unsigned *workers)
-\ingroup API_Modularized_Scheduler
-	 Create a combined worker that pushes tasks in parallel to workers \p workers (size \p nworkers).
-
-\fn int starpu_sched_component_worker_get_workerid(struct starpu_sched_component *worker_component)
-\ingroup API_Modularized_Scheduler
-	 return the workerid of \p worker_component, undefined if starpu_sched_component_is_worker(worker_component) == 0
-
-\fn int starpu_sched_component_is_worker(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component is a worker component
-
-\fn int starpu_sched_component_is_simple_worker(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component is a simple worker component
-
-\fn int starpu_sched_component_is_combined_worker(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component is a combined worker component
-
-\fn void starpu_sched_component_worker_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
-\ingroup API_Modularized_Scheduler
-	 compatibility with starpu_sched_policy interface
-	 update predictions for workers
-
-\fn void starpu_sched_component_worker_post_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
-\ingroup API_Modularized_Scheduler
-	 compatibility with starpu_sched_policy interface
-
-@name Flow-control Fifo Component API
-\ingroup API_Modularized_Scheduler
-
-\fn int starpu_sched_component_can_push(struct starpu_sched_component * component, struct starpu_sched_component * to)
-\ingroup API_Modularized_Scheduler
-default function for the can_push component method, just calls can_push of parents until one of them returns non-zero
-
-\fn int starpu_sched_component_can_pull(struct starpu_sched_component * component)
-\ingroup API_Modularized_Scheduler
-default function for the can_pull component method, just calls can_pull of children until one of them returns non-zero
-
-\fn int starpu_sched_component_can_pull_all(struct starpu_sched_component * component)
-\ingroup API_Modularized_Scheduler
-function for the can_pull component method, calls can_pull of all children
-
-\fn double starpu_sched_component_estimated_load(struct starpu_sched_component * component);
-\ingroup API_Modularized_Scheduler
-default function for the estimated_load component method, just sums up the loads
-of the children of the component.
-
-\fn double starpu_sched_component_estimated_end_min(struct starpu_sched_component * component);
-\ingroup API_Modularized_Scheduler
-function that can be used for the estimated_end component method, which just computes the minimum completion time of the children.
-
-\fn double starpu_sched_component_estimated_end_min_add(struct starpu_sched_component * component, double exp_len);
-\ingroup API_Modularized_Scheduler
-function that can be used for the estimated_end component method, which computes
-the minimum completion time of the children, and adds to it an estimation of how
-existing queued work, plus the exp_len work, can be completed. This is typically
-used instead of starpu_sched_component_estimated_end_min when the component
-contains a queue of tasks, which thus needs to be added to the estimations.
-
-\fn double starpu_sched_component_estimated_end_average(struct starpu_sched_component * component);
-\ingroup API_Modularized_Scheduler
-default function for the estimated_end component method, which just computes the average completion time of the children.
-
-
-\struct starpu_sched_component_fifo_data
-\ingroup API_Modularized_Scheduler
-\var unsigned starpu_sched_component_fifo_data::ntasks_threshold
-todo
-\var double starpu_sched_component_fifo_data::exp_len_threshold
-todo
-
-\fn struct starpu_sched_component *starpu_sched_component_fifo_create(struct starpu_sched_tree *tree, struct starpu_sched_component_fifo_data *fifo_data)
-\ingroup API_Modularized_Scheduler
-	 Return a struct starpu_sched_component with a fifo. A stable sort is performed according to tasks priorities.
-	 A push_task call on this component does not perform recursive calls, underlying components will have to call pop_task to get it.
-	 starpu_sched_component::estimated_end function compute the estimated length by dividing the sequential length by the number of underlying workers.
-
-\fn int starpu_sched_component_is_fifo(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component is a fifo component
-
-@name Flow-control Prio Component API
-\ingroup API_Modularized_Scheduler
-
-\struct starpu_sched_component_prio_data
-\ingroup API_Modularized_Scheduler
-\var unsigned starpu_sched_component_prio_data::ntasks_threshold
-todo
-\var double starpu_sched_component_prio_data::exp_len_threshold
-todo
-
-\fn struct starpu_sched_component *starpu_sched_component_prio_create(struct starpu_sched_tree *tree, struct starpu_sched_component_prio_data *prio_data)
-\ingroup API_Modularized_Scheduler
-todo
-
-\fn int starpu_sched_component_is_prio(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-todo
-
-@name Resource-mapping Work-Stealing Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_work_stealing_create(struct starpu_sched_tree *tree, void *arg)
-\ingroup API_Modularized_Scheduler
-	 return a component that performs work-stealing scheduling. Tasks are pushed in a round-robin way. estimated_end returns the average of the expected length of fifos, starting at the average of the expected_end of its children. When a worker has to steal a task, it steals a task in a round-robin way, and gets the last pushed task of the highest priority.
-
-\fn int starpu_sched_tree_work_stealing_push_task(struct starpu_task *task)
-\ingroup API_Modularized_Scheduler
-	 undefined if there is no work stealing component in the scheduler. If any, \p task is pushed in a default way if the caller is the application, and in the caller's fifo if it is a worker.
-
-\fn int starpu_sched_component_is_work_stealing(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component is a work stealing component
-
-@name Resource-mapping Random Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_random_create(struct starpu_sched_tree *tree, void *arg)
-\ingroup API_Modularized_Scheduler
-	 create a component that perform a random scheduling
-
-\fn int starpu_sched_component_is_random(struct starpu_sched_component *)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component is a random component
-
-@name Resource-mapping Eager Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_eager_create(struct starpu_sched_tree *tree, void *arg)
-\ingroup API_Modularized_Scheduler
-todo
-
-\fn int starpu_sched_component_is_eager(struct starpu_sched_component *)
-\ingroup API_Modularized_Scheduler
-todo
-
-@name Resource-mapping Eager-Calibration Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_eager_calibration_create(struct starpu_sched_tree *tree, void *arg)
-\ingroup API_Modularized_Scheduler
-todo
-
-\fn int starpu_sched_component_is_eager_calibration(struct starpu_sched_component *)
-\ingroup API_Modularized_Scheduler
-todo
-
-@name Resource-mapping MCT Component API
-\ingroup API_Modularized_Scheduler
-
-\struct starpu_sched_component_mct_data
-\ingroup API_Modularized_Scheduler
-\var double starpu_sched_component_mct_data::alpha
-todo
-\var double starpu_sched_component_mct_data::beta
-todo
-\var double starpu_sched_component_mct_data::_gamma
-todo
-\var double starpu_sched_component_mct_data::idle_power
-todo
-
-\fn struct starpu_sched_component *starpu_sched_component_mct_create(struct starpu_sched_tree *tree, struct starpu_sched_component_mct_data *mct_data)
-\ingroup API_Modularized_Scheduler
-create a component with mct_data parameters. the mct component doesn't
-do anything but pushing tasks on no_perf_model_component and
-calibrating_component
-
-\fn int starpu_sched_component_is_mct(struct starpu_sched_component *component);
-\ingroup API_Modularized_Scheduler
-todo
-
-@name Resource-mapping Heft Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_heft_create(struct starpu_sched_tree *tree, struct starpu_sched_component_mct_data *mct_data)
-\ingroup API_Modularized_Scheduler
-	 this component perform a heft scheduling
-
-\fn int starpu_sched_component_is_heft(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-	 return true iff \p component is a heft component
-
-@name Special-purpose Best_Implementation Component API
-\ingroup API_Modularized_Scheduler
-
-\fn struct starpu_sched_component *starpu_sched_component_best_implementation_create(struct starpu_sched_tree *tree, void *arg)
-\ingroup API_Modularized_Scheduler
-	 Select the implementation that offer the shortest computation length for the first worker that can execute the task.
-	 Or an implementation that need to be calibrated.
-	 Also set starpu_task::predicted and starpu_task::predicted_transfer for memory component of the first suitable workerid.
-	 If starpu_sched_component::push method is called and starpu_sched_component::nchild > 1 the result is undefined.
-
-@name Special-purpose Perfmodel_Select Component API
-\ingroup API_Modularized_Scheduler
-
-\struct starpu_sched_component_perfmodel_select_data
-\ingroup API_Modularized_Scheduler
-\var struct starpu_sched_component *starpu_sched_component_perfmodel_select_data::calibrator_component
-todo
-\var struct starpu_sched_component *starpu_sched_component_perfmodel_select_data::no_perfmodel_component
-todo
-\var struct starpu_sched_component *starpu_sched_component_perfmodel_select_data::perfmodel_component
-todo
-
-\fn struct starpu_sched_component *starpu_sched_component_perfmodel_select_create(struct starpu_sched_tree *tree, struct starpu_sched_component_perfmodel_select_data *perfmodel_select_data)
-\ingroup API_Modularized_Scheduler
-todo
-
-\fn int starpu_sched_component_is_perfmodel_select(struct starpu_sched_component *component)
-\ingroup API_Modularized_Scheduler
-todo
-
-@name Recipe Component API
-\ingroup API_Modularized_Scheduler
-
-\struct starpu_sched_component_composed_recipe
-\ingroup API_Modularized_Scheduler
-	parameters for starpu_sched_component_composed_component_create
-
-\fn struct starpu_sched_component_composed_recipe *starpu_sched_component_composed_recipe_create(void)
-\ingroup API_Modularized_Scheduler
-	 return an empty recipe for a composed component, it should not be used without modification
-
-\fn struct starpu_sched_component_composed_recipe *starpu_sched_component_composed_recipe_create_singleton(struct starpu_sched_component *(*create_component)(struct starpu_sched_tree *tree, void *arg), void *arg)
-\ingroup API_Modularized_Scheduler
-	 return a recipe to build a composed component with a \p create_component
-
-\fn void starpu_sched_component_composed_recipe_add(struct starpu_sched_component_composed_recipe *recipe, struct starpu_sched_component *(*create_component)(struct starpu_sched_tree *tree, void *arg), void *arg)
-\ingroup API_Modularized_Scheduler
-	 add \p create_component under all previous components in recipe
-
-\fn void starpu_sched_component_composed_recipe_destroy(struct starpu_sched_component_composed_recipe *)
-\ingroup API_Modularized_Scheduler
-	 destroy composed_sched_component, this should be done after starpu_sched_component_composed_component_create was called
-
-\fn struct starpu_sched_component *starpu_sched_component_composed_component_create(struct starpu_sched_tree *tree, struct starpu_sched_component_composed_recipe *recipe)
-\ingroup API_Modularized_Scheduler
-	 create a component that behaves as if all components of recipe were linked. Except that you can't use starpu_sched_component_is_foo function
-	 if recipe contains a single create_foo arg_foo pair, create_foo(arg_foo) is returned instead of a composed component
-
-\struct starpu_sched_component_specs
-\ingroup API_Modularized_Scheduler
-	 Define how to build a scheduler according to topology. Each level (except for hwloc_machine_composed_sched_component) can be <c>NULL</c>, then
-	 the level is just skipped. Bugs everywhere, do not rely on.
-\var struct starpu_sched_component_composed_recipe *starpu_sched_specs::hwloc_machine_composed_sched_component
-     the composed component to put on the top of the scheduler
-     this member must not be <c>NULL</c> as it is the root of the topology
-\var struct starpu_sched_component_composed_recipe *starpu_sched_specs::hwloc_component_composed_sched_component
-     the composed component to put for each memory component
-\var struct starpu_sched_component_composed_recipe *starpu_sched_specs::hwloc_socket_composed_sched_component
-     the composed component to put for each socket
-\var struct starpu_sched_component_composed_recipe *starpu_sched_specs::hwloc_cache_composed_sched_component
-     the composed component to put for each cache
-\var struct starpu_sched_component_composed_recipe *(*starpu_sched_specs::worker_composed_sched_component)(enum starpu_worker_archtype archtype)
-     a function that return a starpu_sched_component_composed_recipe to put on top of a worker of type \p archtype.
-     <c>NULL</c> is a valid return value, then no component will be added on top
-\var starpu_sched_specs::mix_heterogeneous_workers
-     this flag is a dirty hack because of the poor expressivity of this interface. As an example, if you want to build
-     a heft component with a fifo component per numa component, and you also have GPUs, if this flag is set, GPUs will share those fifos.
-     If this flag is not set, a new fifo will be built for each of them (if they have the same starpu_perf_arch and the same
-     numa component it will be shared). it indicates if heterogeneous workers should be brothers or cousins, as an example, if a gpu and a cpu should share or not their numa node
-
-\fn struct starpu_sched_tree *starpu_sched_component_make_scheduler(unsigned sched_ctx_id, struct starpu_sched_component_specs s)
-\ingroup API_Modularized_Scheduler
-	 this function build a scheduler for \p sched_ctx_id according to \p s and the hwloc topology of the machine.
-
-\def STARPU_SCHED_SIMPLE_DECIDE_WORKERS
-\ingroup API_Modularized_Scheduler
-Request to create downstream queues per worker, i.e. the scheduling decision-making component will choose exactly which workers tasks should go to.
-
-\def STARPU_SCHED_SIMPLE_COMBINED_WORKERS
-\ingroup API_Modularized_Scheduler
-Request to not only choose between simple workers, but also choose between combined workers.
-
-\def STARPU_SCHED_SIMPLE_DECIDE_MEMNODES
-\ingroup API_Modularized_Scheduler
-Request to create downstream queues per memory nodes, i.e. the scheduling decision-making component will choose which memory node tasks will go to.
-
-\def STARPU_SCHED_SIMPLE_DECIDE_ARCHS
-\ingroup API_Modularized_Scheduler
-Request to create downstream queues per computation arch, i.e. the scheduling decision-making component will choose whether tasks go to CPUs, or CUDA, or OpenCL, etc.
-
-\def STARPU_SCHED_SIMPLE_PERFMODEL
-\ingroup API_Modularized_Scheduler
-Request to add a perfmodel selector above the scheduling decision-making component. That way, only tasks with a calibrated performance model will be given to the component, other tasks will go to an eager branch that will distribute tasks so that their performance models will get calibrated.
-
-In other words, this is needed when using a component which needs performance models for tasks.
-
-\def STARPU_SCHED_SIMPLE_FIFO_ABOVE
-\ingroup API_Modularized_Scheduler
-Request to create a fifo above the scheduling decision-making component, otherwise tasks will be pushed directly to the component.
-
-This is useful to store tasks if there is a fifo below which limits the number of tasks to be scheduled in advance. The scheduling decision-making component can also store tasks itself, in which case this flag is not useful.
-
-\def STARPU_SCHED_SIMPLE_FIFO_ABOVE_PRIO
-\ingroup API_Modularized_Scheduler
-Request that the fifo above be sorted by priorities
-
-\def STARPU_SCHED_SIMPLE_FIFOS_BELOW
-\ingroup API_Modularized_Scheduler
-Request to create fifos below the scheduling decision-making component, otherwise tasks will be pulled directly from workers.
-
-This is useful to be able to schedule a (tunable) small number of tasks in advance only.
-
-\def STARPU_SCHED_SIMPLE_FIFOS_BELOW_PRIO
-\ingroup API_Modularized_Scheduler
-Request that the fifos below be sorted by priorities
-
-\def STARPU_SCHED_SIMPLE_WS_BELOW
-\ingroup API_Modularized_Scheduler
-Request that work between workers using the same fifo below be distributed using a work stealing component.
-
-\def STARPU_SCHED_SIMPLE_IMPL
-\ingroup API_Modularized_Scheduler
-Request that a component be added just above workers, that chooses the best task implementation.
-
-\fn void starpu_sched_component_initialize_simple_scheduler(starpu_sched_component_create_t create_decision_component, void *data, unsigned flags, unsigned sched_ctx_id)
-\ingroup API_Modularized_Scheduler
-This creates a simple modular scheduler tree around a scheduling decision-making
-component \p component. The details of what should be built around \p component
-are described by \p flags. The different STARPU_SCHED_SIMPLE_DECIDE_* flags are
-mutually exclusive. \p data is passed to the \p create_decision_component
-function when creating the decision component.
-
-\fn int starpu_sched_component_push_task(struct starpu_sched_component *from, struct starpu_sched_component *to, struct starpu_task *task)
-\ingroup API_Modularized_Scheduler
-Push a task to a component. This is a helper for <c>component->push_task(component, task)</c> plus tracing.
-
-\fn struct starpu_task *starpu_sched_component_pull_task(struct starpu_sched_component *from, struct starpu_sched_component *to)
-\ingroup API_Modularized_Scheduler
-Pull a task from a component. This is a helper for <c>component->pull_task(component)</c> plus tracing.
-
-*/

+ 0 - 349
doc/doxygen/chapters/api/performance_model.doxy

@@ -1,349 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2011-2013,2016                           Inria
- * Copyright (C) 2010-2017                                CNRS
- * Copyright (C) 2009-2011,2013-2017                      Université de Bordeaux
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \defgroup API_Performance_Model Performance Model
-
-\enum starpu_perfmodel_type
-\ingroup API_Performance_Model
-TODO
-\var starpu_perfmodel_type::STARPU_PERFMODEL_INVALID
-    todo
-\var starpu_perfmodel_type::STARPU_PER_ARCH
-    Application-provided per-arch cost model function
-\var starpu_perfmodel_type::STARPU_COMMON
-    Application-provided common cost model function, with per-arch
-    factor
-\var starpu_perfmodel_type::STARPU_HISTORY_BASED
-    Automatic history-based cost model
-\var starpu_perfmodel_type::STARPU_REGRESSION_BASED
-    Automatic linear regression-based cost model  (alpha * size ^
-    beta)
-\var starpu_perfmodel_type::STARPU_NL_REGRESSION_BASED
-    Automatic non-linear regression-based cost model (a * size ^ b +
-    c)
-\var starpu_perfmodel_type::STARPU_MULTIPLE_REGRESSION_BASED
-    Automatic multiple linear regression-based cost model. Application
-    provides parameters, their combinations and exponents.
-
-\struct starpu_perfmodel_device
-todo
-\ingroup API_Performance_Model
-\var enum starpu_worker_archtype starpu_perfmodel_device::type
-    type of the device
-\var int starpu_perfmodel_device::devid
-    identifier of the precise device
-\var int starpu_perfmodel_device::ncore
-    number of execution in parallel, minus 1
-
-\struct starpu_perfmodel_arch
-todo
-\ingroup API_Performance_Model
-\var int starpu_perfmodel_arch::ndevices
-    number of the devices for the given arch
-\var struct starpu_perfmodel_device *starpu_perfmodel_arch::devices
-    list of the devices for the given arch
-
-\struct starpu_perfmodel
-Contain all information about a performance model. At least the
-type and symbol fields have to be filled when defining a performance
-model for a codelet. For compatibility, make sure to initialize the
-whole structure to zero, either by using explicit memset, or by
-letting the compiler implicitly do it in e.g. static storage case. If
-not provided, other fields have to be zero.
-\ingroup API_Performance_Model
-\var enum starpu_perfmodel_type starpu_perfmodel::type
-    type of performance model
-    <ul>
-    <li>
-    ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED,
-    ::STARPU_NL_REGRESSION_BASED: No other field needs to be
-    provided, this is purely history-based.
-    </li>
-    <li>
-    ::STARPU_MULTIPLE_REGRESSION_BASED: Need to provide fields
-    starpu_perfmodel::nparameters (number of different parameters),
-    starpu_perfmodel::ncombinations (number of parameters
-    combinations-tuples) and table starpu_perfmodel::combinations
-    which defines exponents of the equation. Function cl_perf_func
-    also needs to define how to extract parameters from the task. 
-    </li>
-    <li>
-    ::STARPU_PER_ARCH: either field
-    starpu_perfmodel::arch_cost_function has to be filled with a
-    function that returns the cost in micro-seconds on the arch given
-    as parameter, or field starpu_perfmodel::per_arch has to be filled
-    with functions which return the cost in micro-seconds.
-    </li>
-    <li>
-    ::STARPU_COMMON: field starpu_perfmodel::cost_function has to be
-    filled with a function that returns the cost in micro-seconds on a
-    CPU, timing on other archs will be determined by multiplying by an
-    arch-specific factor.
-    </li>
-    </ul>
-\var const char *starpu_perfmodel::symbol
-    symbol name for the performance model, which will be used as file
-    name to store the model. It must be set otherwise the model will
-    be ignored.
-\var double (*starpu_perfmodel::cost_function)(struct starpu_task *, unsigned nimpl)
-    Used by ::STARPU_COMMON. Take a task and implementation number,
-    and must return a task duration estimation in micro-seconds.
-\var double (*starpu_perfmodel::arch_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch* arch, unsigned nimpl)
-    Used by ::STARPU_PER_ARCH. Take a task, an arch and implementation
-    number, and must return a task duration estimation in
-    micro-seconds on that arch.
-\var size_t (*starpu_perfmodel::size_base)(struct starpu_task *, unsigned nimpl)
-    Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and
-    ::STARPU_NL_REGRESSION_BASED. If not <c>NULL</c>, take a task and
-    implementation number, and return the size to be used as index to
-    distinguish histories and as a base for regressions.
-\var uint32_t (*starpu_perfmodel::footprint)(struct starpu_task *)
-    Used by ::STARPU_HISTORY_BASED. If not <c>NULL</c>, take a task
-    and return the footprint to be used as index to distinguish
-    histories. The default is to use the starpu_task_data_footprint()
-    function.
-\var unsigned starpu_perfmodel::is_loaded
-\private
-    Whether the performance model is already loaded from the disk.
-\var unsigned starpu_perfmodel::benchmarking
-\private
-    todo
-\var unsigned starpu_perfmodel::is_init
-    todo
-\var starpu_perfmodel_state_t starpu_perfmodel::state
-\private
-    todo
-\var void (*starpu_perfmodel::parameters)(struct starpu_task * task, double *parameters);
-    todo
-\var const char ** starpu_perfmodel::parameters_names
-\private
-    Names of parameters used for multiple linear regression models (M,
-    N, K)
-\var unsigned starpu_perfmodel::nparameters
-\private
-    Number of parameters used for multiple linear regression models
-\var unsigned ** starpu_perfmodel::combinations
-\private
-    Table of combinations of parameters (and the exponents) used for
-    multiple linear regression models
-\var unsigned starpu_perfmodel::ncombinations
-\private
-    Number of combination of parameters used for multiple linear
-    regression models
-
-\struct starpu_perfmodel_regression_model
-todo
-\ingroup API_Performance_Model
-\var double starpu_perfmodel_regression_model::sumlny
-    sum of ln(measured)
-\var double starpu_perfmodel_regression_model::sumlnx
-    sum of ln(size)
-\var double starpu_perfmodel_regression_model::sumlnx2
-    sum of ln(size)^2
-\var unsigned long starpu_perfmodel_regression_model::minx
-    minimum size
-\var unsigned long starpu_perfmodel_regression_model::maxx
-    maximum size
-\var double starpu_perfmodel_regression_model::sumlnxlny
-    sum of ln(size)*ln(measured)
-\var double starpu_perfmodel_regression_model::alpha
-    estimated = alpha * size ^ beta
-\var double starpu_perfmodel_regression_model::beta
-    estimated = alpha * size ^ beta
-\var unsigned starpu_perfmodel_regression_model::valid
-    whether the linear regression model is valid (i.e. enough measures)
-\var double starpu_perfmodel_regression_model::a
-    estimated = a size ^b + c
-\var double starpu_perfmodel_regression_model::b
-    estimated = a size ^b + c
-\var double starpu_perfmodel_regression_model::c
-    estimated = a size ^b + c
-\var unsigned starpu_perfmodel_regression_model::nl_valid
-    whether the non-linear regression model is valid (i.e. enough measures)
-\var unsigned starpu_perfmodel_regression_model::nsample
-    number of sample values for non-linear regression
-\var double starpu_perfmodel_regression_model::coeff[]
-    list of computed coefficients for multiple linear regression model
-\var double starpu_perfmodel_regression_model::ncoeff
-    number of coefficients for multiple linear regression model
-\var double starpu_perfmodel_regression_model::multi_valid
-    whether the multiple linear regression model is valid
-
-\struct starpu_perfmodel_per_arch
-contains information about the performance model of a given
-arch.
-\ingroup API_Performance_Model
-\var starpu_perfmodel_per_arch_cost_function starpu_perfmodel_per_arch::cost_function
-    Used by ::STARPU_PER_ARCH, must point to functions which take a
-    task, the target arch and implementation number (as mere
-    conveniency, since the array is already indexed by these), and
-    must return a task duration estimation in micro-seconds.
-\var starpu_perfmodel_per_arch_size_base starpu_perfmodel_per_arch::size_base
-    Same as in structure starpu_perfmodel, but per-arch, in case it
-    depends on the architecture-specific implementation.
-\var struct starpu_perfmodel_history_table *starpu_perfmodel_per_arch::history
-\private
-    The history of performance measurements.
-\var struct starpu_perfmodel_history_list *starpu_perfmodel_per_arch::list
-\private
-    Used by ::STARPU_HISTORY_BASED, ::STARPU_NL_REGRESSION_BASED and
-    ::STARPU_MULTIPLE_REGRESSION_BASED, records all execution history
-    measures.
-\var struct starpu_perfmodel_regression_model starpu_perfmodel_per_arch::regression
-\private
-    Used by ::STARPU_REGRESSION_BASED, ::STARPU_NL_REGRESSION_BASED
-    and ::STARPU_MULTIPLE_REGRESSION_BASED, contains the estimated
-    factors of the regression.
-
-\struct starpu_perfmodel_history_list
-todo
-\ingroup API_Performance_Model
-\var struct starpu_perfmodel_history_list *starpu_perfmodel_history_list::next
-    todo
-\var struct starpu_perfmodel_history_entry *starpu_perfmodel_history_list::entry
-    todo
-
-\struct starpu_perfmodel_history_entry
-todo
-\ingroup API_Performance_Model
-\var double starpu_perfmodel_history_entry::mean
-    mean_n = 1/n sum
-\var double starpu_perfmodel_history_entry::deviation
-    n dev_n = sum2 - 1/n (sum)^2
-\var double starpu_perfmodel_history_entry::sum
-    sum of samples (in µs)
-\var double starpu_perfmodel_history_entry::sum2
-    sum of samples^2
-\var unsigned starpu_perfmodel_history_entry::nsample
-    number of samples
-\var uint32_t starpu_perfmodel_history_entry::footprint
-    data footprint
-\var size_t starpu_perfmodel_history_entry::size
-    in bytes
-\var double starpu_perfmodel_history_entry::flops
-    Provided by the application
-
-\fn void starpu_perfmodel_init(struct starpu_perfmodel *model)
-\ingroup API_Performance_Model
-todo
-
-\fn void starpu_perfmodel_free_sampling_directories(void)
-\ingroup API_Performance_Model
-Free internal memory used for sampling directory
-management. It should only be called by an application which does not
-call starpu_shutdown(), since starpu_shutdown() already calls it. See for
-example <c>tools/starpu_perfmodel_display.c</c>.
-
-\fn int starpu_perfmodel_load_file(const char *filename, struct starpu_perfmodel *model)
-\ingroup API_Performance_Model
-Load the performance model found in the file named \p filename. \p model has to be
-completely zero, and will be filled with the information stored in the given file.
-
-\fn int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model)
-\ingroup API_Performance_Model
-Load a given performance model. \p model has to be
-completely zero, and will be filled with the information stored in
-<c>$STARPU_HOME/.starpu</c>. The function is intended to be used by
-external tools that want to read the performance model files.
-
-\fn int starpu_perfmodel_unload_model(struct starpu_perfmodel *model)
-\ingroup API_Performance_Model
-Unload \p model which has been previously loaded
-through the function starpu_perfmodel_load_symbol()
-
-\fn void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, char *path, size_t maxlen, unsigned nimpl)
-\ingroup API_Performance_Model
-Return the path to the debugging information for the performance model.
-
-\fn char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype)
-\ingroup API_Performance_Model
-todo
-
-\fn void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch *arch, char *archname, size_t maxlen, unsigned nimpl)
-\ingroup API_Performance_Model
-Return the architecture name for \p arch
-
-\fn struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid, unsigned sched_ctx_id)
-\ingroup API_Performance_Model
-Return the architecture type of the worker \p workerid.
-
-\fn void starpu_perfmodel_initialize(void)
-\ingroup API_Performance_Model
-If starpu_init is not used, starpu_perfmodel_initialize should be used before calling starpu_perfmodel_* functions.
-
-\fn int starpu_perfmodel_list(FILE *output)
-\ingroup API_Performance_Model
-Print a list of all performance models on \p output
-
-\fn void starpu_perfmodel_directory(FILE *output)
-\ingroup API_Performance_Model
-Print the directory name storing performance models on \p output
-
-\fn void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
-\ingroup API_Performance_Model
-todo
-
-\fn int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output)
-\ingroup API_Performance_Model
-todo
-
-\fn int starpu_perfmodel_print_estimations(struct starpu_perfmodel *model, uint32_t footprint, FILE *output)
-\ingroup API_Performance_Model
-todo
-
-\fn void starpu_bus_print_bandwidth(FILE *f)
-\ingroup API_Performance_Model
-Print a matrix of bus bandwidths on \p f.
-
-\fn void starpu_bus_print_affinity(FILE *f)
-\ingroup API_Performance_Model
-Print the affinity devices on \p f.
-
-\fn void starpu_bus_print_filenames(FILE *f)
-\ingroup API_Performance_Model
-Print on \p f the name of the files containing the matrix of bus bandwidths, the affinity devices and the latency.
-
-\fn void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double measured);
-\ingroup API_Performance_Model
-Feed the performance model \p model with an explicit
-measurement \p measured (in µs), in addition to measurements done by StarPU
-itself. This can be useful when the application already has an
-existing set of measurements done in good conditions, that StarPU
-could benefit from instead of doing on-line measurements. An example
-of use can be seen in \ref PerformanceModelExample.
-
-\fn double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
-\ingroup API_Performance_Model
-Return the bandwidth of data transfer between two memory nodes
-
-\fn double starpu_transfer_latency(unsigned src_node, unsigned dst_node)
-\ingroup API_Performance_Model
-Return the latency of data transfer between two memory nodes
-
-\fn double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size)
-\ingroup API_Performance_Model
-Return the estimated time to transfer a given size between two memory nodes.
-
-\fn double starpu_perfmodel_history_based_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, uint32_t footprint)
-\ingroup API_Performance_Model
-Return the estimated time of a task with the given model and the given footprint.
-
-\var starpu_perfmodel_nop
-Performance model which always returns 1µs.
-
-*/

+ 0 - 202
doc/doxygen/chapters/api/profiling.doxy

@@ -1,202 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010-2015,2017                           CNRS
- * Copyright (C) 2009-2011,2014,2016,2018-2019            Université de Bordeaux
- * Copyright (C) 2011,2012                                Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \defgroup API_Profiling Profiling
-
-\struct starpu_profiling_task_info
-\ingroup API_Profiling
-This structure contains information about the execution of a
-task. It is accessible from the field starpu_task::profiling_info if
-profiling was enabled.
-\var struct timespec starpu_profiling_task_info::submit_time
-    Date of task submission (relative to the initialization of StarPU).
-
-\var struct timespec starpu_profiling_task_info::push_start_time
-    Time when the task was submitted to the scheduler.
-
-\var struct timespec starpu_profiling_task_info::push_end_time
-    Time when the scheduler finished with the task submission.
-
-\var struct timespec starpu_profiling_task_info::pop_start_time
-    Time when the scheduler started to be requested for a task, and eventually gave that task.
-
-\var struct timespec starpu_profiling_task_info::pop_end_time
-    Time when the scheduler finished providing the task for execution.
-
-\var struct timespec starpu_profiling_task_info::acquire_data_start_time
-    Time when the worker started fetching input data.
-
-\var struct timespec starpu_profiling_task_info::acquire_data_end_time
-    Time when the worker finished fetching input data.
-
-\var struct timespec starpu_profiling_task_info::start_time
-    Date of task execution beginning (relative to the initialization of StarPU).
-
-\var struct timespec starpu_profiling_task_info::end_time
-    Date of task execution termination (relative to the initialization of StarPU).
-
-\var struct timespec starpu_profiling_task_info::release_data_start_time
-    Time when the worker started releasing data.
-
-\var struct timespec starpu_profiling_task_info::release_data_end_time
-    Time when the worker finished releasing data.
-
-\var struct timespec starpu_profiling_task_info::callback_start_time
-    Time when the worker started the application callback for the task.
-
-\var struct timespec starpu_profiling_task_info::callback_end_time
-    Time when the worker finished the application callback for the task.
-
-\var int starpu_profiling_task_info::workerid
-    Identifier of the worker which has executed the task.
-
-\var uint64_t starpu_profiling_task_info::used_cycles
-    Number of cycles used by the task, only available in the MoviSim
-
-\var uint64_t starpu_profiling_task_info::stall_cycles
-    Number of cycles stalled within the task, only available in the MoviSim
-
-\var double starpu_profiling_task_info::energy_consumed
-Energy consumed by the task, in Joules
-
-\struct starpu_profiling_worker_info
-This structure contains the profiling information associated to
-a worker. The timing is provided since the previous call to
-starpu_profiling_worker_get_info()
-\ingroup API_Profiling
-\var struct timespec starpu_profiling_worker_info::start_time
-        Starting date for the reported profiling measurements.
-\var struct timespec starpu_profiling_worker_info::total_time
-        Duration of the profiling measurement interval.
-\var struct timespec starpu_profiling_worker_info::executing_time
-        Time spent by the worker to execute tasks during the profiling measurement interval.
-\var struct timespec starpu_profiling_worker_info::sleeping_time
-        Time spent idling by the worker during the profiling measurement interval.
-\var int starpu_profiling_worker_info::executed_tasks
-        Number of tasks executed by the worker during the profiling measurement interval.
-\var uint64_t starpu_profiling_worker_info::used_cycles
-        Number of cycles used by the worker, only available in the MoviSim
-\var uint64_t starpu_profiling_worker_info::stall_cycles
-        Number of cycles stalled within the worker, only available in the MoviSim
-\var double starpu_profiling_worker_info::energy_consumed
-        Energy consumed by the worker, in Joules
-
-\struct starpu_profiling_bus_info
-todo
-\ingroup API_Profiling
-\var struct timespec starpu_profiling_bus_info::start_time
-        Time of bus profiling startup.
-\var struct timespec starpu_profiling_bus_info::total_time
-        Total time of bus profiling.
-\var int long long starpu_profiling_bus_info::transferred_bytes
-        Number of bytes transferred during profiling.
-\var int starpu_profiling_bus_info::transfer_count
-        Number of transfers during profiling.
-
-\typedef STARPU_PROFILING_DISABLE
-\ingroup API_Profiling
-Used when calling the function starpu_profiling_status_set() to disable profiling.
-
-\typedef STARPU_PROFILING_ENABLE
-\ingroup API_Profiling
-Used when calling the function starpu_profiling_status_set() to enable profiling.
-
-\fn int starpu_profiling_status_set(int status)
-\ingroup API_Profiling
-Set the profiling status. Profiling is activated
-by passing \ref STARPU_PROFILING_ENABLE in \p status. Passing
-\ref STARPU_PROFILING_DISABLE disables profiling. Calling this function
-resets all profiling measurements. When profiling is enabled, the
-field starpu_task::profiling_info points to a valid structure
-starpu_profiling_task_info containing information about the execution
-of the task. Negative return values indicate an error, otherwise the
-previous status is returned.
-
-\fn int starpu_profiling_status_get(void)
-\ingroup API_Profiling
-Return the current profiling status or a negative value in case
-there was an error.
-
-\fn void starpu_profiling_init(void)
-\ingroup API_Profiling
-Reset performance counters and enable profiling if the
-environment variable \ref STARPU_PROFILING is set to a positive value.
-
-\fn void starpu_profiling_set_id(int new_id)
-\ingroup API_Profiling
-Set the ID used for profiling trace filename. HAS to be called before starpu_init().
-
-\fn int starpu_profiling_worker_get_info(int workerid, struct starpu_profiling_worker_info *worker_info)
-\ingroup API_Profiling
-Get the profiling info associated to the worker identified by
-\p workerid, and reset the profiling measurements. If the argument \p
-worker_info is <c>NULL</c>, only reset the counters associated to worker
-\p workerid. Upon successful completion, this function returns 0.
-Otherwise, a negative value is returned.
-
-\fn int starpu_bus_get_profiling_info(int busid, struct starpu_profiling_bus_info *bus_info)
-\ingroup API_Profiling
-
-todo
-
-See _starpu_profiling_bus_helper_display_summary in src/profiling/profiling_helpers.c for a usage example.
-Note that calling starpu_bus_get_profiling_info resets the counters to zero.
-
-\fn int starpu_bus_get_count(void)
-\ingroup API_Profiling
-Return the number of buses in the machine
-
-\fn int starpu_bus_get_id(int src, int dst)
-\ingroup API_Profiling
-Return the identifier of the bus between \p src and \p dst
-
-\fn int starpu_bus_get_src(int busid)
-\ingroup API_Profiling
-Return the source point of bus \p busid
-
-\fn int starpu_bus_get_dst(int busid)
-\ingroup API_Profiling
-Return the destination point of bus \p busid
-
-\fn double starpu_timing_timespec_delay_us(struct timespec *start, struct timespec *end)
-\ingroup API_Profiling
-Return the time elapsed between \p start and \p end in microseconds.
-
-\fn double starpu_timing_timespec_to_us(struct timespec *ts)
-\ingroup API_Profiling
-Convert the given timespec \p ts into microseconds
-
-\fn void starpu_profiling_bus_helper_display_summary(void)
-\ingroup API_Profiling
-Display statistics about the bus on \c stderr if the environment
-variable \ref STARPU_BUS_STATS is defined. The function is called
-automatically by starpu_shutdown().
-
-\fn void starpu_profiling_worker_helper_display_summary(void)
-\ingroup API_Profiling
-Display statistics about the workers on \c stderr if the
-environment variable \ref STARPU_WORKER_STATS is defined. The function is
-called automatically by starpu_shutdown().
-
-\fn void starpu_data_display_memory_stats()
-\ingroup API_Profiling
-Display statistics about the current data handles registered
-within StarPU. StarPU must have been configured with the configure
-option \ref enable-memory-stats "--enable-memory-stats" (see \ref MemoryFeedback).
-
-*/

+ 2 - 18
doc/doxygen/chapters/api/scc_extensions.doxy

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2010-2013,2015,2017,2019                      CNRS
  * Copyright (C) 2009-2011,2014                           Université de Bordeaux
  * Copyright (C) 2011,2012                                Inria
  *
@@ -16,7 +16,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-/*! \defgroup API_SCC_Extensions SCC Extensions
+/*! \ingroup API_SCC_Extensions
 
 \def STARPU_USE_SCC
 \ingroup API_SCC_Extensions
@@ -28,20 +28,4 @@ It should be used in your code to detect the availability of SCC.
 Define the maximum number of SCC devices that are
 supported by StarPU.
 
-\typedef starpu_scc_func_symbol_t
-\ingroup API_SCC_Extensions
-Type for SCC function symbols
-
-\fn int starpu_scc_register_kernel(starpu_scc_func_symbol_t *symbol, const char *func_name)
-\ingroup API_SCC_Extensions
-Initiate a lookup on each SCC device to find the address of the
-function named \p func_name, store them in the global array kernels
-and return the index in the array through \p symbol.
-
-\fn starpu_scc_kernel_t starpu_scc_get_kernel(starpu_scc_func_symbol_t symbol)
-\ingroup API_SCC_Extensions
-If success, return the pointer to the function defined by \p symbol on
-the device linked to the called device. This can for instance be used
-in a starpu_scc_func_symbol_t implementation.
-
 */

+ 5 - 6
include/starpu_bitmap.h

@@ -19,11 +19,10 @@
 #ifndef __STARPU_BITMAP_H__
 #define __STARPU_BITMAP_H__
 
-/** @defgroup API_Bitmap Bitmap
-
-    @brief This is the interface for the bitmap utilities provided by StarPU.
-
-    @{
+/**
+   @defgroup API_Bitmap Bitmap
+   @brief This is the interface for the bitmap utilities provided by StarPU.
+   @{
  */
 
 #ifdef __cplusplus
@@ -33,7 +32,7 @@ extern "C"
 
 /** create an empty starpu_bitmap */
 struct starpu_bitmap *starpu_bitmap_create(void) STARPU_ATTRIBUTE_MALLOC;
-/** free \b */
+/** free \p b */
 void starpu_bitmap_destroy(struct starpu_bitmap *b);
 
 /** set bit \p e in \p b */

+ 31 - 20
include/starpu_bound.h

@@ -18,12 +18,11 @@
 #ifndef __STARPU_BOUND_H__
 #define __STARPU_BOUND_H__
 
-/** @defgroup API_Theoretical_Lower_Bound_on_Execution_Time Theoretical Lower Bound on Execution Time
-
-    @brief Compute theoretical upper computation efficiency bound corresponding to some actual execution.
-
-    @{
- */
+/**
+   @defgroup API_Theoretical_Lower_Bound_on_Execution_Time Theoretical Lower Bound on Execution Time
+   @brief Compute theoretical upper computation efficiency bound corresponding to some actual execution.
+   @{
+*/
 
 #include <stdio.h>
 
@@ -32,34 +31,46 @@ extern "C"
 {
 #endif
 
-/** Start recording tasks (resets stats). \p deps tells whether dependencies should be recorded too (this is quite expensive) */
+/**
+   Start recording tasks (resets stats). \p deps tells whether
+   dependencies should be recorded too (this is quite expensive)
+*/
 void starpu_bound_start(int deps, int prio);
-/** Stop recording tasks */
+
+/**
+   Stop recording tasks
+*/
 void starpu_bound_stop(void);
 
-/** Emit the DAG that was recorded on \p output. */
+/**
+   Emit the DAG that was recorded on \p output.
+*/
 void starpu_bound_print_dot(FILE *output);
 
-/** Get theoretical upper bound (in ms) (needs glpk support
-    detected by configure script). It returns 0 if some performance models
-    are not calibrated.
+/**
+   Get theoretical upper bound (in ms) (needs glpk support detected by
+   configure script). It returns 0 if some performance models are not
+   calibrated.
 */
 void starpu_bound_compute(double *res, double *integer_res, int integer);
 
-/** Emit the Linear Programming system on \p output for the recorded
-    tasks, in the lp format
+/**
+   Emit the Linear Programming system on \p output for the recorded
+   tasks, in the lp format
 */
 void starpu_bound_print_lp(FILE *output);
 
-/** Emit the Linear Programming system on \p output for the recorded
-    tasks, in the mps format
+/**
+   Emit the Linear Programming system on \p output for the recorded
+   tasks, in the mps format
 */
 void starpu_bound_print_mps(FILE *output);
 
-/** Emit on \p output the statistics of actual execution vs theoretical upper bound.
-    \p integer permits to choose between integer solving (which takes a
-    long time but is correct), and relaxed solving (which provides an
-    approximate solution).
+/**
+   Emit on \p output the statistics of actual execution vs theoretical
+   upper bound. \p integer permits to choose between integer solving
+   (which takes a long time but is correct), and relaxed solving
+   (which provides an approximate solution).
 */
 void starpu_bound_print(FILE *output, int integer);
 

+ 3 - 3
include/starpu_clusters.h

@@ -19,9 +19,9 @@
 #ifndef __STARPU_CLUSTERS_UTIL_H__
 #define __STARPU_CLUSTERS_UTIL_H__
 
-/** @defgroup API_Clustering_Machine Clustering Machine
-
-    @{
+/**
+   @defgroup API_Clustering_Machine Clustering Machine
+   @{
  */
 
 #ifdef STARPU_HAVE_HWLOC

+ 8 - 7
include/starpu_cublas.h

@@ -18,9 +18,9 @@
 #ifndef __STARPU_CUBLAS_H__
 #define __STARPU_CUBLAS_H__
 
-/** @ingroup API_CUDA_Extensions
-
-    @{
+/**
+   @ingroup API_CUDA_Extensions
+   @{
  */
 
 #ifdef __cplusplus
@@ -38,10 +38,11 @@ extern "C"
 void starpu_cublas_init(void);
 
 /**
-   Set the proper CUBLAS stream for CUBLAS v1. This must be called from the CUDA
-   codelet before calling CUBLAS v1 kernels, so that they are queued on the proper
-   CUDA stream. When using one thread per CUDA worker, this function does not
-   do anything since the CUBLAS stream does not change, and is set once by
+   Set the proper CUBLAS stream for CUBLAS v1. This must be called
+   from the CUDA codelet before calling CUBLAS v1 kernels, so that
+   they are queued on the proper CUDA stream. When using one thread
+   per CUDA worker, this function does not do anything since the
+   CUBLAS stream does not change, and is set once by
    starpu_cublas_init().
 */
 void starpu_cublas_set_stream(void);

+ 5 - 5
include/starpu_cublas_v2.h

@@ -18,9 +18,9 @@
 #ifndef __STARPU_CUBLAS_V2_H__
 #define __STARPU_CUBLAS_V2_H__
 
-/** @ingroup API_CUDA_Extensions
-
-    @{
+/**
+   @ingroup API_CUDA_Extensions
+   @{
  */
 
 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
@@ -33,8 +33,8 @@ extern "C"
 #endif
 
 /**
-   Return the CUSPARSE handle to be used to queue CUSPARSE
-   kernels. It is properly initialized and configured for multistream by
+   Return the CUBLAS handle to be used to queue CUBLAS kernels. It
+   is properly initialized and configured for multistream by
    starpu_cublas_init().
 */
 cublasHandle_t starpu_cublas_get_local_handle(void);

+ 38 - 27
include/starpu_cuda.h

@@ -19,9 +19,9 @@
 #ifndef __STARPU_CUDA_H__
 #define __STARPU_CUDA_H__
 
-/** @defgroup API_CUDA_Extensions CUDA Extensions
-
-    @{
+/**
+   @defgroup API_CUDA_Extensions CUDA Extensions
+   @{
  */
 
 #include <starpu_config.h>
@@ -36,49 +36,60 @@ extern "C"
 {
 #endif
 
-/** Report a CUBLAS error. */
+/**
+   Report a CUBLAS error.
+*/
 void starpu_cublas_report_error(const char *func, const char *file, int line, int status);
 
-/** Calls starpu_cublas_report_error(), passing the current function, file and line position.*/
+/**
+   Call starpu_cublas_report_error(), passing the current function, file and line position.
+*/
 #define STARPU_CUBLAS_REPORT_ERROR(status) starpu_cublas_report_error(__starpu_func__, __FILE__, __LINE__, status)
 
-/** Report a CUDA error. */
+/**
+   Report a CUDA error.
+*/
 void starpu_cuda_report_error(const char *func, const char *file, int line, cudaError_t status);
 
-/** Calls starpu_cuda_report_error(), passing the current function, file and line position.*/
+/**
+   Call starpu_cuda_report_error(), passing the current function, file and line position.
+*/
 #define STARPU_CUDA_REPORT_ERROR(status) starpu_cuda_report_error(__starpu_func__, __FILE__, __LINE__, status)
 
 /**
-    Return the current worker’s CUDA stream. StarPU
-    provides a stream for every CUDA device controlled by StarPU. This
-    function is only provided for convenience so that programmers can
-    easily use asynchronous operations within codelets without having to
-    create a stream by hand. Note that the application is not forced to
-    use the stream provided by starpu_cuda_get_local_stream() and may also
-    create its own streams. Synchronizing with <c>cudaThreadSynchronize()</c> is
-    allowed, but will reduce the likelihood of having all transfers
-    overlapped.
+   Return the current worker’s CUDA stream. StarPU provides a stream
+   for every CUDA device controlled by StarPU. This function is only
+   provided for convenience so that programmers can easily use
+   asynchronous operations within codelets without having to create a
+   stream by hand. Note that the application is not forced to use the
+   stream provided by starpu_cuda_get_local_stream() and may also
+   create its own streams. Synchronizing with
+   <c>cudaThreadSynchronize()</c> is allowed, but will reduce the
+   likelihood of having all transfers overlapped.
 */
 cudaStream_t starpu_cuda_get_local_stream(void);
 
-/** Return a pointer to device properties for worker \p workerid (assumed to be a CUDA worker). */
+/**
+   Return a pointer to device properties for worker \p workerid
+   (assumed to be a CUDA worker).
+*/
 const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid);
 
 /**
-    Copy \p ssize bytes from the pointer \p src_ptr on \p src_node
-    to the pointer \p dst_ptr on \p dst_node. The function first tries to
-    copy the data asynchronous (unless \p stream is <c>NULL</c>). If the
-    asynchronous copy fails or if \p stream is <c>NULL</c>, it copies the
-    data synchronously. The function returns <c>-EAGAIN</c> if the
-    asynchronous launch was successfull. It returns 0 if the synchronous
-    copy was successful, or fails otherwise.
+   Copy \p ssize bytes from the pointer \p src_ptr on \p src_node
+   to the pointer \p dst_ptr on \p dst_node. The function first tries to
+   copy the data asynchronously (unless \p stream is <c>NULL</c>). If the
+   asynchronous copy fails or if \p stream is <c>NULL</c>, it copies the
+   data synchronously. The function returns <c>-EAGAIN</c> if the
+   asynchronous launch was successful. It returns 0 if the synchronous
+   copy was successful, or fails otherwise.
 */
 int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind);
 
 /**
-    Calls <c>cudaSetDevice(\p devid)</c> or <c>cudaGLSetGLDevice(\p devid)</c>,
-    according to whether \p devid is among the field
-    starpu_conf::cuda_opengl_interoperability.
+   Call <c>cudaSetDevice(\p devid)</c> or <c>cudaGLSetGLDevice(\p devid)</c>,
+   according to whether \p devid is among the field
+   starpu_conf::cuda_opengl_interoperability.
 */
 void starpu_cuda_set_device(unsigned devid);
 

+ 6 - 6
include/starpu_cusparse.h

@@ -18,10 +18,10 @@
 #ifndef __STARPU_CUSPARSE_H__
 #define __STARPU_CUSPARSE_H__
 
-/** @ingroup API_CUDA_Extensions
-
-    @{
- */
+/**
+   @ingroup API_CUDA_Extensions
+   @{
+*/
 
 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
 #include <cusparse.h>
@@ -40,14 +40,14 @@ extern "C"
 void starpu_cusparse_init(void);
 
 /**
-   Synchronously deinitialize the CUSPARSE library on
+   @brief Synchronously deinitialize the CUSPARSE library on
    every CUDA device.
 */
 void starpu_cusparse_shutdown(void);
 
 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
 /**
-   Return the CUSPARSE handle to be used to queue CUSPARSE
+   @brief Return the CUSPARSE handle to be used to queue CUSPARSE
    kernels. It is properly initialized and configured for multistream by
    starpu_cusparse_init().
 */

+ 11 - 12
include/starpu_data.h

@@ -19,14 +19,13 @@
 #ifndef __STARPU_DATA_H__
 #define __STARPU_DATA_H__
 
-/** @defgroup API_Data_Management Data Management
-
-    @brief Data management facilities provided by StarPU. We show how
-    to use existing data interfaces in \ref API_Data_Interfaces, but
-    developers can design their own data interfaces if required.
-
-    @{
- */
+/**
+   @defgroup API_Data_Management Data Management
+   @brief Data management facilities provided by StarPU. We show how
+   to use existing data interfaces in \ref API_Data_Interfaces, but
+   developers can design their own data interfaces if required.
+   @{
+*/
 
 #include <starpu.h>
 
@@ -52,12 +51,12 @@ struct _starpu_data_state;
 typedef struct _starpu_data_state* starpu_data_handle_t;
 
 /**
-    Describe a StarPU data access mode
+   Describe a StarPU data access mode
 
-    Note: when adding a flag here, update
-    _starpu_detect_implicit_data_deps_with_handle
+   Note: when adding a flag here, update
+   _starpu_detect_implicit_data_deps_with_handle
 
-    Note: other STARPU_* values in include/starpu_task_util.h
+   Note: other STARPU_* values in include/starpu_task_util.h
  */
 enum starpu_data_access_mode
 {

+ 89 - 81
include/starpu_data_filters.h

@@ -21,10 +21,10 @@
 #ifndef __STARPU_DATA_FILTERS_H__
 #define __STARPU_DATA_FILTERS_H__
 
-/** @defgroup API_Data_Partition Data Partition
-
-    @{
- */
+/**
+   @defgroup API_Data_Partition Data Partition
+   @{
+*/
 
 #include <starpu.h>
 #include <stdarg.h>
@@ -36,7 +36,9 @@ extern "C"
 
 struct starpu_data_interface_ops;
 
-/** Describe a data partitioning operation, to be given to starpu_data_partition() */
+/**
+   Describe a data partitioning operation, to be given to starpu_data_partition()
+*/
 struct starpu_data_filter
 {
 	/**
@@ -99,37 +101,38 @@ struct starpu_data_filter
 	void *filter_arg_ptr;
 };
 
-/** @name Basic API
- *
- * @{
- */
+/**
+   @name Basic API
+   @{
+*/
 
 /**
-    Request the partitioning of \p initial_handle into several subdata
-    according to the filter \p f.
-    Here an example of how to use the function.
-    \code{.c}
-    struct starpu_data_filter f =
-    {
-      .filter_func = starpu_matrix_filter_block,
-      .nchildren = nslicesx
-    };
-    starpu_data_partition(A_handle, &f);
+   Request the partitioning of \p initial_handle into several subdata
+   according to the filter \p f.
+
+   Here is an example of how to use the function.
+   \code{.c}
+   struct starpu_data_filter f =
+   {
+     .filter_func = starpu_matrix_filter_block,
+     .nchildren = nslicesx
+   };
+   starpu_data_partition(A_handle, &f);
     \endcode
 */
 void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_data_filter *f);
 
 /**
-   Unapply the filter which has been applied to \p root_data, thus
-   unpartitioning the data. The pieces of data are collected back into
-   one big piece in the \p gathering_node (usually ::STARPU_MAIN_RAM).
-   Tasks working on the partitioned data will be waited for
-   by starpu_data_unpartition().
+  Unapply the filter which has been applied to \p root_data, thus
+  unpartitioning the data. The pieces of data are collected back into
+  one big piece in the \p gathering_node (usually ::STARPU_MAIN_RAM).
+  Tasks working on the partitioned data will be waited for
+  by starpu_data_unpartition().
 
-   Here an example of how to use the function.
-   \code{.c}
-   starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
-   \endcode
+  Here is an example of how to use the function.
+  \code{.c}
+  starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
+  \endcode
 */
 void starpu_data_unpartition(starpu_data_handle_t root_data, unsigned gathering_node);
 
@@ -182,10 +185,10 @@ void starpu_data_vmap_filters(starpu_data_handle_t root_data, unsigned nfilters,
 
 /** @} */
 
-/** @name Asynchronous API
- *
- * @{
- */
+/**
+   @name Asynchronous API
+   @{
+*/
 
 /**
    Plan to partition \p initial_handle into several subdata according to
@@ -303,44 +306,47 @@ void starpu_data_partition_not_automatic(starpu_data_handle_t handle);
 
 /** @} */
 
-/** @name Predefined BCSR Filter Functions
- * Predefined partitioning functions for BCSR data. Examples on how to
- * use them are shown in \ref PartitioningData.
- * @{
- */
+/**
+   @name Predefined BCSR Filter Functions
+   Predefined partitioning functions for BCSR data. Examples on how to
+   use them are shown in \ref PartitioningData.
+   @{
+*/
 
 /**
    Partition a block-sparse matrix into dense matrices.
- */
+*/
 void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /** @} */
 
-/** @name Predefined CSR Filter Functions
- * Predefined partitioning functions for CSR data. Examples on how to
- * use them are shown in \ref PartitioningData.
- * @{
- */
+/**
+   @name Predefined CSR Filter Functions
+   Predefined partitioning functions for CSR data. Examples on how to
+   use them are shown in \ref PartitioningData.
+   @{
+*/
 
 /**
    Partition a block-sparse matrix into vertical block-sparse matrices.
- */
+*/
 void starpu_csr_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /** @} */
 
-/** @name Predefined Matrix Filter Functions
- * Predefined partitioning functions for matrix
- * data. Examples on how to use them are shown in \ref
- * PartitioningData.
- * @{
- */
+/**
+   @name Predefined Matrix Filter Functions
+   Predefined partitioning functions for matrix
+   data. Examples on how to use them are shown in \ref
+   PartitioningData.
+   @{
+*/
 
 /**
    Partition a dense Matrix along the x dimension, thus getting (x/\p
    nparts ,y) matrices. If \p nparts does not divide x, the last
    submatrix contains the remainder.
- */
+*/
 void starpu_matrix_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /**
@@ -353,14 +359,14 @@ void starpu_matrix_filter_block(void *father_interface, void *child_interface, s
    only be used for read-only access, as no coherency is enforced for the
    shadowed parts. A usage example is available in
    examples/filters/shadow2d.c
- */
+*/
 void starpu_matrix_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /**
    Partition a dense Matrix along the y dimension, thus getting
    (x,y/\p nparts) matrices. If \p nparts does not divide y, the last
    submatrix contains the remainder.
- */
+*/
 void starpu_matrix_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /**
@@ -377,18 +383,19 @@ void starpu_matrix_filter_vertical_block_shadow(void *father_interface, void *ch
 
 /** @} */
 
-/** @name Predefined Vector Filter Functions
- * Predefined partitioning functions for vector
- * data. Examples on how to use them are shown in \ref
- * PartitioningData.
- * @{
- */
+/**
+   @name Predefined Vector Filter Functions
+   Predefined partitioning functions for vector
+   data. Examples on how to use them are shown in \ref
+   PartitioningData.
+   @{
+*/
 
 /**
    Return in \p child_interface the \p id th element of the vector
    represented by \p father_interface once partitioned in \p nparts chunks of
    equal size.
- */
+*/
 void starpu_vector_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /**
@@ -411,39 +418,40 @@ void starpu_vector_filter_block_shadow(void *father_interface, void *child_inter
    <c>filter_arg_ptr</c> field must point to an array of \p nparts long
    elements, each of which specifies the number of elements in each chunk
    of the partition.
- */
+*/
 void starpu_vector_filter_list_long(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /**
-   Return in \p child_interface the \p id th element of the vector
-   represented by \p father_interface once partitioned into \p nparts chunks
-   according to the <c>filter_arg_ptr</c> field of \p f. The
-   <c>filter_arg_ptr</c> field must point to an array of \p nparts uint32_t
-   elements, each of which specifies the number of elements in each chunk
-   of the partition.
- */
+  Return in \p child_interface the \p id th element of the vector
+  represented by \p father_interface once partitioned into \p nparts chunks
+  according to the <c>filter_arg_ptr</c> field of \p f. The
+  <c>filter_arg_ptr</c> field must point to an array of \p nparts uint32_t
+  elements, each of which specifies the number of elements in each chunk
+  of the partition.
+*/
 void starpu_vector_filter_list(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /**
    Return in \p child_interface the \p id th element of the vector
    represented by \p father_interface once partitioned in <c>2</c> chunks of
    equal size, ignoring nparts. Thus, \p id must be <c>0</c> or <c>1</c>.
- */
+*/
 void starpu_vector_filter_divide_in_2(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /** @} */
 
-/** @name Predefined Block Filter Functions
- * Predefined partitioning functions for block data. Examples on how
- * to use them are shown in \ref PartitioningData. An example is
- * available in \c examples/filters/shadow3d.c
- * @{
- */
+/**
+   @name Predefined Block Filter Functions
+   Predefined partitioning functions for block data. Examples on how
+   to use them are shown in \ref PartitioningData. An example is
+   available in \c examples/filters/shadow3d.c
+   @{
+*/
 
 /**
-   Partition a block along the X dimension, thus getting
-   (x/\p nparts ,y,z) 3D matrices. If \p nparts does not divide x, the last
-   submatrix contains the remainder.
+  Partition a block along the X dimension, thus getting
+  (x/\p nparts ,y,z) 3D matrices. If \p nparts does not divide x, the last
+  submatrix contains the remainder.
  */
 void starpu_block_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
@@ -475,14 +483,14 @@ void starpu_block_filter_vertical_block(void *father_interface, void *child_inte
    <b>IMPORTANT</b>:
    This can only be used for read-only access, as no coherency is
    enforced for the shadowed parts.
- */
+*/
 void starpu_block_filter_vertical_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /**
    Partition a block along the Z dimension, thus getting
    (x,y,z/\p nparts) blocks. If \p nparts does not divide z, the last
    submatrix contains the remainder.
- */
+*/
 void starpu_block_filter_depth_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /**
@@ -494,7 +502,7 @@ void starpu_block_filter_depth_block(void *father_interface, void *child_interfa
    <b>IMPORTANT</b>:
    This can only be used for read-only access, as no coherency is
    enforced for the shadowed parts.
- */
+*/
 void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /** @} */

+ 96 - 79
include/starpu_data_interfaces.h

@@ -19,53 +19,53 @@
 #ifndef __STARPU_DATA_INTERFACES_H__
 #define __STARPU_DATA_INTERFACES_H__
 
-/** @defgroup API_Data_Interfaces Data Interfaces
-
-    @brief Data management is done at a high-level in StarPU: rather than
-    accessing a mere list of contiguous buffers, the tasks may manipulate
-    data that are described by a high-level construct which we call data
-    interface.
-
-    An example of data interface is the "vector" interface which describes
-    a contiguous data array on a spefic memory node. This interface is a
-    simple structure containing the number of elements in the array, the
-    size of the elements, and the address of the array in the appropriate
-    address space (this address may be invalid if there is no valid copy
-    of the array in the memory node). More informations on the data
-    interfaces provided by StarPU are given in \ref API_Data_Interfaces.
-
-    When a piece of data managed by StarPU is used by a task, the task
-    implementation is given a pointer to an interface describing a valid
-    copy of the data that is accessible from the current processing unit.
-
-    Every worker is associated to a memory node which is a logical
-    abstraction of the address space from which the processing unit gets
-    its data. For instance, the memory node associated to the different
-    CPU workers represents main memory (RAM), the memory node associated
-    to a GPU is DRAM embedded on the device. Every memory node is
-    identified by a logical index which is accessible from the
-    function starpu_worker_get_memory_node(). When registering a piece of
-    data to StarPU, the specified memory node indicates where the piece of
-    data initially resides (we also call this memory node the home node of
-    a piece of data).
-
-    In the case of NUMA systems, functions starpu_memory_nodes_numa_devid_to_id()
-    and starpu_memory_nodes_numa_id_to_devid() can be used to convert from NUMA node
-    numbers as seen by the Operating System and NUMA node numbers as seen by StarPU.
-
-    There are several ways to register a memory region so that it can be
-    managed by StarPU. StarPU provides data interfaces for vectors, 2D
-    matrices, 3D matrices as well as BCSR and CSR sparse matrices.
-
-    Each data interface is provided with a set of field access functions.
-    The ones using a <c>void *</c> parameter aimed to be used in codelet
-    implementations (see for example the code in
-    \ref VectorScalingUsingStarPUAPI).
-
-    Applications can provide their own interface as shown in \ref DefiningANewDataInterface.
-
-    @{
- */
+/**
+   @defgroup API_Data_Interfaces Data Interfaces
+   @brief Data management is done at a high-level in StarPU: rather than
+   accessing a mere list of contiguous buffers, the tasks may manipulate
+   data that are described by a high-level construct which we call data
+   interface.
+
+   An example of data interface is the "vector" interface which describes
+   a contiguous data array on a specific memory node. This interface is a
+   simple structure containing the number of elements in the array, the
+   size of the elements, and the address of the array in the appropriate
+   address space (this address may be invalid if there is no valid copy
+   of the array in the memory node). More information on the data
+   interfaces provided by StarPU are given in \ref API_Data_Interfaces.
+
+   When a piece of data managed by StarPU is used by a task, the task
+   implementation is given a pointer to an interface describing a valid
+   copy of the data that is accessible from the current processing unit.
+
+   Every worker is associated to a memory node which is a logical
+   abstraction of the address space from which the processing unit gets
+   its data. For instance, the memory node associated to the different
+   CPU workers represents main memory (RAM), the memory node associated
+   to a GPU is DRAM embedded on the device. Every memory node is
+   identified by a logical index which is accessible from the
+   function starpu_worker_get_memory_node(). When registering a piece of
+   data to StarPU, the specified memory node indicates where the piece of
+   data initially resides (we also call this memory node the home node of
+   a piece of data).
+
+   In the case of NUMA systems, functions starpu_memory_nodes_numa_devid_to_id()
+   and starpu_memory_nodes_numa_id_to_devid() can be used to convert from NUMA node
+   numbers as seen by the Operating System and NUMA node numbers as seen by StarPU.
+
+   There are several ways to register a memory region so that it can be
+   managed by StarPU. StarPU provides data interfaces for vectors, 2D
+   matrices, 3D matrices as well as BCSR and CSR sparse matrices.
+
+   Each data interface is provided with a set of field access functions.
+   The ones using a <c>void *</c> parameter are meant to be used in codelet
+   implementations (see for example the code in
+   \ref VectorScalingUsingStarPUAPI).
+
+   Applications can provide their own interface as shown in \ref DefiningANewDataInterface.
+
+   @{
+*/
 
 #include <starpu.h>
 
@@ -538,9 +538,11 @@ struct starpu_data_interface_ops
 	char *name;
 };
 
-/** @name Basic API
-    @{
-    */
+/**
+   @name Basic API
+   @{
+*/
+
 /**
    Register a piece of data into the handle located at the
    \p handleptr address. The \p data_interface buffer contains the initial
@@ -706,13 +708,16 @@ void starpu_malloc_on_node_set_default_flags(unsigned node, int flags);
 
 /** @} */
 
-/** @name Accessing Matrix Data Interfaces
-    @{
- */
+/**
+   @name Accessing Matrix Data Interfaces
+   @{
+*/
 
 extern struct starpu_data_interface_ops starpu_interface_matrix_ops;
 
-/** Matrix interface for dense matrices */
+/**
+   Matrix interface for dense matrices
+*/
 struct starpu_matrix_interface
 {
 	enum starpu_data_interface_id id; /**< Identifier of the interface */
@@ -875,13 +880,16 @@ size_t starpu_matrix_get_allocsize(starpu_data_handle_t handle);
 
 /** @} */
 
-/** @name Accessing COO Data Interfaces
-    @{
- */
+/**
+   @name Accessing COO Data Interfaces
+   @{
+*/
 
 extern struct starpu_data_interface_ops starpu_interface_coo_ops;
 
-/** COO Matrices */
+/**
+   COO Matrices
+*/
 struct starpu_coo_interface
 {
 	enum starpu_data_interface_id id; /**< identifier of the interface */
@@ -964,15 +972,18 @@ void starpu_coo_data_register(starpu_data_handle_t *handleptr, int home_node, ui
 
 /** @} */
 
-/** @name Block Data Interface
-    @{
- */
+/**
+   @name Block Data Interface
+   @{
+*/
 
 extern struct starpu_data_interface_ops starpu_interface_block_ops;
 
 /* TODO: rename to 3dmatrix? */
 /* TODO: add allocsize support */
-/** Block interface for 3D dense blocks */
+/**
+   Block interface for 3D dense blocks
+*/
 struct starpu_block_interface
 {
 	enum starpu_data_interface_id id; /**< identifier of the interface */
@@ -1115,9 +1126,10 @@ designated by \p interface.
 
 /** @} */
 
-/** @name Vector Data Interface
-    @{
- */
+/**
+   @name Vector Data Interface
+   @{
+*/
 
 extern struct starpu_data_interface_ops starpu_interface_vector_ops;
 
@@ -1241,9 +1253,10 @@ uintptr_t starpu_vector_get_local_ptr(starpu_data_handle_t handle);
 
 /** @} */
 
-/** @name Variable Data Interface
-    @{
- */
+/**
+   @name Variable Data Interface
+   @{
+*/
 
 extern struct starpu_data_interface_ops starpu_interface_variable_ops;
 
@@ -1322,9 +1335,10 @@ uintptr_t starpu_variable_get_local_ptr(starpu_data_handle_t handle);
 
 /** @} */
 
-/** @name Void Data Interface
-    @{
- */
+/**
+   @name Void Data Interface
+   @{
+*/
 
 extern struct starpu_data_interface_ops starpu_interface_void_ops;
 
@@ -1340,8 +1354,9 @@ void starpu_void_data_register(starpu_data_handle_t *handle);
 
 /** @} */
 
-/** @name CSR Data Interface
-    @{
+/**
+   @name CSR Data Interface
+   @{
  */
 
 extern struct starpu_data_interface_ops starpu_interface_csr_ops;
@@ -1473,9 +1488,10 @@ size_t starpu_csr_get_elemsize(starpu_data_handle_t handle);
 
 /** @} */
 
-/** @name BCSR Data Interface
-    @{
- */
+/**
+   @name BCSR Data Interface
+   @{
+*/
 
 extern struct starpu_data_interface_ops starpu_interface_bcsr_ops;
 
@@ -1677,9 +1693,10 @@ size_t starpu_bcsr_get_elemsize(starpu_data_handle_t handle);
 
 /** @} */
 
-/** @name Multiformat Data Interface
-    @{
- */
+/**
+   @name Multiformat Data Interface
+   @{
+*/
 
 /**
    Multiformat operations

+ 7 - 4
include/starpu_disk.h

@@ -20,14 +20,17 @@
 #ifndef __STARPU_DISK_H__
 #define __STARPU_DISK_H__
 
-/** @defgroup API_Out_Of_Core Out Of Core
-    @{
- */
+/**
+   @defgroup API_Out_Of_Core Out Of Core
+   @{
+*/
 
 #include <sys/types.h>
 #include <starpu_config.h>
 
-/** Set of functions to manipulate datas on disk. */
+/**
+   Set of functions to manipulate data on disk.
+*/
 struct starpu_disk_ops
 {
 	/**

+ 7 - 5
include/starpu_driver.h

@@ -18,10 +18,10 @@
 #ifndef __STARPU_DRIVER_H__
 #define __STARPU_DRIVER_H__
 
-/** @defgroup API_Running_Drivers Running Drivers
- *
- * @{
- */
+/**
+   @defgroup API_Running_Drivers Running Drivers
+   @{
+*/
 
 #include <starpu_config.h>
 #if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
@@ -33,7 +33,9 @@ extern "C"
 {
 #endif
 
-/** structure for a driver */
+/**
+   structure for a driver
+*/
 struct starpu_driver
 {
 	/**

+ 4 - 4
include/starpu_expert.h

@@ -18,10 +18,10 @@
 #ifndef __STARPU_EXPERT_H__
 #define __STARPU_EXPERT_H__
 
-/** @defgroup API_Expert_Mode Expert Mode
- *
- * @{
- */
+/**
+   @defgroup API_Expert_Mode Expert Mode
+   @{
+*/
 
 #ifdef __cplusplus
 extern "C"

+ 4 - 4
include/starpu_fxt.h

@@ -21,10 +21,10 @@
 #ifndef __STARPU_FXT_H__
 #define __STARPU_FXT_H__
 
-/** @defgroup API_FxT_Support FxT Support
- *
- * @{
- */
+/**
+   @defgroup API_FxT_Support FxT Support
+   @{
+*/
 
 #include <starpu_perfmodel.h>
 

+ 4 - 4
include/starpu_hash.h

@@ -19,10 +19,10 @@
 #ifndef __STARPU_HASH_H__
 #define __STARPU_HASH_H__
 
-/** @ingroup API_Data_Interfaces
- *
- * @{
- */
+/**
+   @ingroup API_Data_Interfaces
+   @{
+*/
 
 #include <stdint.h>
 #include <stddef.h>

+ 4 - 4
include/starpu_mic.h

@@ -19,10 +19,10 @@
 #ifndef __STARPU_MIC_H__
 #define __STARPU_MIC_H__
 
-/** @defgroup API_MIC_Extensions MIC Extensions
- *
- * @{
- */
+/**
+   @defgroup API_MIC_Extensions MIC Extensions
+   @{
+*/
 
 #include <starpu_config.h>
 

+ 4 - 4
include/starpu_mpi_ms.h

@@ -18,10 +18,10 @@
 #ifndef __STARPU_MPI_MS_H__
 #define __STARPU_MPI_MS_H__
 
-/** @defgroup API_Master_Slave Master Slave Extension
- *
- * @{
- */
+/**
+   @defgroup API_Master_Slave Master Slave Extension
+   @{
+*/
 
 #include <starpu_config.h>
 

+ 32 - 27
include/starpu_opencl.h

@@ -19,10 +19,10 @@
 #ifndef __STARPU_OPENCL_H__
 #define __STARPU_OPENCL_H__
 
-/** @defgroup API_OpenCL_Extensions OpenCL Extensions
- *
- * @{
- */
+/**
+   @defgroup API_OpenCL_Extensions OpenCL Extensions
+   @{
+*/
 
 #include <starpu_config.h>
 #ifdef STARPU_USE_OPENCL
@@ -51,9 +51,10 @@ struct starpu_opencl_program
 	cl_program programs[STARPU_MAXOPENCLDEVS];
 };
 
-/** @name Writing OpenCL kernels
-    @{
- */
+/**
+   @name Writing OpenCL kernels
+   @{
+*/
 
 /**
    Return the OpenCL context of the device designated by \p devid
@@ -105,17 +106,18 @@ int starpu_opencl_set_kernel_args(cl_int *err, cl_kernel *kernel, ...);
 
 /** @} */
 
-/** @name Compiling OpenCL kernels
-    Source codes for OpenCL kernels can be stored in a file or in a
-    string. StarPU provides functions to build the program executable for
-    each available OpenCL device as a cl_program object. This program
-    executable can then be loaded within a specific queue as explained in
-    the next section. These are only helpers, Applications can also fill a
-    starpu_opencl_program array by hand for more advanced use (e.g.
-    different programs on the different OpenCL devices, for relocation
-    purpose for instance).
-    @{
- */
+/**
+   @name Compiling OpenCL kernels
+   Source codes for OpenCL kernels can be stored in a file or in a
+   string. StarPU provides functions to build the program executable for
+   each available OpenCL device as a cl_program object. This program
+   executable can then be loaded within a specific queue as explained in
+   the next section. These are only helpers, Applications can also fill a
+   starpu_opencl_program array by hand for more advanced use (e.g.
+   different programs on the different OpenCL devices, for relocation
+   purpose for instance).
+   @{
+*/
 
 /**
    Store the contents of the file \p source_file_name in the buffer
@@ -182,9 +184,10 @@ int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs);
 
 /** @} */
 
-/** @name Loading OpenCL kernels
-    @{
- */
+/**
+   @name Loading OpenCL kernels
+   @{
+*/
 
 /**
    Create a kernel \p kernel for device \p devid, on its computation
@@ -200,9 +203,10 @@ int starpu_opencl_release_kernel(cl_kernel kernel);
 
 /** @} */
 
-/** @name OpenCL Statistics
-    @{
- */
+/**
+   @name OpenCL Statistics
+   @{
+*/
 
 /**
    Collect statistics on a kernel execution.
@@ -215,9 +219,10 @@ int starpu_opencl_collect_stats(cl_event event);
 
 /** @} */
 
-/** @name OpenCL Utilities
-    @{
- */
+/**
+   @name OpenCL Utilities
+   @{
+*/
 
 /**
    Return the error message in English corresponding to \p status, an OpenCL

+ 28 - 21
include/starpu_openmp.h

@@ -18,10 +18,11 @@
 #ifndef __STARPU_OPENMP_H__
 #define __STARPU_OPENMP_H__
 
-/** @defgroup API_OpenMP_Runtime_Support OpenMP Runtime Support
-    @brief This section describes the interface provided for implementing OpenMP runtimes on top of StarPU.
-    @{
- */
+/**
+   @defgroup API_OpenMP_Runtime_Support OpenMP Runtime Support
+   @brief This section describes the interface provided for implementing OpenMP runtimes on top of StarPU.
+   @{
+*/
 
 #include <starpu_config.h>
 
@@ -213,8 +214,9 @@ extern "C"
 #define __STARPU_OMP_NOTHROW __attribute__((__nothrow__))
 #endif
 
-/** @name Initialisation
-    @{
+/**
+   @name Initialisation
+   @{
 */
 
 /**
@@ -228,9 +230,10 @@ extern void starpu_omp_shutdown(void) __STARPU_OMP_NOTHROW;
 
 /** @} */
 
-/** @name Parallel
-    \anchor ORS_Parallel
-    @{
+/**
+   @name Parallel
+   \anchor ORS_Parallel
+   @{
 */
 
 /**
@@ -266,9 +269,10 @@ extern int starpu_omp_master_inline(void) __STARPU_OMP_NOTHROW;
 
 /** @} */
 
-/** @name Synchronization
-    \anchor ORS_Synchronization
-    @{
+/**
+   @name Synchronization
+   \anchor ORS_Synchronization
+   @{
 */
 
 /**
@@ -317,9 +321,10 @@ extern void starpu_omp_critical_inline_end(const char *name) __STARPU_OMP_NOTHRO
 
 /** @} */
 
-/** @name Worksharing
-    \anchor ORS_Worksharing
-    @{
+/**
+   @name Worksharing
+   \anchor ORS_Worksharing
+   @{
 */
 
 /**
@@ -542,9 +547,10 @@ extern void starpu_omp_sections_combined(unsigned long long nb_sections, void (*
 
 /** @} */
 
-/** @name Task
-    \anchor ORS_Task
-    @{
+/**
+   @name Task
+   \anchor ORS_Task
+   @{
 */
 
 /**
@@ -604,9 +610,10 @@ extern void starpu_omp_taskloop_inline_end(const struct starpu_omp_task_region_a
 
 /** @} */
 
-/** @name API
-    \anchor ORS_API
-    @{
+/**
+   @name API
+   \anchor ORS_API
+   @{
 */
 
 /**

+ 264 - 46
include/starpu_perfmodel.h

@@ -21,10 +21,10 @@
 #ifndef __STARPU_PERFMODEL_H__
 #define __STARPU_PERFMODEL_H__
 
-/** @defgroup
- *
- * @{
- */
+/**
+   @defgroup API_Performance_Model Performance Model
+   @{
+*/
 
 #include <starpu.h>
 #include <stdio.h>
@@ -39,31 +39,37 @@ struct starpu_data_descr;
 
 #define STARPU_NARCH STARPU_ANY_WORKER
 
+/**
+   todo
+*/
 struct starpu_perfmodel_device
 {
-	enum starpu_worker_archtype type;
-	int devid;
-	int ncores;
+	enum starpu_worker_archtype type; /**< type of the device */
+	int devid;                        /**< identifier of the precise device */
+	int ncores;                       /**< number of execution in parallel, minus 1 */
 };
 
+/**
+   todo
+*/
 struct starpu_perfmodel_arch
 {
-	int ndevices;
-	struct starpu_perfmodel_device *devices;
+	int ndevices;                            /**< number of the devices for the given arch */
+	struct starpu_perfmodel_device *devices; /**< list of the devices for the given arch */
 };
 
 
 struct starpu_perfmodel_history_entry
 {
-	double mean;
-	double deviation;
-	double sum;
-	double sum2;
-	unsigned nsample;
+	double mean;        /**< mean_n = 1/n sum */
+	double deviation;   /**< n dev_n = sum2 - 1/n (sum)^2 */
+	double sum;         /**< sum of samples (in µs) */
+	double sum2;        /**< sum of samples^2 */
+	unsigned nsample;   /**< number of samples */
 	unsigned nerror;
-	uint32_t footprint;
-	size_t size;
-	double flops;
+	uint32_t footprint; /**< data footprint */
+	size_t size;        /**< in bytes */
+	double flops;       /**< Provided by the application */
 
 	double duration;
 	starpu_tag_t tag;
@@ -76,30 +82,35 @@ struct starpu_perfmodel_history_list
 	struct starpu_perfmodel_history_entry *entry;
 };
 
+/**
+   todo
+*/
 struct starpu_perfmodel_regression_model
 {
-	double sumlny;
+	double sumlny;          /**< sum of ln(measured) */
 
-	double sumlnx;
-	double sumlnx2;
+	double sumlnx;          /**< sum of ln(size) */
+	double sumlnx2;         /**< sum of ln(size)^2 */
 
-	unsigned long minx;
-	unsigned long maxx;
+	unsigned long minx;     /**< minimum size */
+	unsigned long maxx;     /**< maximum size */
 
-	double sumlnxlny;
+	double sumlnxlny;       /**< sum of ln(size)*ln(measured) */
 
-	double alpha;
-	double beta;
-	unsigned valid;
+	double alpha;           /**< estimated = alpha * size ^ beta */
+	double beta;            /**< estimated = alpha * size ^ beta */
+	unsigned valid;         /**< whether the linear regression model is valid (i.e. enough measures) */
 
-	double a, b, c;
-	unsigned nl_valid;
+	double a;               /**< estimated = a size ^b + c */
+	double b;               /**< estimated = a size ^b + c */
+	double c;               /**< estimated = a size ^b + c */
+	unsigned nl_valid;      /**< whether the non-linear regression model is valid (i.e. enough measures) */
 
-	unsigned nsample;
+	unsigned nsample;       /**< number of sample values for non-linear regression */
 
-	double *coeff;
-	unsigned ncoeff;
-	unsigned multi_valid;
+	double *coeff;          /**< list of computed coefficients for multiple linear regression model */
+	unsigned ncoeff;        /**< number of coefficients for multiple linear regression model */
+	unsigned multi_valid;   /**< whether the multiple linear regression model is valid */
 };
 
 struct starpu_perfmodel_history_table;
@@ -109,66 +120,224 @@ struct starpu_perfmodel_history_table;
 typedef double (*starpu_perfmodel_per_arch_cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 typedef size_t (*starpu_perfmodel_per_arch_size_base)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 
+/**
+   information about the performance model of a given arch.
+*/
 struct starpu_perfmodel_per_arch
 {
+	/**
+	   Used by ::STARPU_PER_ARCH, must point to functions which take a
+	   task, the target arch and implementation number (as mere
+	   conveniency, since the array is already indexed by these), and
+	   must return a task duration estimation in micro-seconds.
+	*/
 	starpu_perfmodel_per_arch_cost_function cost_function;
+	/**
+	   Same as in structure starpu_perfmodel, but per-arch, in case it
+	   depends on the architecture-specific implementation.
+	*/
 	starpu_perfmodel_per_arch_size_base size_base;
 
+	/**
+	   \private
+	   The history of performance measurements.
+	*/
 	struct starpu_perfmodel_history_table *history;
+	/**
+	   \private
+	   Used by ::STARPU_HISTORY_BASED, ::STARPU_NL_REGRESSION_BASED and
+	   ::STARPU_MULTIPLE_REGRESSION_BASED, records all execution history
+	   measures.
+	*/
 	struct starpu_perfmodel_history_list *list;
+	/**
+	   \private
+	   Used by ::STARPU_REGRESSION_BASED, ::STARPU_NL_REGRESSION_BASED
+	   and ::STARPU_MULTIPLE_REGRESSION_BASED, contains the estimated
+	   factors of the regression.
+	*/
 	struct starpu_perfmodel_regression_model regression;
 
 	char debug_path[256];
 };
 
+/**
+   todo
+*/
 enum starpu_perfmodel_type
 {
         STARPU_PERFMODEL_INVALID=0,
-	STARPU_PER_ARCH,
-	STARPU_COMMON,
-	STARPU_HISTORY_BASED,
-	STARPU_REGRESSION_BASED,
-	STARPU_NL_REGRESSION_BASED,
-	STARPU_MULTIPLE_REGRESSION_BASED
+	STARPU_PER_ARCH,                  /**< Application-provided per-arch cost model function */
+	STARPU_COMMON,                    /**< Application-provided common cost model function, with per-arch factor */
+	STARPU_HISTORY_BASED,             /**< Automatic history-based cost model */
+	STARPU_REGRESSION_BASED,          /**< Automatic linear regression-based cost model  (alpha * size ^ beta) */
+	STARPU_NL_REGRESSION_BASED,       /**< Automatic non-linear regression-based cost model (a * size ^ b + c) */
+	STARPU_MULTIPLE_REGRESSION_BASED  /**< Automatic multiple linear regression-based cost model. Application
+					     provides parameters, their combinations and exponents. */
 };
 
 struct _starpu_perfmodel_state;
 typedef struct _starpu_perfmodel_state* starpu_perfmodel_state_t;
 
+/**
+   Contain all information about a performance model. At least the
+   type and symbol fields have to be filled when defining a performance
+   model for a codelet. For compatibility, make sure to initialize the
+   whole structure to zero, either by using explicit memset, or by
+   letting the compiler implicitly do it in e.g. static storage case. If
+   not provided, other fields have to be zero.
+*/
 struct starpu_perfmodel
 {
+	/**
+	   type of performance model
+	   <ul>
+	   <li>
+	   ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED,
+	   ::STARPU_NL_REGRESSION_BASED: No other field needs to be
+	   provided, this is purely history-based.
+	   </li>
+	   <li>
+	   ::STARPU_MULTIPLE_REGRESSION_BASED: Need to provide fields
+	   starpu_perfmodel::nparameters (number of different parameters),
+	   starpu_perfmodel::ncombinations (number of parameters
+	   combinations-tuples) and table starpu_perfmodel::combinations
+	   which defines exponents of the equation. Function cl_perf_func
+	   also needs to define how to extract parameters from the task.
+	   </li>
+	   <li>
+	   ::STARPU_PER_ARCH: either field
+	   starpu_perfmodel::arch_cost_function has to be filled with a
+	   function that returns the cost in micro-seconds on the arch given
+	   as parameter, or field starpu_perfmodel::per_arch has to be filled
+	   with functions which return the cost in micro-seconds.
+	   </li>
+	   <li>
+	   ::STARPU_COMMON: field starpu_perfmodel::cost_function has to be
+	   filled with a function that returns the cost in micro-seconds on a
+	   CPU, timing on other archs will be determined by multiplying by an
+	   arch-specific factor.
+	   </li>
+	   </ul>
+	*/
 	enum starpu_perfmodel_type type;
 
+	/**
+	   Used by ::STARPU_COMMON. Take a task and implementation number,
+	   and must return a task duration estimation in micro-seconds.
+	*/
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
+	/**
+	   Used by ::STARPU_PER_ARCH. Take a task, an arch and implementation
+	   number, and must return a task duration estimation in
+	   micro-seconds on that arch.
+	*/
 	double (*arch_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch * arch, unsigned nimpl);
 
+	/**
+	   Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and
+	   ::STARPU_NL_REGRESSION_BASED. If not <c>NULL</c>, take a task and
+	   implementation number, and return the size to be used as index to
+	   distinguish histories and as a base for regressions.
+	*/
 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
+	/**
+	   Used by ::STARPU_HISTORY_BASED. If not <c>NULL</c>, take a task
+	   and return the footprint to be used as index to distinguish
+	   histories. The default is to use the starpu_task_data_footprint()
+	   function.
+	*/
 	uint32_t (*footprint)(struct starpu_task *);
 
+	/**
+	   symbol name for the performance model, which will be used as file
+	   name to store the model. It must be set otherwise the model will
+	   be ignored.
+	*/
 	const char *symbol;
 
+	/**
+	   \private
+	   Whether the performance model is already loaded from the disk.
+	*/
 	unsigned is_loaded;
+	/**
+	   \private
+	*/
 	unsigned benchmarking;
+	/**
+	   \private
+	*/
 	unsigned is_init;
 
 	void (*parameters)(struct starpu_task * task, double *parameters);
+	/**
+	   \private
+	   Names of parameters used for multiple linear regression models (M,
+	   N, K)
+	*/
 	const char **parameters_names;
+	/**
+	   \private
+	   Number of parameters used for multiple linear regression models
+	*/
 	unsigned nparameters;
+	/**
+	   \private
+	   Table of combinations of parameters (and the exponents) used for
+	   multiple linear regression models
+	*/
 	unsigned **combinations;
+	/**
+	   \private
+	   Number of combination of parameters used for multiple linear
+	   regression models
+	*/
 	unsigned ncombinations;
-
+	/**
+	   \private
+	*/
 	starpu_perfmodel_state_t state;
 };
 
 void starpu_perfmodel_init(struct starpu_perfmodel *model);
+
+/**
+   Load the performance model found in the file named \p filename. \p model has to be
+   completely zero, and will be filled with the information stored in the given file.
+*/
 int starpu_perfmodel_load_file(const char *filename, struct starpu_perfmodel *model);
+
+/**
+   Load a given performance model. \p model has to be
+   completely zero, and will be filled with the information stored in
+   <c>$STARPU_HOME/.starpu</c>. The function is intended to be used by
+   external tools that want to read the performance model files.
+*/
+
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model);
+
+/**
+   Unload \p model which has been previously loaded
+   through the function starpu_perfmodel_load_symbol()
+*/
 int starpu_perfmodel_unload_model(struct starpu_perfmodel *model);
+
 void starpu_perfmodel_get_model_path(const char *symbol, char *path, size_t maxlen);
 
+/**
+   Free internal memory used for sampling directory
+   management. It should only be called by an application which is not
+   calling starpu_shutdown() as this function already calls it. See for
+   example <c>tools/starpu_perfmodel_display.c</c>.
+*/
 void starpu_perfmodel_free_sampling_directories(void);
 
+/**
+   Return the architecture type of the worker \p workerid.
+*/
 struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid, unsigned sched_ctx_id);
+
 int starpu_perfmodel_get_narch_combs();
 int starpu_perfmodel_arch_comb_add(int ndevices, struct starpu_perfmodel_device* devices);
 int starpu_perfmodel_arch_comb_get(int ndevices, struct starpu_perfmodel_device *devices);
@@ -180,39 +349,88 @@ struct starpu_perfmodel_per_arch *starpu_perfmodel_get_model_per_devices(struct
 int starpu_perfmodel_set_per_devices_cost_function(struct starpu_perfmodel *model, int impl, starpu_perfmodel_per_arch_cost_function func, ...);
 int starpu_perfmodel_set_per_devices_size_base(struct starpu_perfmodel *model, int impl, starpu_perfmodel_per_arch_size_base func, ...);
 
+/**
+   Return the path to the debugging information for the performance model.
+*/
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, char *path, size_t maxlen, unsigned nimpl);
+
 char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype);
+
+/**
+   Return the architecture name for \p arch
+*/
 void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch *arch, char *archname, size_t maxlen, unsigned nimpl);
 
+/**
+   Return the estimated time of a task with the given model and the given footprint.
+*/
 double starpu_perfmodel_history_based_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, uint32_t footprint);
+
+/**
+   If starpu_init() is not used, starpu_perfmodel_initialize() should be called before calling starpu_perfmodel_* functions.
+*/
 void starpu_perfmodel_initialize(void);
+
+/**
+   Print a list of all performance models on \p output
+*/
 int starpu_perfmodel_list(FILE *output);
+
 void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
 int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output);
 int starpu_perfmodel_print_estimations(struct starpu_perfmodel *model, uint32_t footprint, FILE *output);
 
 int starpu_perfmodel_list_combs(FILE *output, struct starpu_perfmodel *model);
 
+/**
+   Feed the performance model \p model with an explicit
+   measurement \p measured (in µs), in addition to measurements done by StarPU
+   itself. This can be useful when the application already has an
+   existing set of measurements done in good conditions, that StarPU
+   could benefit from instead of doing on-line measurements. An example
+   of use can be seen in \ref PerformanceModelExample.
+*/
 void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double measured);
+
+/**
+   Print the directory name storing performance models on \p output
+*/
 void starpu_perfmodel_directory(FILE *output);
 
+/**
+   Print a matrix of bus bandwidths on \p f.
+*/
 void starpu_bus_print_bandwidth(FILE *f);
+
+/**
+   Print the affinity devices on \p f.
+*/
 void starpu_bus_print_affinity(FILE *f);
+
+/**
+   Print on \p f the name of the files containing the matrix of bus bandwidths, the affinity devices and the latency.
+*/
 void starpu_bus_print_filenames(FILE *f);
 
+/**
+   Return the bandwidth of data transfer between two memory nodes
+*/
 double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node);
+
+/**
+   Return the latency of data transfer between two memory nodes
+*/
 double starpu_transfer_latency(unsigned src_node, unsigned dst_node);
-double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size);
 
-extern struct starpu_perfmodel starpu_perfmodel_nop;
+/**
+   Return the estimated time to transfer a given size between two memory nodes.
+*/
+double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size);
 
 /**
-   Display statistics about the current data handles registered
-   within StarPU. StarPU must have been configured with the configure
-   option \ref enable-memory-stats "--enable-memory-stats" (see \ref
-   MemoryFeedback).
+   Performance model which just always returns 1µs.
 */
-void starpu_data_display_memory_stats();
+extern struct starpu_perfmodel starpu_perfmodel_nop;
 
 #ifdef __cplusplus
 }

+ 125 - 4
include/starpu_profiling.h

@@ -19,10 +19,10 @@
 #ifndef __STARPU_PROFILING_H__
 #define __STARPU_PROFILING_H__
 
-/** @defgroup
- *
- * @{
- */
+/**
+   @defgroup API_Profiling Profiling
+   @{
+*/
 
 #include <starpu.h>
 #include <errno.h>
@@ -33,48 +33,89 @@ extern "C"
 {
 #endif
 
+/**
+   Used when calling the function starpu_profiling_status_set() to disable profiling.
+*/
 #define STARPU_PROFILING_DISABLE	0
+/**
+   Used when calling the function starpu_profiling_status_set() to enable profiling.
+*/
 #define STARPU_PROFILING_ENABLE		1
 
+/**
+   Information about the execution of a task. It is accessible from
+   the field starpu_task::profiling_info if profiling was enabled.
+ */
 struct starpu_profiling_task_info
 {
+	/** Date of task submission (relative to the initialization of StarPU). */
 	struct timespec submit_time;
 
+	/** Time when the task was submitted to the scheduler. */
 	struct timespec push_start_time;
+	/** Time when the scheduler finished with the task submission. */
 	struct timespec push_end_time;
+	/** Time when the scheduler started to be requested for a task, and eventually gave that task. */
 	struct timespec pop_start_time;
+	/** Time when the scheduler finished providing the task for execution. */
 	struct timespec pop_end_time;
 
+	/** Time when the worker started fetching input data. */
 	struct timespec acquire_data_start_time;
+	/** Time when the worker finished fetching input data. */
 	struct timespec acquire_data_end_time;
 
+	/** Date of task execution beginning (relative to the initialization of StarPU). */
 	struct timespec start_time;
+	/** Date of task execution termination (relative to the initialization of StarPU). */
 	struct timespec end_time;
 
+	/** Time when the worker started releasing data. */
 	struct timespec release_data_start_time;
+	/** Time when the worker finished releasing data. */
 	struct timespec release_data_end_time;
 
+	/** Time when the worker started the application callback for the task. */
 	struct timespec callback_start_time;
+	/** Time when the worker finished the application callback for the task. */
 	struct timespec callback_end_time;
 
 	/* TODO add expected length, expected start/end ? */
+
+	/** Identifier of the worker which has executed the task. */
 	int workerid;
 
+	/** Number of cycles used by the task, only available in the MoviSim */
 	uint64_t used_cycles;
+	/** Number of cycles stalled within the task, only available in the MoviSim */
 	uint64_t stall_cycles;
+	/** Energy consumed by the task, in Joules */
 	double energy_consumed;
 };
 
+/**
+   Profiling information associated to a worker. The timing is
+   provided since the previous call to
+   starpu_profiling_worker_get_info()
+*/
 struct starpu_profiling_worker_info
 {
+	/** Starting date for the reported profiling measurements. */
 	struct timespec start_time;
+	/** Duration of the profiling measurement interval. */
 	struct timespec total_time;
+	/** Time spent by the worker to execute tasks during the profiling measurement interval. */
 	struct timespec executing_time;
+	/** Time spent idling by the worker during the profiling measurement interval. */
 	struct timespec sleeping_time;
+	/** Number of tasks executed by the worker during the profiling measurement interval. */
 	int executed_tasks;
 
+	/** Number of cycles used by the worker, only available in the MoviSim */
 	uint64_t used_cycles;
+	/** Number of cycles stalled within the worker, only available in the MoviSim */
 	uint64_t stall_cycles;
+	/** Energy consumed by the worker, in Joules */
 	double energy_consumed;
 
 	double flops;
@@ -82,15 +123,43 @@ struct starpu_profiling_worker_info
 
 struct starpu_profiling_bus_info
 {
+	/** Time of bus profiling startup. */
 	struct timespec start_time;
+	/** Total time of bus profiling. */
 	struct timespec total_time;
+	/** Number of bytes transferred during profiling. */
 	int long long transferred_bytes;
+	/** Number of transfers during profiling. */
 	int transfer_count;
 };
 
+/**
+   Reset performance counters and enable profiling if the
+   environment variable \ref STARPU_PROFILING is set to a positive value.
+*/
 void starpu_profiling_init(void);
+
+/**
+   Set the ID used for profiling trace filename. Has to be called before starpu_init().
+*/
 void starpu_profiling_set_id(int new_id);
+
+/**
+   Set the profiling status. Profiling is activated
+   by passing \ref STARPU_PROFILING_ENABLE in \p status. Passing
+   \ref STARPU_PROFILING_DISABLE disables profiling. Calling this function
+   resets all profiling measurements. When profiling is enabled, the
+   field starpu_task::profiling_info points to a valid structure
+   starpu_profiling_task_info containing information about the execution
+   of the task. Negative return values indicate an error, otherwise the
+   previous status is returned.
+*/
 int starpu_profiling_status_set(int status);
+
+/**
+   Return the current profiling status or a negative value in case
+   there was an error.
+*/
 int starpu_profiling_status_get(void);
 
 #ifdef BUILDING_STARPU
@@ -107,17 +176,43 @@ extern int _starpu_profiling;
 #endif
 #endif
 
+/**
+   Get the profiling info associated to the worker identified by
+   \p workerid, and reset the profiling measurements. If the argument \p
+   worker_info is <c>NULL</c>, only reset the counters associated to worker
+   \p workerid. Upon successful completion, this function returns 0.
+   Otherwise, a negative value is returned.
+*/
 int starpu_profiling_worker_get_info(int workerid, struct starpu_profiling_worker_info *worker_info);
 
+/**
+   Return the number of buses in the machine
+*/
 int starpu_bus_get_count(void);
+
+/**
+   Return the identifier of the bus between \p src and \p dst
+*/
 int starpu_bus_get_id(int src, int dst);
+
+/**
+   Return the source point of bus \p busid
+*/
 int starpu_bus_get_src(int busid);
+
+/**
+   Return the destination point of bus \p busid
+*/
 int starpu_bus_get_dst(int busid);
 void starpu_bus_set_direct(int busid, int direct);
 int starpu_bus_get_direct(int busid);
 void starpu_bus_set_ngpus(int busid, int ngpus);
 int starpu_bus_get_ngpus(int busid);
 
+/**
+   See _starpu_profiling_bus_helper_display_summary in src/profiling/profiling_helpers.c for a usage example.
+   Note that calling starpu_bus_get_profiling_info() resets the counters to zero.
+*/
 int starpu_bus_get_profiling_info(int busid, struct starpu_profiling_bus_info *bus_info);
 
 /* Some helper functions to manipulate profiling API output */
@@ -177,12 +272,38 @@ static __starpu_inline void starpu_timespec_sub(const struct timespec *a,
 #define starpu_timespec_cmp(a, b, CMP)                          \
 	(((a)->tv_sec == (b)->tv_sec) ? ((a)->tv_nsec CMP (b)->tv_nsec) : ((a)->tv_sec CMP (b)->tv_sec))
 
+/**
+   Return the time elapsed between \p start and \p end in microseconds.
+*/
 double starpu_timing_timespec_delay_us(struct timespec *start, struct timespec *end);
+
+/**
+   Convert the given timespec \p ts into microseconds
+*/
 double starpu_timing_timespec_to_us(struct timespec *ts);
 
+/**
+   Display statistics about the bus on \c stderr if the environment
+   variable \ref STARPU_BUS_STATS is defined. The function is called
+   automatically by starpu_shutdown().
+*/
 void starpu_profiling_bus_helper_display_summary(void);
+
+/**
+   Display statistics about the workers on \c stderr if the
+   environment variable \ref STARPU_WORKER_STATS is defined. The function is
+   called automatically by starpu_shutdown().
+*/
 void starpu_profiling_worker_helper_display_summary(void);
 
+/**
+   Display statistics about the current data handles registered
+   within StarPU. StarPU must have been configured with the configure
+   option \ref enable-memory-stats "--enable-memory-stats" (see \ref
+   MemoryFeedback).
+*/
+void starpu_data_display_memory_stats();
+
 #ifdef __cplusplus
 }
 #endif

+ 3 - 3
include/starpu_rand.h

@@ -19,9 +19,9 @@
 #ifndef __STARPU_RAND_H__
 #define __STARPU_RAND_H__
 
-/** @defgroup
- *
- * @{
+/**
+   @defgroup API_Random_Functions Random Functions
+   @{
  */
 
 #include <stdlib.h>

+ 17 - 4
include/starpu_scc.h

@@ -19,19 +19,32 @@
 #ifndef __STARPU_SCC_H__
 #define __STARPU_SCC_H__
 
-/** @defgroup
- *
- * @{
- */
+/**
+   @defgroup API_SCC_Extensions SCC Extensions
+   @{
+*/
 
 #include <starpu_config.h>
 
 #ifdef STARPU_USE_SCC
 
+/**
+   Type for SCC function symbols
+*/
 typedef void *starpu_scc_func_symbol_t;
 
+/**
+   Initiate a lookup on each SCC device to find the address of the
+   function named \p func_name, store them in the global array kernels
+   and return the index in the array through \p symbol.
+*/
 int starpu_scc_register_kernel(starpu_scc_func_symbol_t *symbol, const char *func_name);
 
+/**
+   If success, return the pointer to the function defined by \p symbol on
+   the device linked to the called device. This can for instance be used
+   in a starpu_scc_func_symbol_t implementation.
+*/
 starpu_scc_kernel_t starpu_scc_get_kernel(starpu_scc_func_symbol_t symbol);
 
 #endif /* STARPU_USE_SCC */

+ 506 - 10
include/starpu_sched_component.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2017                                     Arthur Chevalier
  * Copyright (C) 2013,2014,2017                           Inria
- * Copyright (C) 2014,2015,2017,2019                           CNRS
+ * Copyright (C) 2014,2015,2017,2019                      CNRS
  * Copyright (C) 2014-2019                                Université de Bordeaux
  * Copyright (C) 2013                                     Simon Archipoff
  *
@@ -21,10 +21,10 @@
 #ifndef __STARPU_SCHED_COMPONENT_H__
 #define __STARPU_SCHED_COMPONENT_H__
 
-/** @defgroup
- *
- * @{
- */
+/**
+   @defgroup API_Modularized_Scheduler Modularized Scheduler Interface
+   @{
+*/
 
 #include <starpu.h>
 
@@ -37,106 +37,379 @@ extern "C"
 {
 #endif
 
+/**
+   flags for starpu_sched_component::properties
+*/
 enum starpu_sched_component_properties
 {
+	/** indicate that all workers have the same starpu_worker_archtype */
 	STARPU_SCHED_COMPONENT_HOMOGENEOUS = (1<<0),
+	/** indicate that all workers have the same memory component */
 	STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE = (1<<1)
 };
 
+/**
+   indicate if component is homogeneous
+*/
 #define STARPU_SCHED_COMPONENT_IS_HOMOGENEOUS(component) ((component)->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS)
+
+/**
+   indicate if all workers have the same memory component
+*/
 #define STARPU_SCHED_COMPONENT_IS_SINGLE_MEMORY_NODE(component) ((component)->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE)
 
+/**
+   Structure for a scheduler module.  A scheduler is a
+   tree-like structure of them, some parts of scheduler can be shared by
+   several contexts to perform some local optimisations, so, for all
+   components, a list of parents is defined by \c sched_ctx_id. They
+   embed their specialised methods in a pseudo object-style, so calls are
+   like <c>component->push_task(component,task)</c>
+*/
 struct starpu_sched_component
 {
+	/** The tree containing the component*/
 	struct starpu_sched_tree *tree;
+	/** set of underlying workers */
 	struct starpu_bitmap *workers;
+	/**
+	   subset of starpu_sched_component::workers that is currently available in the context
+	   The push method should take this value into account, it is set with:
+	   component->workers UNION tree->workers UNION
+	   component->child[i]->workers_in_ctx iff exist x such as component->children[i]->parents[x] == component
+	*/
 	struct starpu_bitmap *workers_in_ctx;
+	/** private data */
 	void *data;
 	char *name;
+	/** number of component's children */
 	unsigned nchildren;
+	/** vector of component's children */
 	struct starpu_sched_component **children;
+	/** number of component's parents */
 	unsigned nparents;
+	/** vector of component's parents */
 	struct starpu_sched_component **parents;
 
+	/** add a child to component */
 	void (*add_child)(struct starpu_sched_component *component, struct starpu_sched_component *child);
+	/** remove a child from component */
 	void (*remove_child)(struct starpu_sched_component *component, struct starpu_sched_component *child);
 	void (*add_parent)(struct starpu_sched_component *component, struct starpu_sched_component *parent);
 	void (*remove_parent)(struct starpu_sched_component *component, struct starpu_sched_component *parent);
 
+	/**
+	   push a task in the scheduler module. this function is called to
+	   push a task on component subtree, this can either perform a
+	   recursive call on a child or store the task in the component,
+	   then it will be returned by a further pull_task call.
+	   the caller must ensure that component is able to execute task.
+	   This method must either return 0 if the task was properly stored or
+	   passed over to a child component, or return a value different from 0 if the
+	   task could not be consumed (e.g. the queue is full).
+	*/
 	int (*push_task)(struct starpu_sched_component *, struct starpu_task *);
+
+	/**
+	   pop a task from the scheduler module. this function is called by workers to get a task from their
+	   parents. this function should first return a locally stored task
+	   or perform a recursive call on the parents.
+	   the task returned by this function should be executable by the caller
+	*/
 	struct starpu_task *(*pull_task)(struct starpu_sched_component *from, struct starpu_sched_component *to);
 
+	/**
+	   This function is called by a component which implements a queue,
+	   allowing it to signify to its parents that an empty slot is
+	   available in its queue. This should return 1 if some tasks could be pushed
+	   The basic implementation of this function
+	   is a recursive call to its parents, the user has to specify a
+	   personally-made function to catch those calls.
+	*/
 	int (*can_push)(struct starpu_sched_component *from, struct starpu_sched_component *to);
+
+	/**
+	   This function allows a component to wake up a worker. It is
+	   currently called by a component which implements a queue, to
+	   signify to its children that a task has been pushed in its local
+	   queue, and is available to be popped by a worker, for example.
+	   This should return 1 if some container or worker could (or will) pull
+	   some tasks.
+	   The basic implementation of this function is a recursive call to
+	   its children, until at least one worker has been woken up.
+	*/
 	int (*can_pull)(struct starpu_sched_component *component);
 
 	int (*notify)(struct starpu_sched_component* component, int message_ID, void* arg);
 
+	/**
+	   heuristic to compute load of scheduler module. Basically the number of tasks divided by the sum
+	   of relatives speedup of workers available in context.
+	   estimated_load(component) = sum(estimated_load(component_children)) + nb_local_tasks / average(relative_speedup(underlying_worker))
+	*/
 	double (*estimated_load)(struct starpu_sched_component *component);
+	/**
+	   return the time when a worker will enter in starvation. This function is relevant only if the task->predicted
+	   member has been set.
+	*/
 	double (*estimated_end)(struct starpu_sched_component *component);
 
+	/**
+	   called by starpu_sched_component_destroy. Should free data allocated during creation
+	*/
 	void (*deinit_data)(struct starpu_sched_component *component);
+
+	/**
+	   this function is called for each component when workers are added or removed from a context
+	*/
 	void (*notify_change_workers)(struct starpu_sched_component *component);
 	int properties;
 
 #ifdef STARPU_HAVE_HWLOC
+	/**
+	   the hwloc object associated to scheduler module. points to the
+	   part of topology that is bound to this component, e.g. a NUMA
+	   node for a ws component that would balance load between
+	   underlying sockets
+	*/
 	hwloc_obj_t obj;
 #else
 	void *obj;
 #endif
 };
 
+/**
+   The actual scheduler
+*/
 struct starpu_sched_tree
 {
+	/**
+	   entry module of the scheduler
+	*/
 	struct starpu_sched_component *root;
+	/**
+	   set of workers available in this context, this value is used to mask workers in modules
+	*/
 	struct starpu_bitmap *workers;
+	/**
+	   context id of the scheduler
+	*/
 	unsigned sched_ctx_id;
+	/**
+	   lock used to protect the scheduler, it is taken in read mode when pushing a task and in write mode for adding or
+	   removing workers
+	*/
 	starpu_pthread_mutex_t lock;
 };
 
+void starpu_initialize_prio_center_policy(unsigned sched_ctx_id);
+
+/**
+   @name Scheduling Tree API
+   @{
+*/
+
+/**
+   create an empty initialized starpu_sched_tree
+*/
 struct starpu_sched_tree *starpu_sched_tree_create(unsigned sched_ctx_id) STARPU_ATTRIBUTE_MALLOC;
+/**
+   destroy \p tree and free all non-shared components in it.
+*/
 void starpu_sched_tree_destroy(struct starpu_sched_tree *tree);
 struct starpu_sched_tree *starpu_sched_tree_get(unsigned sched_ctx_id);
+/**
+   recursively set all starpu_sched_component::workers, do not take into account shared parts (except workers).
+*/
 void starpu_sched_tree_update_workers(struct starpu_sched_tree *t);
+/**
+   recursively set all starpu_sched_component::workers_in_ctx, do not take into account shared parts (except workers)
+*/
 void starpu_sched_tree_update_workers_in_ctx(struct starpu_sched_tree *t);
+/**
+   compatibility with starpu_sched_policy interface
+*/
 int starpu_sched_tree_push_task(struct starpu_task *task);
-int starpu_sched_component_push_task(struct starpu_sched_component *from, struct starpu_sched_component *to, struct starpu_task *task);
+/**
+   compatibility with starpu_sched_policy interface
+*/
 struct starpu_task *starpu_sched_tree_pop_task(unsigned sched_ctx);
+
+/**
+   Push a task to a component. This is a helper for <c>component->push_task(component, task)</c> plus tracing.
+*/
+int starpu_sched_component_push_task(struct starpu_sched_component *from, struct starpu_sched_component *to, struct starpu_task *task);
+
+/**
+   Pull a task from a component. This is a helper for <c>component->pull_task(component)</c> plus tracing.
+*/
 struct starpu_task *starpu_sched_component_pull_task(struct starpu_sched_component *from, struct starpu_sched_component *to);
+
 struct starpu_task* starpu_sched_component_pump_to(struct starpu_sched_component *component, struct starpu_sched_component *to, int* success);
 struct starpu_task* starpu_sched_component_pump_downstream(struct starpu_sched_component *component, int* success);
 int starpu_sched_component_send_can_push_to_parents(struct starpu_sched_component * component);
-
+/**
+   compatibility with starpu_sched_policy interface
+*/
 void starpu_sched_tree_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
+/**
+   compatibility with starpu_sched_policy interface
+*/
 void starpu_sched_tree_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 
+/**
+   Attach component \p child to parent \p parent. Some components may accept only one child, others accept several (e.g. MCT)
+*/
+void starpu_sched_component_connect(struct starpu_sched_component *parent, struct starpu_sched_component *child);
+
+/** @} */
+
+/**
+   @name Generic Scheduling Component API
+   @{
+*/
+
 typedef struct starpu_sched_component * (*starpu_sched_component_create_t)(struct starpu_sched_tree *tree, void *data);
+
+/**
+   allocate a component and initialize its fields with default values:
+   .pop_task make recursive call on father
+   .estimated_load compute relative speedup and tasks in sub tree
+   .estimated_end return the minimum of recursive call on children
+   .add_child is starpu_sched_component_add_child
+   .remove_child is starpu_sched_component_remove_child
+   .notify_change_workers does nothing
+   .deinit_data does nothing
+*/
 struct starpu_sched_component *starpu_sched_component_create(struct starpu_sched_tree *tree, const char *name) STARPU_ATTRIBUTE_MALLOC;
-void starpu_sched_component_add_child(struct starpu_sched_component* component, struct starpu_sched_component * child);
+
+/**
+   free data allocated by starpu_sched_component_create and call component->deinit_data(component).
+   The member starpu_sched_component::fathers[sched_ctx_id] of each child is set to <c>NULL</c> if it is equal to \p component
+*/
+
 void starpu_sched_component_destroy(struct starpu_sched_component *component);
+/**
+   recursively destroy non shared parts of a \p component 's tree
+*/
 void starpu_sched_component_destroy_rec(struct starpu_sched_component *component);
+
+void starpu_sched_component_add_child(struct starpu_sched_component* component, struct starpu_sched_component * child);
+
+/**
+   return true iff \p component can execute \p task. This function takes into account the workers available in the scheduling context
+*/
 int starpu_sched_component_can_execute_task(struct starpu_sched_component *component, struct starpu_task *task);
+
+/**
+   return a non-zero value if \p component can execute \p task.
+   write the predicted execution length for the best implementation on the best available worker at the address \p length.
+   this result is more relevant if starpu_sched_component::is_homogeneous is non <c>NULL</c>.
+   if a worker needs to be calibrated for an implementation, \p length is set to NaN.
+*/
 int STARPU_WARN_UNUSED_RESULT starpu_sched_component_execute_preds(struct starpu_sched_component *component, struct starpu_task *task, double *length);
+
+/**
+   return the average time to transfer \p task data to underlying \p component workers.
+*/
 double starpu_sched_component_transfer_length(struct starpu_sched_component *component, struct starpu_task *task);
+
 void starpu_sched_component_prefetch_on_node(struct starpu_sched_component *component, struct starpu_task *task);
 
-void starpu_sched_component_connect(struct starpu_sched_component *parent, struct starpu_sched_component *child);
+/** @} */
+
+/**
+   @name Worker Component API
+   @{
+*/
 
+/**
+   return the struct starpu_sched_component corresponding to \p workerid. Undefined if \p workerid is not a valid workerid
+*/
 struct starpu_sched_component *starpu_sched_component_worker_get(unsigned sched_ctx, int workerid);
 struct starpu_sched_component *starpu_sched_component_worker_new(unsigned sched_ctx, int workerid);
+
+/**
+   Create a combined worker that pushes tasks in parallel to workers \p workers (size \p nworkers).
+*/
 struct starpu_sched_component *starpu_sched_component_parallel_worker_create(struct starpu_sched_tree *tree, unsigned nworkers, unsigned *workers);
+
+/**
+   return the workerid of \p worker_component, undefined if starpu_sched_component_is_worker(worker_component) == 0
+*/
 int starpu_sched_component_worker_get_workerid(struct starpu_sched_component *worker_component);
+
+/**
+   return true iff \p component is a worker component
+*/
 int starpu_sched_component_is_worker(struct starpu_sched_component *component);
+
+/**
+   return true iff \p component is a simple worker component
+*/
 int starpu_sched_component_is_simple_worker(struct starpu_sched_component *component);
+
+/**
+   return true iff \p component is a combined worker component
+*/
 int starpu_sched_component_is_combined_worker(struct starpu_sched_component *component);
+
+/**
+   compatibility with starpu_sched_policy interface
+   update predictions for workers
+*/
 void starpu_sched_component_worker_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id);
+
+/**
+   compatibility with starpu_sched_policy interface
+*/
 void starpu_sched_component_worker_post_exec_hook(struct starpu_task *task, unsigned sched_ctx_id);
 
+/** @} */
+
+/**
+   @name Flow-control Fifo Component API
+   @{
+*/
+
+/**
+   default function for the can_push component method, just call can_push of parents until one of them returns non-zero
+*/
 int starpu_sched_component_can_push(struct starpu_sched_component * component, struct starpu_sched_component * to);
+
+/**
+   default function for the can_pull component method, just call can_pull of children until one of them returns non-zero
+*/
 int starpu_sched_component_can_pull(struct starpu_sched_component * component);
+
+/**
+   function for the can_pull component method, call can_pull of all children
+*/
 int starpu_sched_component_can_pull_all(struct starpu_sched_component * component);
+
+/**
+   default function for the estimated_load component method, just sum up the loads
+   of the children of the component.
+*/
 double starpu_sched_component_estimated_load(struct starpu_sched_component * component);
+
+/**
+   function that can be used for the estimated_end component method, compute the minimum completion time of the children.
+*/
 double starpu_sched_component_estimated_end_min(struct starpu_sched_component * component);
+
+/**
+   function that can be used for the estimated_end component method, compute
+   the minimum completion time of the children, and add to it an estimation of how long
+   the existing queued work, plus the \p exp_len work, will take to complete. This is typically
+   used instead of starpu_sched_component_estimated_end_min when the component
+   contains a queue of tasks, which thus needs to be added to the estimations.
+*/
 double starpu_sched_component_estimated_end_min_add(struct starpu_sched_component * component, double exp_len);
+
+/**
+   default function for the estimated_end component method, compute the average completion time of the children.
+*/
 double starpu_sched_component_estimated_end_average(struct starpu_sched_component * component);
 
 struct starpu_sched_component_fifo_data
@@ -145,9 +418,25 @@ struct starpu_sched_component_fifo_data
 	double exp_len_threshold;
 };
 
+/**
+   Return a struct starpu_sched_component with a fifo. A stable sort is performed according to task priorities.
+   A push_task call on this component does not perform recursive calls; underlying components will have to call pop_task to get it.
+   The starpu_sched_component::estimated_end function computes the estimated length by dividing the sequential length by the number of underlying workers.
+*/
 struct starpu_sched_component *starpu_sched_component_fifo_create(struct starpu_sched_tree *tree, struct starpu_sched_component_fifo_data *fifo_data) STARPU_ATTRIBUTE_MALLOC;
+
+/**
+   return true iff \p component is a fifo component
+*/
 int starpu_sched_component_is_fifo(struct starpu_sched_component *component);
 
+/** @} */
+
+/**
+   @name Flow-control Prio Component API
+   @{
+*/
+
 struct starpu_sched_component_prio_data
 {
 	unsigned ntasks_threshold;
@@ -156,19 +445,70 @@ struct starpu_sched_component_prio_data
 struct starpu_sched_component *starpu_sched_component_prio_create(struct starpu_sched_tree *tree, struct starpu_sched_component_prio_data *prio_data) STARPU_ATTRIBUTE_MALLOC;
 int starpu_sched_component_is_prio(struct starpu_sched_component *component);
 
+/** @} */
+
+/**
+   @name Resource-mapping Work-Stealing Component API
+   @{
+*/
+
+/**
+   return a component that performs work-stealing scheduling. Tasks are pushed in a round-robin way. estimated_end returns the average of the expected lengths of the fifos, starting at the average of the expected_end of its children. When a worker has to steal a task, it steals one in a round-robin way, and gets the last pushed task of the highest priority.
+*/
 struct starpu_sched_component *starpu_sched_component_work_stealing_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
+
+/**
+   return true iff \p component is a work stealing component
+ */
 int starpu_sched_component_is_work_stealing(struct starpu_sched_component *component);
+
+/**
+   undefined if there is no work stealing component in the scheduler. If any, \p task is pushed in a default way if the caller is the application, and in the caller's fifo if it is a worker.
+*/
 int starpu_sched_tree_work_stealing_push_task(struct starpu_task *task);
 
+/** @} */
+
+/**
+   @name Resource-mapping Random Component API
+   @{
+*/
+
+/**
+   create a component that performs random scheduling
+*/
 struct starpu_sched_component *starpu_sched_component_random_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
+
+/**
+   return true iff \p component is a random component
+*/
 int starpu_sched_component_is_random(struct starpu_sched_component *);
 
+/** @} */
+
+/**
+   @name Resource-mapping Eager Component API
+   @{
+*/
+
 struct starpu_sched_component *starpu_sched_component_eager_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
 int starpu_sched_component_is_eager(struct starpu_sched_component *);
 
+/** @} */
+
+/**
+   @name Resource-mapping Eager-Calibration Component API
+   @{
+*/
+
 struct starpu_sched_component *starpu_sched_component_eager_calibration_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
 int starpu_sched_component_is_eager_calibration(struct starpu_sched_component *);
 
+/** @} */
+
+/**
+   @name Resource-mapping MCT Component API
+   @{
+*/
+
 struct starpu_sched_component_mct_data
 {
 	double alpha;
@@ -176,14 +516,48 @@ struct starpu_sched_component_mct_data
 	double _gamma;
 	double idle_power;
 };
+
+/**
+   create a component with mct_data parameters. The mct component does not
+   do anything but push tasks to no_perf_model_component and
+   calibrating_component
+*/
 struct starpu_sched_component *starpu_sched_component_mct_create(struct starpu_sched_tree *tree, struct starpu_sched_component_mct_data *mct_data) STARPU_ATTRIBUTE_MALLOC;
+
 int starpu_sched_component_is_mct(struct starpu_sched_component *component);
 
+/** @} */
+
+/**
+   @name Resource-mapping Heft Component API
+   @{
+*/
+
 struct starpu_sched_component *starpu_sched_component_heft_create(struct starpu_sched_tree *tree, struct starpu_sched_component_mct_data *mct_data) STARPU_ATTRIBUTE_MALLOC;
 int starpu_sched_component_is_heft(struct starpu_sched_component *component);
 
+/** @} */
+
+/**
+   @name Special-purpose Best_Implementation Component API
+   @{
+*/
+
+/**
+   Select the implementation that offers the shortest computation length for the first worker that can execute the task,
+   or an implementation that needs to be calibrated.
+   Also set starpu_task::predicted and starpu_task::predicted_transfer for memory component of the first suitable workerid.
+   If starpu_sched_component::push method is called and starpu_sched_component::nchild > 1 the result is undefined.
+*/
 struct starpu_sched_component *starpu_sched_component_best_implementation_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
 
+/** @} */
+
+/**
+   @name Special-purpose Perfmodel_Select Component API
+   @{
+*/
+
 struct starpu_sched_component_perfmodel_select_data
 {
 	struct starpu_sched_component *calibrator_component;
@@ -193,46 +567,168 @@ struct starpu_sched_component_perfmodel_select_data
 struct starpu_sched_component *starpu_sched_component_perfmodel_select_create(struct starpu_sched_tree *tree, struct starpu_sched_component_perfmodel_select_data *perfmodel_select_data) STARPU_ATTRIBUTE_MALLOC;
 int starpu_sched_component_is_perfmodel_select(struct starpu_sched_component *component);
 
-void starpu_initialize_prio_center_policy(unsigned sched_ctx_id);
+/** @} */
 
+/**
+   @name Recipe Component API
+   @{
+*/
+
+/**
+   parameters for starpu_sched_component_composed_component_create
+*/
 struct starpu_sched_component_composed_recipe;
+
+/**
+   return an empty recipe for a composed component, it should not be used without modification
+*/
 struct starpu_sched_component_composed_recipe *starpu_sched_component_composed_recipe_create(void) STARPU_ATTRIBUTE_MALLOC;
+
+/**
+   return a recipe to build a composed component with a \p create_component
+*/
 struct starpu_sched_component_composed_recipe *starpu_sched_component_composed_recipe_create_singleton(struct starpu_sched_component *(*create_component)(struct starpu_sched_tree *tree, void *arg), void *arg) STARPU_ATTRIBUTE_MALLOC;
+
+/**
+   add \p create_component under all previous components in recipe
+*/
 void starpu_sched_component_composed_recipe_add(struct starpu_sched_component_composed_recipe *recipe, struct starpu_sched_component *(*create_component)(struct starpu_sched_tree *tree, void *arg), void *arg);
+
+/**
+   destroy composed_sched_component, this should be done after starpu_sched_component_composed_component_create was called
+*/
 void starpu_sched_component_composed_recipe_destroy(struct starpu_sched_component_composed_recipe *);
+
+/**
+   create a component that behaves as if all the components of \p recipe were linked, except that the starpu_sched_component_is_foo functions cannot be used on it.
+   If \p recipe contains a single create_foo arg_foo pair, create_foo(arg_foo) is returned instead of a composed component
+*/
 struct starpu_sched_component *starpu_sched_component_composed_component_create(struct starpu_sched_tree *tree, struct starpu_sched_component_composed_recipe *recipe) STARPU_ATTRIBUTE_MALLOC;
 
 #ifdef STARPU_HAVE_HWLOC
+/**
+   Define how to build a scheduler according to the topology. Each level (except for hwloc_machine_composed_sched_component) can be <c>NULL</c>, then
+   the level is just skipped. Bugs everywhere, do not rely on it.
+*/
 struct starpu_sched_component_specs
 {
+	/**
+	   the composed component to put on the top of the scheduler
+	   this member must not be <c>NULL</c> as it is the root of the topology
+	*/
 	struct starpu_sched_component_composed_recipe *hwloc_machine_composed_sched_component;
+	/**
+	   the composed component to put for each memory component
+	*/
 	struct starpu_sched_component_composed_recipe *hwloc_component_composed_sched_component;
+	/**
+	   the composed component to put for each socket
+	*/
 	struct starpu_sched_component_composed_recipe *hwloc_socket_composed_sched_component;
+	/**
+	   the composed component to put for each cache
+	*/
 	struct starpu_sched_component_composed_recipe *hwloc_cache_composed_sched_component;
 
+	/**
+	   a function that returns a starpu_sched_component_composed_recipe to put on top of a worker of type \p archtype.
+	   <c>NULL</c> is a valid return value, then no component will be added on top
+	*/
 	struct starpu_sched_component_composed_recipe *(*worker_composed_sched_component)(enum starpu_worker_archtype archtype);
+	/**
+	   this flag is a dirty hack because of the poor expressivity of this interface. For example, if you want to build
+	   a heft component with a fifo component per numa component, and you also have GPUs, if this flag is set, GPUs will share those fifos.
+	   If this flag is not set, a new fifo will be built for each of them (if they have the same starpu_perf_arch and the same
+	   numa component, it will be shared). It indicates whether heterogeneous workers should be brothers or cousins, for example whether a GPU and a CPU should share their numa node or not
+	*/
 	int mix_heterogeneous_workers;
 };
 
+
+/**
+   build a scheduler for \p sched_ctx_id according to \p s and the hwloc topology of the machine.
+*/
 struct starpu_sched_tree *starpu_sched_component_make_scheduler(unsigned sched_ctx_id, struct starpu_sched_component_specs s);
 #endif /* STARPU_HAVE_HWLOC */
 
+/** @} */
+
+/**
+   @name Basic API
+   @{
+*/
+
 #define STARPU_SCHED_SIMPLE_DECIDE_MASK		(3<<0)
+
+/**
+   Request to create downstream queues per worker, i.e. the scheduling decision-making component will choose exactly which workers tasks should go to.
+*/
 #define STARPU_SCHED_SIMPLE_DECIDE_WORKERS	(1<<0)
+
+/**
+   Request to create downstream queues per memory nodes, i.e. the scheduling decision-making component will choose which memory node tasks will go to.
+*/
 #define STARPU_SCHED_SIMPLE_DECIDE_MEMNODES	(2<<0)
+
+/**
+   Request to create downstream queues per computation arch, i.e. the scheduling decision-making component will choose whether tasks go to CPUs, or CUDA, or OpenCL, etc.
+*/
 #define STARPU_SCHED_SIMPLE_DECIDE_ARCHS	(3<<0)
 
+/**
+   Request to add a perfmodel selector above the scheduling decision-making component. That way, only tasks with a calibrated performance model will be given to the component, other tasks will go to an eager branch that will distribute tasks so that their performance models will get calibrated.
+   In other words, this is needed when using a component which needs performance models for tasks.
+*/
 #define STARPU_SCHED_SIMPLE_PERFMODEL		(1<<4)
+
+/**
+   Request that a component be added just above workers, that chooses the best task implementation.
+*/
 #define STARPU_SCHED_SIMPLE_IMPL		(1<<5)
+
+/**
+   Request to create a fifo above the scheduling decision-making component, otherwise tasks will be pushed directly to the component.
+
+   This is useful to store tasks if there is a fifo below which limits the number of tasks to be scheduled in advance. The scheduling decision-making component can also store tasks itself, in which case this flag is not useful.
+*/
 #define STARPU_SCHED_SIMPLE_FIFO_ABOVE		(1<<6)
+
+/**
+   Request that the fifo above be sorted by priorities
+*/
 #define STARPU_SCHED_SIMPLE_FIFO_ABOVE_PRIO	(1<<7)
+
+/**
+   Request to create fifos below the scheduling decision-making component, otherwise tasks will be pulled directly from workers.
+
+   This is useful to be able to schedule a (tunable) small number of tasks in advance only.
+*/
 #define STARPU_SCHED_SIMPLE_FIFOS_BELOW		(1<<8)
+
+/**
+   Request that the fifos below be sorted by priorities
+*/
 #define STARPU_SCHED_SIMPLE_FIFOS_BELOW_PRIO	(1<<9)
+
+/**
+   Request that work between workers using the same fifo below be distributed using a work stealing component.
+*/
 #define STARPU_SCHED_SIMPLE_WS_BELOW		(1<<10)
+
+/**
+   Request to not only choose between simple workers, but also choose between combined workers.
+*/
 #define STARPU_SCHED_SIMPLE_COMBINED_WORKERS	(1<<11)
 
+/**
+   Create a simple modular scheduler tree around a scheduling decision-making
+   component \p component. The details of what should be built around \p component
+   are described by \p flags. The different STARPU_SCHED_SIMPLE_DECIDE_* flags are
+   mutually exclusive. \p data is passed to the \p create_decision_component
+   function when creating the decision component.
+*/
 void starpu_sched_component_initialize_simple_scheduler(starpu_sched_component_create_t create_decision_component, void *data, unsigned flags, unsigned sched_ctx_id);
 
+/** @} */
+
 #define STARPU_COMPONENT_MUTEX_LOCK(m) \
 do \
 { \