Pārlūkot izejas kodu

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into fpga

Samuel Thibault 4 gadi atpakaļ
vecāks
revīzija
c3390198fb

+ 1 - 0
ChangeLog

@@ -41,6 +41,7 @@ New features:
     StarPU.
   * Add a task prefetch level, to improve retaining data in accelerators so we
     can make prefetch more aggressive.
+  * Add starpu_data_dup_ro().
 
 Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.

+ 1 - 1
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -59,7 +59,7 @@ StarPU can use the FxT library (see
 https://savannah.nongnu.org/projects/fkt/) to generate traces
 with a limited runtime overhead.
 
-You can get a tarball from http://download.savannah.gnu.org/releases/fkt/
+You can get a tarball from http://download.savannah.gnu.org/releases/fkt/?C=M
 
 Compiling and installing the FxT library in the <c>$FXTDIR</c> path is
 done following the standard procedure:

+ 15 - 0
include/starpu_helper.h

@@ -187,6 +187,21 @@ double starpu_timing_now(void);
 int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_handle, int asynchronous, void (*callback_func)(void*), void *callback_arg);
 
 /**
+   Create a copy of \p src_handle, and return a new handle in \p dst_handle,
+   which is to be used only for read accesses. This allows StarPU to optimize it
+   by not actually copying the data whenever possible (e.g. it may
+   simply return src_handle itself).
+   The parameter \p asynchronous indicates whether the function should block
+   or not. In the case of an asynchronous call, it is possible to synchronize
+   with the termination of this operation either by the means of implicit
+   dependencies (if enabled) or by calling starpu_task_wait_for_all(). If
+   \p callback_func is not <c>NULL</c>, this callback function is executed after
+   the handle has been copied, and it is given the pointer \p
+   callback_arg as argument.
+*/
+int starpu_data_dup_ro(starpu_data_handle_t *dst_handle, starpu_data_handle_t src_handle, int asynchronous, void (*callback_func)(void*), void *callback_arg);
+
+/**
    Call hwloc-ps to display binding of each processus and thread running on
    the machine.<br>
    Use the environment variable \ref STARPU_DISPLAY_BINDINGS to automatically

+ 5 - 0
src/core/dependencies/implicit_data_deps.c

@@ -227,7 +227,12 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 
 		if (mode & STARPU_W || mode == STARPU_REDUX)
 		{
+
+			STARPU_ASSERT_MSG(!handle->readonly, "Read-only handles can not be written to");
+
 			handle->initialized = 1;
+			/* We will change our value, disconnect from our readonly duplicates */
+			handle->readonly_dup = NULL;
 			if (write_hook)
 				write_hook(handle);
 		}

+ 11 - 7
src/datawizard/coherency.c

@@ -196,10 +196,12 @@ void _starpu_update_data_state(starpu_data_handle_t handle,
 		unsigned node;
 		for (node = 0; node < nnodes; node++)
 		{
-                       _STARPU_TRACE_DATA_STATE_INVALID(handle, node);
+			if (handle->per_node[node].state != STARPU_INVALID)
+			       _STARPU_TRACE_DATA_STATE_INVALID(handle, node);
 			handle->per_node[node].state = STARPU_INVALID;
 		}
-               _STARPU_TRACE_DATA_STATE_OWNER(handle, requesting_node);
+		if (requesting_replicate->state != STARPU_OWNER)
+			_STARPU_TRACE_DATA_STATE_OWNER(handle, requesting_node);
 		requesting_replicate->state = STARPU_OWNER;
 		if (handle->home_node != -1 && handle->per_node[handle->home_node].state == STARPU_INVALID)
 			/* Notify that this MC is now dirty */
@@ -215,13 +217,15 @@ void _starpu_update_data_state(starpu_data_handle_t handle,
 			for (node = 0; node < nnodes; node++)
 			{
 				struct _starpu_data_replicate *replicate = &handle->per_node[node];
-                               if (replicate->state != STARPU_INVALID)
-			       {
-                                       _STARPU_TRACE_DATA_STATE_SHARED(handle, node);
+				if (replicate->state != STARPU_INVALID)
+				{
+					if (replicate->state != STARPU_SHARED)
+						_STARPU_TRACE_DATA_STATE_SHARED(handle, node);
 					replicate->state = STARPU_SHARED;
-                               }
+				}
 			}
-                       _STARPU_TRACE_DATA_STATE_SHARED(handle, requesting_node);
+			if (requesting_replicate->state != STARPU_SHARED)
+				_STARPU_TRACE_DATA_STATE_SHARED(handle, requesting_node);
 			requesting_replicate->state = STARPU_SHARED;
 		}
 	}

+ 24 - 13
src/datawizard/coherency.h

@@ -170,7 +170,7 @@ struct _starpu_data_state
 	 */
 	unsigned partitioned;
 	/** Whether a partition plan is currently submitted in readonly mode */
-	unsigned readonly:1;
+	unsigned part_readonly:1;
 
 	/** Whether our father is currently partitioned into ourself */
 	unsigned active:1;
@@ -191,16 +191,35 @@ struct _starpu_data_state
 	/** what is the default write-through mask for that data ? */
 	uint32_t wt_mask;
 
+	/** for a readonly handle, the number of times the same handle has been returned
+	    again, and thus the number of unregistration requests we have to ignore */
+	unsigned aliases;
+	/** for a non-readonly handle, a read-only duplicate that we can
+	    return from starpu_data_dup_ro */
+	starpu_data_handle_t readonly_dup;
+
 	/** in some case, the application may explicitly tell StarPU that a
  	 * piece of data is not likely to be used soon again */
-	unsigned is_not_important;
+	unsigned is_not_important:1;
 
 	/** Does StarPU have to enforce some implicit data-dependencies ? */
-	unsigned sequential_consistency;
+	unsigned sequential_consistency:1;
 	/** Is the data initialized, or a task is already submitted to initialize it */
-	unsigned initialized;
+	unsigned initialized:1;
+	/** Whether we shall not ever write to this handle, thus allowing various optimizations */
+	unsigned readonly:1;
 	/** Can the data be pushed to the disk? */
-	unsigned ooc;
+	unsigned ooc:1;
+
+	/** Whether lazy unregistration was requested through starpu_data_unregister_submit */
+	unsigned lazy_unregister:1;
+
+	/** Whether automatic planned partitioning/unpartitioning should not be done */
+	int partition_automatic_disabled:1;
+
+#ifdef STARPU_OPENMP
+	unsigned removed_from_context_hash:1;
+#endif
 
 	/** This lock should protect any operation to enforce
 	 * sequential_consistency */
@@ -254,12 +273,6 @@ struct _starpu_data_state
 	/** Final request for write invalidation */
 	struct _starpu_data_request *write_invalidation_req;
 
-	unsigned lazy_unregister;
-
-#ifdef STARPU_OPENMP
-	unsigned removed_from_context_hash;
-#endif
-
         /** Used for MPI */
 	void *mpi_data;
 
@@ -279,8 +292,6 @@ struct _starpu_data_state
 	 * took it yet */
 	int last_locality;
 
-	int partition_automatic_disabled;
-
 	/** Application-provided coordinates. The maximum dimension (5) is
 	  * relatively arbitrary. */
 	unsigned dimensions;

+ 22 - 73
src/datawizard/filters.c

@@ -232,19 +232,9 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		memset(child, 0, sizeof(*child));
 		_starpu_data_handle_init(child, ops, initial_handle->mf_node);
 
-		//child->nchildren = 0;
-		//child->nplans = 0;
-		//child->switch_cl = NULL;
-		//child->partitioned = 0;
-		//child->readonly = 0;
-		child->active = inherit_state;
-		//child->active_ro = 0;
-                //child->mpi_data = NULL;
 		child->root_handle = initial_handle->root_handle;
 		child->father_handle = initial_handle;
-		//child->active_children = NULL;
-		//child->active_readonly_children = NULL;
-		//child->nactive_readonly_children = 0;
+
 		child->nsiblings = nparts;
 		if (inherit_state)
 		{
@@ -255,59 +245,25 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		child->sibling_index = i;
 		child->depth = initial_handle->depth + 1;
 
-		child->is_not_important = initial_handle->is_not_important;
-		child->wt_mask = initial_handle->wt_mask;
+		child->active = inherit_state;
+
 		child->home_node = initial_handle->home_node;
+		child->wt_mask = initial_handle->wt_mask;
 
-		/* initialize the chunk lock */
-		_starpu_data_requester_prio_list_init(&child->req_list);
-		_starpu_data_requester_prio_list_init(&child->reduction_req_list);
-		//child->reduction_tmp_handles = NULL;
-		//child->write_invalidation_req = NULL;
-		//child->refcnt = 0;
-		//child->unlocking_reqs = 0;
-		//child->busy_count = 0;
-		//child->busy_waiting = 0;
-		STARPU_PTHREAD_MUTEX_INIT0(&child->busy_mutex, NULL);
-		STARPU_PTHREAD_COND_INIT0(&child->busy_cond, NULL);
-		//child->reduction_refcnt = 0;
-		_starpu_spin_init(&child->header_lock);
+		child->aliases = initial_handle->aliases;
+
+		child->is_not_important = initial_handle->is_not_important;
 
 		child->sequential_consistency = initial_handle->sequential_consistency;
 		child->initialized = initial_handle->initialized;
+		child->readonly = initial_handle->readonly;
 		child->ooc = initial_handle->ooc;
 
-		STARPU_PTHREAD_MUTEX_INIT0(&child->sequential_consistency_mutex, NULL);
-		child->last_submitted_mode = STARPU_R;
-		//child->last_sync_task = NULL;
-		//child->last_submitted_accessors.task = NULL;
-		child->last_submitted_accessors.next = &child->last_submitted_accessors;
-		child->last_submitted_accessors.prev = &child->last_submitted_accessors;
-		//child->post_sync_tasks = NULL;
-		/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
-		STARPU_HG_DISABLE_CHECKING(child->post_sync_tasks_cnt);
-		//child->post_sync_tasks_cnt = 0;
-
 		/* The methods used for reduction are propagated to the
 		 * children. */
 		child->redux_cl = initial_handle->redux_cl;
 		child->init_cl = initial_handle->init_cl;
 
-#ifdef STARPU_USE_FXT
-		//child->last_submitted_ghost_sync_id_is_valid = 0;
-		//child->last_submitted_ghost_sync_id = 0;
-		//child->last_submitted_ghost_accessors_id = NULL;
-#endif
-
-		if (_starpu_global_arbiter)
-			/* Just for testing purpose */
-			starpu_data_assign_arbiter(child, _starpu_global_arbiter);
-		else
-		{
-			//child->arbiter = NULL;
-		}
-		_starpu_data_requester_prio_list_init0(&child->arbitered_req_list);
-
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
 			struct _starpu_data_replicate *initial_replicate;
@@ -346,13 +302,6 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 			f->filter_func(initial_interface, child_interface, f, i, nparts);
 		}
 
-		//child->per_worker = NULL;
-		//child->user_data = NULL;
-
-		/* We compute the size and the footprint of the child once and
-		 * store it in the handle */
-		child->footprint = _starpu_compute_data_footprint(child);
-
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
 			if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
@@ -693,7 +642,7 @@ void _starpu_data_partition_submit(starpu_data_handle_t initial_handle, unsigned
 	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
 	_starpu_spin_lock(&initial_handle->header_lock);
 	STARPU_ASSERT_MSG(initial_handle->partitioned == 0, "One can't submit several partition plannings at the same time");
-	STARPU_ASSERT_MSG(initial_handle->readonly == 0, "One can't submit a partition planning while a readonly partitioning is active");
+	STARPU_ASSERT_MSG(initial_handle->part_readonly == 0, "One can't submit a partition planning while a readonly partitioning is active");
 	STARPU_ASSERT_MSG(nparts > 0, "One can't partition into 0 parts");
 	initial_handle->partitioned++;
 	initial_handle->active_children = children[0]->siblings;
@@ -753,10 +702,10 @@ void starpu_data_partition_readonly_submit(starpu_data_handle_t initial_handle,
 	unsigned i;
 	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
 	_starpu_spin_lock(&initial_handle->header_lock);
-	STARPU_ASSERT_MSG(initial_handle->partitioned == 0 || initial_handle->readonly, "One can't submit a readonly partition planning at the same time as a readwrite partition planning");
+	STARPU_ASSERT_MSG(initial_handle->partitioned == 0 || initial_handle->part_readonly, "One can't submit a readonly partition planning at the same time as a readwrite partition planning");
 	STARPU_ASSERT_MSG(nparts > 0, "One can't partition into 0 parts");
 	initial_handle->partitioned++;
-	initial_handle->readonly = 1;
+	initial_handle->part_readonly = 1;
 	if (initial_handle->nactive_readonly_children < initial_handle->partitioned)
 	{
 		_STARPU_REALLOC(initial_handle->active_readonly_children, initial_handle->partitioned * sizeof(initial_handle->active_readonly_children[0]));
@@ -790,9 +739,9 @@ void starpu_data_partition_readwrite_upgrade_submit(starpu_data_handle_t initial
 	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
 	_starpu_spin_lock(&initial_handle->header_lock);
 	STARPU_ASSERT_MSG(initial_handle->partitioned == 1, "One can't upgrade a readonly partition planning to readwrite while other readonly partition plannings are active");
-	STARPU_ASSERT_MSG(initial_handle->readonly == 1, "One can only upgrade a readonly partition planning");
+	STARPU_ASSERT_MSG(initial_handle->part_readonly == 1, "One can only upgrade a readonly partition planning");
 	STARPU_ASSERT_MSG(nparts > 0, "One can't partition into 0 parts");
-	initial_handle->readonly = 0;
+	initial_handle->part_readonly = 0;
 	initial_handle->active_children = initial_handle->active_readonly_children[0];
 	initial_handle->active_readonly_children[0] = NULL;
 	_starpu_spin_unlock(&initial_handle->header_lock);
@@ -819,7 +768,7 @@ void _starpu_data_unpartition_submit(starpu_data_handle_t initial_handle, unsign
 	_starpu_spin_lock(&initial_handle->header_lock);
 	STARPU_ASSERT_MSG(initial_handle->partitioned >= 1, "No partition planning is active for handle %p", initial_handle);
 	STARPU_ASSERT_MSG(nparts > 0, "One can't partition into 0 parts");
-	if (initial_handle->readonly)
+	if (initial_handle->part_readonly)
 	{
 		/* Replace this children set with the last set in the list of readonly children sets */
 		for (i = 0; i < initial_handle->partitioned-1; i++)
@@ -838,7 +787,7 @@ void _starpu_data_unpartition_submit(starpu_data_handle_t initial_handle, unsign
 	}
 	initial_handle->partitioned--;
 	if (!initial_handle->partitioned)
-		initial_handle->readonly = 0;
+		initial_handle->part_readonly = 0;
 	initial_handle->active_children = NULL;
 	_starpu_spin_unlock(&initial_handle->header_lock);
 
@@ -914,7 +863,7 @@ void starpu_data_unpartition_readonly_submit(starpu_data_handle_t initial_handle
 	_starpu_spin_lock(&initial_handle->header_lock);
 	STARPU_ASSERT_MSG(initial_handle->partitioned >= 1, "No partition planning is active for handle %p", initial_handle);
 	STARPU_ASSERT_MSG(nparts > 0, "One can't partition into 0 parts");
-	initial_handle->readonly = 1;
+	initial_handle->part_readonly = 1;
 	_starpu_spin_unlock(&initial_handle->header_lock);
 
 	unsigned i, n;
@@ -941,7 +890,7 @@ void starpu_data_unpartition_submit_r(starpu_data_handle_t ancestor, int gatheri
 		/* It's already unpartitioned */
 		return;
 	_STARPU_DEBUG("ancestor %p needs unpartitioning\n", ancestor);
-	if (ancestor->readonly)
+	if (ancestor->part_readonly)
 	{
 		unsigned n = ancestor->partitioned;
 		/* Uh, has to go through all read-only partitions */
@@ -988,16 +937,16 @@ static void _starpu_data_partition_access_look_up(starpu_data_handle_t ancestor,
 		_STARPU_DEBUG("ancestor %p was ready\n", ancestor);
 
 	/* We shouldn't be called for nothing */
-	STARPU_ASSERT(!ancestor->partitioned || !target || ancestor->active_children != target->siblings || (ancestor->readonly && write));
+	STARPU_ASSERT(!ancestor->partitioned || !target || ancestor->active_children != target->siblings || (ancestor->part_readonly && write));
 
 	/* Then unpartition ancestor if needed */
 	if (ancestor->partitioned &&
 			/* Not the right children, unpartition ourself */
 			((target && write && ancestor->active_children != target->siblings) ||
-			 (target && !write && !ancestor->readonly) ||
+			 (target && !write && !ancestor->part_readonly) ||
 			/* We are partitioned and we want to write or some child
 			 * is writing and we want to read, unpartition ourself*/
-			(!target && (write || !ancestor->readonly))))
+			(!target && (write || !ancestor->part_readonly))))
 	{
 #ifdef STARPU_DEVEL
 #warning FIXME: better choose gathering node
@@ -1016,14 +965,14 @@ static void _starpu_data_partition_access_look_up(starpu_data_handle_t ancestor,
 	if (ancestor->partitioned)
 	{
 		/* That must be readonly, otherwise we would have unpartitioned it */
-		STARPU_ASSERT(ancestor->readonly);
+		STARPU_ASSERT(ancestor->part_readonly);
 		if (write)
 		{
 			_STARPU_DEBUG("ancestor %p is already partitioned RO, turn RW\n", ancestor);
 			/* Already partitioned, normally it's already for the target */
 			STARPU_ASSERT(ancestor->active_children == target->siblings);
 			/* And we are here just because we haven't partitioned rw */
-			STARPU_ASSERT(ancestor->readonly && write);
+			STARPU_ASSERT(ancestor->part_readonly && write);
 			/* So we just need to upgrade ro to rw */
 			starpu_data_partition_readwrite_upgrade_submit(ancestor, target->nsiblings, target->siblings);
 		}

+ 139 - 72
src/datawizard/interfaces/data_interface.c

@@ -259,90 +259,36 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 
 	STARPU_ASSERT(handle);
 
-	/* initialize the new lock */
-	_starpu_data_requester_prio_list_init0(&handle->req_list);
-	//handle->refcnt = 0;
-	//handle->unlocking_reqs = 0;
-	//handle->busy_count = 0;
-	//handle->busy_waiting = 0;
-	STARPU_PTHREAD_MUTEX_INIT0(&handle->busy_mutex, NULL);
-	STARPU_PTHREAD_COND_INIT0(&handle->busy_cond, NULL);
-	_starpu_spin_init(&handle->header_lock);
-
 	/* first take care to properly lock the data */
 	_starpu_spin_lock(&handle->header_lock);
 
-	/* there is no hierarchy yet */
-	//handle->nchildren = 0;
-	//handle->nplans = 0;
-	//handle->switch_cl = NULL;
-	//handle->partitioned = 0;
-	//handle->readonly = 0;
-	handle->active = 1;
-	//handle->active_ro = 0;
 	handle->root_handle = handle;
 	//handle->father_handle = NULL;
-	//handle->active_children = NULL;
-	//handle->active_readonly_children = NULL;
-	//handle->nactive_readonly_children = 0;
 	//handle->nsiblings = 0;
 	//handle->siblings = NULL;
 	//handle->sibling_index = 0; /* could be anything for the root */
 	handle->depth = 1; /* the tree is just a node yet */
-        //handle->mpi_data = NULL; /* invalid until set */
+
+	handle->active = 1;
+
+	handle->home_node = home_node;
+
+	handle->wt_mask = wt_mask;
+
+	//handle->aliases = 0;
 
 	//handle->is_not_important = 0;
 
 	handle->sequential_consistency =
 		starpu_data_get_default_sequential_consistency_flag();
 	handle->initialized = home_node != -1;
+	//handle->readonly = 0;
 	handle->ooc = 1;
 
-	STARPU_PTHREAD_MUTEX_INIT0(&handle->sequential_consistency_mutex, NULL);
-	handle->last_submitted_mode = STARPU_R;
-	//handle->last_sync_task = NULL;
-	//handle->last_submitted_accessors.task = NULL;
-	handle->last_submitted_accessors.next = &handle->last_submitted_accessors;
-	handle->last_submitted_accessors.prev = &handle->last_submitted_accessors;
-	//handle->post_sync_tasks = NULL;
-
-	/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
-	STARPU_HG_DISABLE_CHECKING(handle->post_sync_tasks_cnt);
-	//handle->post_sync_tasks_cnt = 0;
-
 	/* By default, there are no methods available to perform a reduction */
 	//handle->redux_cl = NULL;
 	//handle->init_cl = NULL;
 
-	//handle->reduction_refcnt = 0;
-	_starpu_data_requester_prio_list_init0(&handle->reduction_req_list);
-	//handle->reduction_tmp_handles = NULL;
-	//handle->write_invalidation_req = NULL;
-
-#ifdef STARPU_USE_FXT
-	//handle->last_submitted_ghost_sync_id_is_valid = 0;
-	//handle->last_submitted_ghost_sync_id = 0;
-	//handle->last_submitted_ghost_accessors_id = NULL;
-#endif
-
-	handle->wt_mask = wt_mask;
-
-	/* Store some values directly in the handle not to recompute them all
-	 * the time. */
-	handle->footprint = _starpu_compute_data_footprint(handle);
-
-	handle->home_node = home_node;
-
-	if (_starpu_global_arbiter)
-		/* Just for testing purpose */
-		starpu_data_assign_arbiter(handle, _starpu_global_arbiter);
-	else
-	{
-		//handle->arbiter = NULL;
-	}
-	_starpu_data_requester_prio_list_init0(&handle->arbitered_req_list);
-	handle->last_locality = -1;
-
 	/* that new data is invalid from all nodes perpective except for the
 	 * home node */
 	unsigned node;
@@ -372,9 +318,6 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 		}
 	}
 
-	//handle->per_worker = NULL;
-	//handle->user_data = NULL;
-
 	/* now the data is available ! */
 	_starpu_spin_unlock(&handle->header_lock);
 
@@ -449,14 +392,47 @@ int _starpu_data_handle_init(starpu_data_handle_t handle, struct starpu_data_int
 	STARPU_HG_DISABLE_CHECKING(handle->busy_count);
 
 	handle->magic = 42;
-	handle->ops = interface_ops;
-	handle->mf_node = mf_node;
-	//handle->mpi_data = NULL;
-	//handle->partition_automatic_disabled = 0;
 
+	/* When not specified, the fields are initialized in _starpu_register_new_data and _starpu_data_partition */
+
+	_starpu_data_requester_prio_list_init0(&handle->req_list);
+	//handle->refcnt = 0;
+	//handle->unlocking_reqs = 0;
+	//handle->current_mode = STARPU_NONE;
+	_starpu_spin_init(&handle->header_lock);
+
+	//handle->busy_count = 0;
+	//handle->busy_waiting = 0;
+	STARPU_PTHREAD_MUTEX_INIT0(&handle->busy_mutex, NULL);
+	STARPU_PTHREAD_COND_INIT0(&handle->busy_cond, NULL);
+
+	//handle->root_handle
+	//handle->father_handle
+	//handle->active_children = NULL;
+	//handle->active_readonly_children = NULL;
+	//handle->nactive_readonly_children = 0;
+	//handle->nsiblings
+	//handle->siblings
+	//handle->sibling_index
+	//handle->depth
+
+	/* there is no hierarchy yet */
+	//handle->children = NULL;
+	//handle->nchildren = 0;
+	//handle->nplans = 0;
+	//handle->switch_cl = NULL;
+	//handle->switch_cl_nparts = 0;
+	//handle->partitioned = 0;
+	//handle->part_readonly = 0;
+
+	//handle->active
+	//handle->active_ro = 0;
+
+	//handle->per_node below
+
+	handle->ops = interface_ops;
 	size_t interfacesize = interface_ops->interface_size;
 
-	_starpu_memory_stats_init(handle);
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 		_starpu_memory_stats_init_per_node(handle, node);
@@ -471,6 +447,80 @@ int _starpu_data_handle_init(starpu_data_handle_t handle, struct starpu_data_int
 		if (handle->ops->init) handle->ops->init(replicate->data_interface);
 	}
 
+	//handle->per_worker = NULL;
+	//handle->ops above
+
+	/* Store some values directly in the handle not to recompute them all
+	 * the time. */
+	handle->footprint = _starpu_compute_data_footprint(handle);
+
+	//handle->home_node
+	//handle->wt_mask
+	//handle->aliases = 0;
+	//handle->is_not_important
+	//handle->sequential_consistency
+	//handle->initialized
+	//handle->readonly
+	//handle->ooc
+	//handle->lazy_unregister = 0;
+	//handle->partition_automatic_disabled = 0;
+	//handle->removed_from_context_hash = 0;
+
+	STARPU_PTHREAD_MUTEX_INIT0(&handle->sequential_consistency_mutex, NULL);
+
+	handle->last_submitted_mode = STARPU_R;
+	//handle->last_sync_task = NULL;
+	//handle->last_submitted_accessors.task = NULL;
+	handle->last_submitted_accessors.next = &handle->last_submitted_accessors;
+	handle->last_submitted_accessors.prev = &handle->last_submitted_accessors;
+
+#ifdef STARPU_USE_FXT
+	//handle->last_submitted_ghost_sync_id_is_valid = 0;
+	//handle->last_submitted_ghost_sync_id = 0;
+	//handle->last_submitted_ghost_accessors_id = NULL;
+#endif
+
+	//handle->post_sync_tasks = NULL;
+	/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
+	STARPU_HG_DISABLE_CHECKING(handle->post_sync_tasks_cnt);
+	//handle->post_sync_tasks_cnt = 0;
+
+	//handle->redux_cl
+	//handle->init_cl
+
+	//handle->reduction_refcnt = 0;
+
+	_starpu_data_requester_prio_list_init0(&handle->reduction_req_list);
+
+	//handle->reduction_tmp_handles = NULL;
+
+	//handle->write_invalidation_req = NULL;
+
+        //handle->mpi_data = NULL; /* invalid until set */
+
+	_starpu_memory_stats_init(handle);
+
+	handle->mf_node = mf_node;
+
+        //handle->unregister_hook = NULL;
+
+	if (_starpu_global_arbiter)
+		/* Just for testing purpose */
+		starpu_data_assign_arbiter(handle, _starpu_global_arbiter);
+	else
+	{
+		//handle->arbiter = NULL;
+	}
+	_starpu_data_requester_prio_list_init0(&handle->arbitered_req_list);
+
+	handle->last_locality = -1;
+
+	//handle->dimensions = 0;
+	//handle->coordinates = {};
+
+	//handle->user_data = NULL;
+
+
 	return 0;
 }
 
@@ -743,13 +793,22 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 	/* TODO: also check that it has the latest coherency */
 	STARPU_ASSERT(!(nowait && handle->busy_count != 0));
 
+	_starpu_spin_lock(&handle->header_lock);
+	if (handle->aliases)
+	{
+		handle->aliases--;
+		_starpu_spin_unlock(&handle->header_lock);
+		return;
+	}
+        _starpu_spin_unlock(&handle->header_lock);
+
 	int sequential_consistency = handle->sequential_consistency;
 	if (sequential_consistency && !nowait)
 	{
 		STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_data_unregister must not be called from a task or callback, perhaps you can use starpu_data_unregister_submit instead");
 
 		/* If sequential consistency is enabled, wait until data is available */
-		_starpu_data_wait_until_available(handle, STARPU_RW, "starpu_data_unregister");
+		_starpu_data_wait_until_available(handle, handle->readonly?STARPU_R:STARPU_RW, "starpu_data_unregister");
 	}
 
 	if (coherent && !nowait)
@@ -982,6 +1041,14 @@ void starpu_data_unregister_submit(starpu_data_handle_t handle)
 {
 	STARPU_ASSERT_MSG(handle->magic == 42, "data %p is invalid (was it already registered?)", handle);
 	STARPU_ASSERT_MSG(!handle->lazy_unregister, "data %p can not be unregistered twice", handle);
+	_starpu_spin_lock(&handle->header_lock);
+	if (handle->aliases)
+	{
+		handle->aliases--;
+		_starpu_spin_unlock(&handle->header_lock);
+		return;
+	}
+        _starpu_spin_unlock(&handle->header_lock);
 
 	/* Wait for all task dependencies on this handle before putting it for free */
 	starpu_data_acquire_on_node_cb(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL, handle->initialized?STARPU_RW:STARPU_W, _starpu_data_unregister_submit_cb, handle);

+ 4 - 2
src/datawizard/memalloc.c

@@ -349,7 +349,8 @@ static int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT transfer_subtree_to_node(starpu_d
 			unsigned cnt = 0;
 
 			/* some other node may have the copy */
-			_STARPU_TRACE_DATA_STATE_INVALID(handle, src_node);
+			if (src_replicate->state != STARPU_INVALID)
+				_STARPU_TRACE_DATA_STATE_INVALID(handle, src_node);
 			src_replicate->state = STARPU_INVALID;
 
 			/* count the number of copies */
@@ -365,7 +366,8 @@ static int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT transfer_subtree_to_node(starpu_d
 
 			if (cnt == 1)
 			{
-				_STARPU_TRACE_DATA_STATE_OWNER(handle, last);
+				if (handle->per_node[last].state != STARPU_OWNER)
+					_STARPU_TRACE_DATA_STATE_OWNER(handle, last);
 				handle->per_node[last].state = STARPU_OWNER;
 			}
 

+ 3 - 1
src/debug/traces/starpu_fxt.c

@@ -4441,11 +4441,13 @@ void _starpu_fxt_number_events_file_close(void)
 {
 	if (number_events_file)
 	{
+		int i;
+
 		assert(number_events != NULL);
 
 		fprintf(number_events_file, "# Use starpu_fxt_number_events_to_names.py to convert event keys to event names.\n");
 
-		for (int i = 0; i <= FUT_SETUP_CODE; i++)
+		for (i = 0; i <= FUT_SETUP_CODE; i++)
 		{
 			if (number_events[i] > 0)
 				fprintf(number_events_file, "0x%x\t%lu\n", i, number_events[i]);

+ 37 - 0
src/util/starpu_data_cpy.c

@@ -175,3 +175,40 @@ int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_ha
 {
 	return _starpu_data_cpy(dst_handle, src_handle, asynchronous, callback_func, callback_arg, 0, NULL);
 }
+
+/* TODO: implement copy on write, and introduce starpu_data_dup as well */
+int starpu_data_dup_ro(starpu_data_handle_t *dst_handle, starpu_data_handle_t src_handle,
+			int asynchronous, void (*callback_func)(void*), void *callback_arg)
+{
+	_starpu_spin_lock(&src_handle->header_lock);
+	if (src_handle->readonly_dup) {
+		/* Already a ro duplicate, just return it with one more ref */
+		*dst_handle = src_handle->readonly_dup;
+		_starpu_spin_unlock(&src_handle->header_lock);
+		_starpu_spin_lock(&(*dst_handle)->header_lock);
+		(*dst_handle)->aliases++;
+		_starpu_spin_unlock(&(*dst_handle)->header_lock);
+		if (callback_func)
+			callback_func(callback_arg);
+		return 0;
+	}
+	if (src_handle->readonly) {
+		src_handle->aliases++;
+		_starpu_spin_unlock(&src_handle->header_lock);
+		*dst_handle = src_handle;
+		if (callback_func)
+			callback_func(callback_arg);
+		return 0;
+	}
+	_starpu_spin_unlock(&src_handle->header_lock);
+
+	starpu_data_register_same(dst_handle, src_handle);
+	_starpu_data_cpy(*dst_handle, src_handle, asynchronous, callback_func, callback_arg, 0, NULL);
+	(*dst_handle)->readonly = 1;
+
+	_starpu_spin_lock(&src_handle->header_lock);
+	src_handle->readonly_dup = (*dst_handle);
+	_starpu_spin_unlock(&src_handle->header_lock);
+
+	return 0;
+}

+ 34 - 21
tests/Makefile.am

@@ -344,6 +344,7 @@ myPROGRAMS +=				\
 	errorcheck/workers_cpuid		\
 	fault-tolerance/retry			\
 	helper/starpu_data_cpy			\
+	helper/starpu_data_dup_ro		\
 	helper/starpu_create_sync_task		\
 	microbenchs/async_tasks_overhead	\
 	microbenchs/sync_tasks_overhead		\
@@ -359,11 +360,11 @@ myPROGRAMS +=				\
 	parallel_tasks/parallel_kernels_spmd	\
 	parallel_tasks/spmd_peager		\
 	parallel_tasks/cuda_only		\
-	perfmodels/regression_based		\
-	perfmodels/regression_based_01		\
-	perfmodels/regression_based_02		\
-	perfmodels/regression_based_03		\
-	perfmodels/regression_based_04		\
+	perfmodels/regression_based_memset	\
+	perfmodels/regression_based_check	\
+	perfmodels/regression_based_multiimpl	\
+	perfmodels/regression_based_energy	\
+	perfmodels/regression_based_gpu		\
 	perfmodels/non_linear_regression_based	\
 	perfmodels/feed				\
 	perfmodels/user_base			\
@@ -652,28 +653,32 @@ main_insert_task_where_SOURCES +=		\
 endif
 
 main_subgraph_repeat_SOURCES =		\
-	main/subgraph_repeat.c
+	main/subgraph_repeat.c		\
+	main/increment_codelet.c
 if STARPU_USE_CUDA
 main_subgraph_repeat_SOURCES +=		\
 	main/increment.cu
 endif
 
 main_subgraph_repeat_tag_SOURCES =		\
-	main/subgraph_repeat_tag.c
+	main/subgraph_repeat_tag.c		\
+	main/increment_codelet.c
 if STARPU_USE_CUDA
 main_subgraph_repeat_tag_SOURCES +=		\
 	main/increment.cu
 endif
 
 main_subgraph_repeat_regenerate_SOURCES =		\
-	main/subgraph_repeat_regenerate.c
+	main/subgraph_repeat_regenerate.c		\
+	main/increment_codelet.c
 if STARPU_USE_CUDA
 main_subgraph_repeat_regenerate_SOURCES +=		\
 	main/increment.cu
 endif
 
 main_subgraph_repeat_regenerate_tag_SOURCES =		\
-	main/subgraph_repeat_regenerate_tag.c
+	main/subgraph_repeat_regenerate_tag.c		\
+	main/increment_codelet.c
 if STARPU_USE_CUDA
 main_subgraph_repeat_regenerate_tag_SOURCES +=		\
 	main/increment.cu
@@ -799,6 +804,14 @@ fortran90_init_01_SOURCES =	\
 	fortran90/init_01.f90
 endif
 
+helper_starpu_data_dup_ro_SOURCES =		\
+	helper/starpu_data_dup_ro.c		\
+	main/increment_codelet.c
+if STARPU_USE_CUDA
+helper_starpu_data_dup_ro_SOURCES +=		\
+	main/increment.cu
+endif
+
 ###################
 # Block interface #
 ###################
@@ -1006,20 +1019,20 @@ overlap_gpu_concurrency_SOURCES+=\
 	overlap/long_kernel.cu
 endif
 
-perfmodels_regression_based_SOURCES=\
-	perfmodels/regression_based.c
+perfmodels_regression_based_memset_SOURCES=\
+	perfmodels/regression_based_memset.c
 
-perfmodels_regression_based_01_SOURCES=\
-	perfmodels/regression_based_01.c
+perfmodels_regression_based_check_SOURCES=\
+	perfmodels/regression_based_check.c
 
-perfmodels_regression_based_02_SOURCES=\
-	perfmodels/regression_based_02.c
+perfmodels_regression_based_multiimpl_SOURCES=\
+	perfmodels/regression_based_multiimpl.c
 
-perfmodels_regression_based_03_SOURCES=\
-	perfmodels/regression_based_03.c
+perfmodels_regression_based_energy_SOURCES=\
+	perfmodels/regression_based_energy.c
 
-perfmodels_regression_based_04_SOURCES=\
-	perfmodels/regression_based_04.c
+perfmodels_regression_based_gpu_SOURCES=\
+	perfmodels/regression_based_gpu.c
 
 perfmodels_max_fpga_SOURCES=\
 	perfmodels/max_fpga.c
@@ -1027,10 +1040,10 @@ perfmodels_max_fpga_LDADD = $(LDADD) \
 	$(srcdir)/perfmodels/slic_StreamFMA.o
 
 if STARPU_USE_OPENCL
-perfmodels_regression_based_SOURCES+=\
+perfmodels_regression_based_memset_SOURCES+=\
 	perfmodels/opencl_memset.c
 
-perfmodels_regression_based_04_SOURCES+=\
+perfmodels_regression_based_gpu_SOURCES+=\
 	perfmodels/opencl_memset.c
 
 nobase_STARPU_OPENCL_DATA_DATA += \

+ 107 - 0
tests/helper/starpu_data_dup_ro.c

@@ -0,0 +1,107 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../main/increment_codelet.h"
+#include "../helper.h"
+
+/*
+ * Test starpu_data_dup_ro
+ */
+
+/*
+ * Acquire handle for reading and compare the unsigned value it holds with
+ * expected, printing a diagnostic that mentions name on mismatch.
+ * Returns 1 when the value matches and 0 otherwise.
+ */
+static int check_value(starpu_data_handle_t handle, const char *name, unsigned expected)
+{
+	unsigned *var;
+	int ok = 1;
+
+	starpu_data_acquire(handle, STARPU_R);
+	var = starpu_data_get_local_ptr(handle);
+	if (*var != expected)
+	{
+		/* The value is unsigned, hence %u rather than %d */
+		FPRINTF(stderr, "%s is %u but it should be %u\n", name, *var, expected);
+		ok = 0;
+	}
+	starpu_data_release(handle);
+
+	return ok;
+}
+
+/*
+ * Register a variable holding 42, take read-only duplicates of it before and
+ * after an increment task, and check that the earlier duplicates still read 42
+ * while the later one reads 43.
+ * Returns STARPU_TEST_SKIPPED when StarPU reports -ENODEV at initialization,
+ * EXIT_FAILURE when any duplicate holds an unexpected value, and EXIT_SUCCESS
+ * otherwise.
+ */
+int main(int argc, char **argv)
+{
+	int ret;
+	/* Kept separate from ret so that one failed check is never overwritten by a later one */
+	int status = EXIT_SUCCESS;
+	unsigned var1;
+	starpu_data_handle_t var1_handle, var2_handle, var3_handle, var4_handle, var5_handle;
+
+	ret = starpu_initialize(NULL, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	var1 = 42;
+
+	starpu_variable_data_register(&var1_handle, STARPU_MAIN_RAM, (uintptr_t)&var1, sizeof(var1));
+
+	/* Make a duplicate of the original data */
+	ret = starpu_data_dup_ro(&var2_handle, var1_handle, 1, NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_dup_ro");
+
+	/* Make a second duplicate of the original data: the same handle is expected back */
+	ret = starpu_data_dup_ro(&var3_handle, var1_handle, 1, NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_dup_ro");
+	STARPU_ASSERT(var3_handle == var2_handle);
+
+	/* Make a duplicate of a duplicate: again the same handle is expected back */
+	ret = starpu_data_dup_ro(&var4_handle, var2_handle, 1, NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_dup_ro");
+	STARPU_ASSERT(var4_handle == var2_handle);
+
+	/* Modify the original data; a failed submission is reported here instead of
+	 * surfacing later as a misleading value mismatch on var5 */
+	ret = starpu_task_insert(&increment_codelet, STARPU_RW, var1_handle, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+	/* Make a duplicate of the new value */
+	ret = starpu_data_dup_ro(&var5_handle, var1_handle, 1, NULL, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_dup_ro");
+
+	/* Duplicates taken before the increment must still hold the old value */
+	if (!check_value(var2_handle, "var2", 42))
+		status = EXIT_FAILURE;
+	if (!check_value(var3_handle, "var3", 42))
+		status = EXIT_FAILURE;
+	if (!check_value(var4_handle, "var4", 42))
+		status = EXIT_FAILURE;
+
+	/* The duplicate taken after the increment must hold the new value */
+	if (!check_value(var5_handle, "var5", 43))
+		status = EXIT_FAILURE;
+
+	starpu_data_unregister(var1_handle);
+	starpu_data_unregister(var2_handle);
+	starpu_data_unregister(var3_handle);
+	starpu_data_unregister(var4_handle);
+	starpu_data_unregister(var5_handle);
+	starpu_shutdown();
+
+	STARPU_RETURN(status);
+}

+ 40 - 0
tests/main/increment_codelet.c

@@ -0,0 +1,40 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "increment_codelet.h"
+
+/*
+ * CPU implementation of the increment codelet: adds one, in place, to the
+ * unsigned variable exposed by the first buffer descriptor. The codelet
+ * argument is not used.
+ */
+void cpu_increment(void *descr[], void *arg)
+{
+	unsigned *counter = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	(void)arg;
+	++*counter;
+}
+
+struct starpu_codelet increment_codelet = /* codelet shared by the tests that increment an unsigned variable in place */
+{
+	.cpu_funcs = {cpu_increment}, /* CPU implementation */
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {cuda_host_increment}, /* CUDA implementation, declared in increment_codelet.h */
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+	// TODO: no OpenCL implementation is provided yet
+	//.opencl_funcs = {dummy_func},
+	.cpu_funcs_name = {"cpu_increment"}, /* symbol name of the CPU implementation */
+	.model = NULL, /* no performance model attached */
+	.modes = { STARPU_RW }, /* the single buffer is both read and written */
+	.nbuffers = 1
+};
+

+ 21 - 0
tests/main/increment_codelet.h

@@ -0,0 +1,21 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef TESTS_MAIN_INCREMENT_CODELET_H
+#define TESTS_MAIN_INCREMENT_CODELET_H
+
+#include <starpu.h>
+
+/* CUDA implementation; only referenced by increment_codelet when STARPU_USE_CUDA is defined */
+extern void cuda_host_increment(void *descr[], void *_args);
+/* CPU implementation: increments the unsigned variable held by descr[0] */
+extern void cpu_increment(void *descr[], void *arg);
+/* Codelet wrapping the implementations above, defined in increment_codelet.c */
+extern struct starpu_codelet increment_codelet;
+
+#endif /* TESTS_MAIN_INCREMENT_CODELET_H */

+ 5 - 28
tests/main/subgraph_repeat.c

@@ -17,6 +17,7 @@
 #include <starpu.h>
 #include <common/thread.h>
 
+#include "increment_codelet.h"
 #include "../helper.h"
 
 /*
@@ -51,30 +52,6 @@ static unsigned *check_cnt;
 static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
 static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 
-extern void cuda_host_increment(void *descr[], void *_args);
-
-void cpu_increment(void *descr[], void *arg)
-{
-	(void)arg;
-	unsigned *var = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	(*var)++;
-}
-
-static struct starpu_codelet dummy_codelet =
-{
-	.cpu_funcs = {cpu_increment},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {cuda_host_increment},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
-#endif
-	// TODO
-	//.opencl_funcs = {dummy_func},
-	.cpu_funcs_name = {"cpu_increment"},
-	.model = NULL,
-	.modes = { STARPU_RW },
-	.nbuffers = 1
-};
-
 static void callback_task_D(void *arg)
 {
 	(void)arg;
@@ -120,19 +97,19 @@ int main(int argc, char **argv)
 	starpu_variable_data_register(&check_data, STARPU_MAIN_RAM, (uintptr_t)check_cnt, sizeof(*check_cnt));
 
 	starpu_task_init(&taskA);
-	taskA.cl = &dummy_codelet;
+	taskA.cl = &increment_codelet;
 	taskA.handles[0] = check_data;
 
 	starpu_task_init(&taskB);
-	taskB.cl = &dummy_codelet;
+	taskB.cl = &increment_codelet;
 	taskB.handles[0] = check_data;
 
 	starpu_task_init(&taskC);
-	taskC.cl = &dummy_codelet;
+	taskC.cl = &increment_codelet;
 	taskC.handles[0] = check_data;
 
 	starpu_task_init(&taskD);
-	taskD.cl = &dummy_codelet;
+	taskD.cl = &increment_codelet;
 	taskD.callback_func = callback_task_D;
 	taskD.handles[0] = check_data;
 

+ 5 - 26
tests/main/subgraph_repeat_regenerate.c

@@ -17,6 +17,7 @@
 #include <starpu.h>
 #include <common/thread.h>
 
+#include "increment_codelet.h"
 #include "../helper.h"
 
 /*
@@ -55,28 +56,6 @@ static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 
 extern void cuda_host_increment(void *descr[], void *_args);
 
-void cpu_increment(void *descr[], void *arg)
-{
-	(void)arg;
-	unsigned *var = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	(*var)++;
-}
-
-static struct starpu_codelet dummy_codelet =
-{
-	.cpu_funcs = {cpu_increment},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {cuda_host_increment},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
-#endif
-	// TODO
-	//.opencl_funcs = {dummy_func},
-	.cpu_funcs_name = {"cpu_increment"},
-	.model = NULL,
-	.modes = { STARPU_RW },
-	.nbuffers = 1
-};
-
 static void callback_task_B(void *arg)
 {
 	(void)arg;
@@ -136,24 +115,24 @@ int main(int argc, char **argv)
 	starpu_variable_data_register(&check_data, STARPU_MAIN_RAM, (uintptr_t)check_cnt, sizeof(*check_cnt));
 
 	starpu_task_init(&taskA);
-	taskA.cl = &dummy_codelet;
+	taskA.cl = &increment_codelet;
 	taskA.regenerate = 0; /* this task will be explicitely resubmitted if needed */
 	taskA.handles[0] = check_data;
 
 	starpu_task_init(&taskB);
-	taskB.cl = &dummy_codelet;
+	taskB.cl = &increment_codelet;
 	taskB.callback_func = callback_task_B;
 	taskB.regenerate = 1;
 	taskB.handles[0] = check_data;
 
 	starpu_task_init(&taskC);
-	taskC.cl = &dummy_codelet;
+	taskC.cl = &increment_codelet;
 	taskC.callback_func = callback_task_C;
 	taskC.regenerate = 1;
 	taskC.handles[0] = check_data;
 
 	starpu_task_init(&taskD);
-	taskD.cl = &dummy_codelet;
+	taskD.cl = &increment_codelet;
 	taskD.callback_func = callback_task_D;
 	taskD.regenerate = 1;
 	taskD.handles[0] = check_data;

+ 5 - 28
tests/main/subgraph_repeat_regenerate_tag.c

@@ -17,6 +17,7 @@
 #include <starpu.h>
 #include <common/thread.h>
 
+#include "increment_codelet.h"
 #include "../helper.h"
 
 /*
@@ -61,30 +62,6 @@ static unsigned *check_cnt;
 static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
 static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 
-extern void cuda_host_increment(void *descr[], void *_args);
-
-void cpu_increment(void *descr[], void *arg)
-{
-	(void)arg;
-	unsigned *var = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	(*var)++;
-}
-
-static struct starpu_codelet dummy_codelet =
-{
-	.cpu_funcs = {cpu_increment},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {cuda_host_increment},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
-#endif
-	// TODO
-	//.opencl_funcs = {dummy_func},
-	.cpu_funcs_name = {"cpu_increment"},
-	.model = NULL,
-	.modes = { STARPU_RW },
-	.nbuffers = 1
-};
-
 static void callback_task_A(void *arg)
 {
 	(void)arg;
@@ -169,7 +146,7 @@ int main(int argc, char **argv)
 	starpu_variable_data_register(&check_data, STARPU_MAIN_RAM, (uintptr_t)check_cnt, sizeof(*check_cnt));
 
 	starpu_task_init(&taskA);
-	taskA.cl = &dummy_codelet;
+	taskA.cl = &increment_codelet;
 	taskA.regenerate = 1; /* this task will be explicitely resubmitted if needed */
 	taskA.use_tag = 1;
 	taskA.tag_id = TAG_A;
@@ -177,7 +154,7 @@ int main(int argc, char **argv)
 	taskA.handles[0] = check_data;
 
 	starpu_task_init(&taskB);
-	taskB.cl = &dummy_codelet;
+	taskB.cl = &increment_codelet;
 	taskB.regenerate = 1;
 	taskB.use_tag = 1;
 	taskB.tag_id = TAG_B;
@@ -185,7 +162,7 @@ int main(int argc, char **argv)
 	taskB.handles[0] = check_data;
 
 	starpu_task_init(&taskC);
-	taskC.cl = &dummy_codelet;
+	taskC.cl = &increment_codelet;
 	taskC.regenerate = 1;
 	taskC.use_tag = 1;
 	taskC.tag_id = TAG_C;
@@ -193,7 +170,7 @@ int main(int argc, char **argv)
 	taskC.handles[0] = check_data;
 
 	starpu_task_init(&taskD);
-	taskD.cl = &dummy_codelet;
+	taskD.cl = &increment_codelet;
 	taskD.callback_func = callback_task_D;
 	taskD.regenerate = 1;
 	taskD.use_tag = 1;

+ 5 - 28
tests/main/subgraph_repeat_tag.c

@@ -16,6 +16,7 @@
 
 #include <starpu.h>
 
+#include "increment_codelet.h"
 #include "../helper.h"
 
 /*
@@ -52,30 +53,6 @@ static unsigned *check_cnt;
 static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
 static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 
-extern void cuda_host_increment(void *descr[], void *_args);
-
-void cpu_increment(void *descr[], void *arg)
-{
-	(void)arg;
-	unsigned *var = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	(*var)++;
-}
-
-static struct starpu_codelet dummy_codelet =
-{
-	.cpu_funcs = {cpu_increment},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {cuda_host_increment},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
-#endif
-	// TODO
-	//.opencl_funcs = {dummy_func},
-	.cpu_funcs_name = {"cpu_increment"},
-	.model = NULL,
-	.modes = { STARPU_RW },
-	.nbuffers = 1
-};
-
 static void callback_task_B(void *arg)
 {
 	(void)arg;
@@ -145,24 +122,24 @@ int main(int argc, char **argv)
 	starpu_variable_data_register(&check_data, STARPU_MAIN_RAM, (uintptr_t)check_cnt, sizeof(*check_cnt));
 
 	starpu_task_init(&taskA);
-	taskA.cl = &dummy_codelet;
+	taskA.cl = &increment_codelet;
 	taskA.regenerate = 0; /* this task will be explicitely resubmitted if needed */
 	taskA.handles[0] = check_data;
 
 	starpu_task_init(&taskB);
-	taskB.cl = &dummy_codelet;
+	taskB.cl = &increment_codelet;
 	taskB.regenerate = 1;
 	taskB.callback_func = callback_task_B;
 	taskB.handles[0] = check_data;
 
 	starpu_task_init(&taskC);
-	taskC.cl = &dummy_codelet;
+	taskC.cl = &increment_codelet;
 	taskC.regenerate = 1;
 	taskC.callback_func = callback_task_C;
 	taskC.handles[0] = check_data;
 
 	starpu_task_init(&taskD);
-	taskD.cl = &dummy_codelet;
+	taskD.cl = &increment_codelet;
 	taskD.callback_func = callback_task_D;
 	taskD.regenerate = 1;
 	taskD.handles[0] = check_data;

tests/perfmodels/regression_based_01.c → tests/perfmodels/regression_based_check.c


tests/perfmodels/regression_based_03.c → tests/perfmodels/regression_based_energy.c


tests/perfmodels/regression_based_04.c → tests/perfmodels/regression_based_gpu.c


+ 1 - 1
tests/perfmodels/regression_based.c

@@ -20,7 +20,7 @@
 #include "../helper.h"
 
 /*
- * Benchmark memset with a linear regression
+ * Benchmark memset with a linear and non-linear regression
  */
 
 #define STARTlin 1024

tests/perfmodels/regression_based_02.c → tests/perfmodels/regression_based_multiimpl.c