Browse Source

- merge trunk

Olivier Aumage 11 years ago
parent
commit
2c125dafe1

+ 4 - 0
configure.ac

@@ -853,6 +853,10 @@ if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
 	  enable_opencl=$have_valid_opencl
 	  enable_opencl=$have_valid_opencl
           ;;
           ;;
    esac
    esac
+   save_LIBS="$LIBS"
+   LIBS="$LIBS $STARPU_OPENCL_LDFLAGS"
+   AC_CHECK_FUNCS([clEnqueueMarkerWithWaitList])
+   LIBS="$save_LIBS"
 fi
 fi
 
 
 AC_MSG_CHECKING(whether OpenCL should be used)
 AC_MSG_CHECKING(whether OpenCL should be used)

+ 18 - 10
doc/doxygen/chapters/05check_list_performance.doxy

@@ -52,29 +52,37 @@ func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
 cudaStreamSynchronize(starpu_cuda_get_local_stream());
 cudaStreamSynchronize(starpu_cuda_get_local_stream());
 \endcode
 \endcode
 
 
+Unfortunately, some CUDA libraries do not have stream variants of
+kernels. That will lower the potential for overlapping.
+
 StarPU already does appropriate calls for the CUBLAS library.
 StarPU already does appropriate calls for the CUBLAS library.
 
 
 If the kernel can be made to only use this local stream or other self-allocated
 If the kernel can be made to only use this local stream or other self-allocated
 streams, i.e. the whole kernel submission can be made asynchronous, then
 streams, i.e. the whole kernel submission can be made asynchronous, then
-one should enable asynchronous execution of the kernel. This means setting
-the corresponding cuda_flags[] flag in the codelet and dropping the
-cudaStreamSynchronize() call at the end of the kernel. That way, StarPU will be
-able to pipeline submitting tasks to GPUs, instead of synchronizing at each
+one should enable asynchronous execution of the kernel.  That means setting
+the STARPU_CUDA_FLAG flag in cuda_flags[] in the codelet, and dropping the
+cudaStreamSynchronize() call at the end of the cuda_func function, so that it
+returns immediately after having queued the kernel to the local stream. That way, StarPU will be
+able to submit and complete data transfers while kernels are executing, instead of only at each
 kernel submission. The kernel just has to make sure that StarPU can use the
 kernel submission. The kernel just has to make sure that StarPU can use the
 local stream to synchronize with the kernel startup and completion.
 local stream to synchronize with the kernel startup and completion.
 
 
-Unfortunately, some CUDA libraries do not have stream variants of
-kernels. That will lower the potential for overlapping.
+Using the STARPU_CUDA_FLAG flag also permits enabling concurrent kernel
+execution, on cards which support it (Kepler and later, notably). This is
+enabled by setting the STARPU_NWORKER_PER_CUDA environment variable to the
+number of kernels to execute concurrently.  This is useful when kernels are
+small and do not feed the whole GPU with threads to run.
 
 
 \section OpenCL-specificOptimizations OpenCL-specific Optimizations
 \section OpenCL-specificOptimizations OpenCL-specific Optimizations
 
 
 If the kernel can be made to only use the StarPU-provided command queue or other self-allocated
 If the kernel can be made to only use the StarPU-provided command queue or other self-allocated
-streams, i.e. the whole kernel submission can be made asynchronous, then
+queues, i.e. the whole kernel submission can be made asynchronous, then
 one should enable asynchronous execution of the kernel. This means setting
 one should enable asynchronous execution of the kernel. This means setting
 the corresponding opencl_flags[] flag in the codelet and dropping the
 the corresponding opencl_flags[] flag in the codelet and dropping the
-clFinish() and starpu_opencl_collect_stats() calls at the end of the kernel.
-That way, StarPU will be able to pipeline submitting tasks to GPUs, instead of
-synchronizing at each kernel submission. The kernel just has to make sure
+clFinish() and starpu_opencl_collect_stats() calls at the end of the kernel, so
+that it returns immediately after having queued the kernel to the provided queue.
+That way, StarPU will be able to submit and complete data transfers while kernels are executing, instead of
+only at each kernel submission. The kernel just has to make sure
 that StarPU can use the command queue it has provided to synchronize with the
 that StarPU can use the command queue it has provided to synchronize with the
 kernel startup and completion.
 kernel startup and completion.
 
 

+ 2 - 2
doc/doxygen/chapters/10scheduling_context_hypervisor.doxy

@@ -1,7 +1,7 @@
 /*
 /*
  * This file is part of the StarPU Handbook.
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  * See the file version.doxy for copying conditions.
  */
  */
@@ -192,7 +192,7 @@ The <b>Throughput </b> strategy focuses on maximizing the throughput of the reso
 and resizes the contexts such that the machine is running at its maximum efficiency
 and resizes the contexts such that the machine is running at its maximum efficiency
 (maximum instant speed of the workers).
 (maximum instant speed of the workers).
 
 
-\section  Defining a new hypervisor policy
+\section DefiningANewHypervisorPolicy Defining A New Hypervisor Policy
 
 
 While Scheduling Context Hypervisor Plugin comes with a variety of
 While Scheduling Context Hypervisor Plugin comes with a variety of
 resizing policies (see \ref ResizingStrategies), it may sometimes be
 resizing policies (see \ref ResizingStrategies), it may sometimes be

+ 1 - 1
doc/doxygen/chapters/21simgrid.doxy

@@ -22,7 +22,7 @@ get the simulated time, it has to use starpu_timing_now() which returns the
 virtual timestamp in us.
 virtual timestamp in us.
 
 
 For some technical reason, the application's .c file which contains main() has
 For some technical reason, the application's .c file which contains main() has
-to be recompiled with starpu.h, which in the simgrid case will #define main()
+to be recompiled with starpu.h, which in the simgrid case will # define main()
 into starpu_main(), and it is libstarpu which will provide the real main() and
 into starpu_main(), and it is libstarpu which will provide the real main() and
 call the application's main().
 call the application's main().
 
 

+ 8 - 0
doc/doxygen/chapters/40environment_variables.doxy

@@ -43,6 +43,14 @@ environment variable \ref STARPU_WORKERS_CUDAID. By default, StarPU will
 create as many CUDA workers as there are CUDA devices.
 create as many CUDA workers as there are CUDA devices.
 </dd>
 </dd>
 
 
+<dt>STARPU_NWORKER_PER_CUDA</dt>
+<dd>
+\anchor STARPU_NWORKER_PER_CUDA
+\addindex __env__STARPU_NWORKER_PER_CUDA
+Specify the number of workers per CUDA device, and thus the number of kernels
+which will run concurrently on each device. The default value is 1.
+</dd>
+
 <dt>STARPU_NOPENCL</dt>
 <dt>STARPU_NOPENCL</dt>
 <dd>
 <dd>
 \anchor STARPU_NOPENCL
 \anchor STARPU_NOPENCL

+ 3 - 3
doc/doxygen/chapters/api/threads.doxy

@@ -152,7 +152,7 @@ This function initializes the thread attributes object pointed to by
 \p attr with default attribute values.
 \p attr with default attribute values.
 
 
 It does not do anything when the simulated performance mode is enabled
 It does not do anything when the simulated performance mode is enabled
-(\ref SimulatedPerformance).
+(\ref SimGridSupport).
 
 
 \fn int starpu_pthread_attr_destroy(starpu_pthread_attr_t *attr)
 \fn int starpu_pthread_attr_destroy(starpu_pthread_attr_t *attr)
 \ingroup API_Threads
 \ingroup API_Threads
@@ -161,7 +161,7 @@ required. Destroying a thread attributes object has no effect on
 threads that were created using that object.
 threads that were created using that object.
 
 
 It does not do anything when the simulated performance mode is enabled
 It does not do anything when the simulated performance mode is enabled
-(\ref SimulatedPerformance).
+(\ref SimGridSupport).
 
 
 \fn int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachstate)
 \fn int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachstate)
 \ingroup API_Threads
 \ingroup API_Threads
@@ -172,7 +172,7 @@ created using the thread attributes object \p attr will be created in
 a joinable or a detached state.
 a joinable or a detached state.
 
 
 It does not do anything when the simulated performance mode is enabled
 It does not do anything when the simulated performance mode is enabled
-(\ref SimulatedPerformance).
+(\ref SimGridSupport).
 
 
 \fn int starpu_pthread_mutex_init(starpu_pthread_mutex_t *mutex, const starpu_pthread_mutexattr_t *mutexattr)
 \fn int starpu_pthread_mutex_init(starpu_pthread_mutex_t *mutex, const starpu_pthread_mutexattr_t *mutexattr)
 \ingroup API_Threads
 \ingroup API_Threads

+ 3 - 3
doc/doxygen/doxygen.cfg

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
 # Copyright (C) 2009-2013  Université de Bordeaux 1
 # Copyright (C) 2009-2013  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011, 2012  Institut National de Recherche en Informatique et Automatique
 # Copyright (C) 2011, 2012  Institut National de Recherche en Informatique et Automatique
 #
 #
@@ -645,13 +645,13 @@ QUIET                  = YES
 # generated by doxygen. Possible values are YES and NO. If left blank
 # generated by doxygen. Possible values are YES and NO. If left blank
 # NO is used.
 # NO is used.
 
 
-WARNINGS               = YES
+WARNINGS               = NO
 
 
 # If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
 # If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
 # for undocumented members. If EXTRACT_ALL is set to YES then this flag will
 # for undocumented members. If EXTRACT_ALL is set to YES then this flag will
 # automatically be disabled.
 # automatically be disabled.
 
 
-WARN_IF_UNDOCUMENTED   = YES
+WARN_IF_UNDOCUMENTED   = NO
 
 
 # If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
 # If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
 # potential errors in the documentation, such as not documenting some
 # potential errors in the documentation, such as not documenting some

+ 2 - 1
include/starpu_perfmodel.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -107,6 +107,7 @@ struct starpu_perfmodel_per_arch
 
 
 enum starpu_perfmodel_type
 enum starpu_perfmodel_type
 {
 {
+        STARPU_PERFMODEL_INVALID=0,
 	STARPU_PER_ARCH,
 	STARPU_PER_ARCH,
 	STARPU_COMMON,
 	STARPU_COMMON,
 	STARPU_HISTORY_BASED,
 	STARPU_HISTORY_BASED,

+ 3 - 3
src/datawizard/data_request.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2009-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -482,7 +482,7 @@ int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, uns
 			break;
 			break;
 		}
 		}
 
 
-		*pushed++;
+		(*pushed)++;
 	}
 	}
 
 
 	while (!_starpu_data_request_list_empty(local_list))
 	while (!_starpu_data_request_list_empty(local_list))
@@ -570,7 +570,7 @@ void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc
 			break;
 			break;
 		}
 		}
 
 
-		*pushed++;
+		(*pushed)++;
 	}
 	}
 
 
 	while(!_starpu_data_request_list_empty(local_list))
 	while(!_starpu_data_request_list_empty(local_list))

+ 11 - 11
src/datawizard/memalloc.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2009-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -259,7 +259,7 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 	if (mc->automatically_allocated &&
 	if (mc->automatically_allocated &&
 		(!handle || replicate->refcnt == 0))
 		(!handle || replicate->refcnt == 0))
 	{
 	{
-		void *interface;
+		void *data_interface;
 
 
 		if (handle)
 		if (handle)
 			STARPU_ASSERT(replicate->allocated);
 			STARPU_ASSERT(replicate->allocated);
@@ -276,13 +276,13 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 #endif
 #endif
 
 
 		if (handle)
 		if (handle)
-			interface = replicate->data_interface;
+			data_interface = replicate->data_interface;
 		else
 		else
-			interface = mc->chunk_interface;
-		STARPU_ASSERT(interface);
+			data_interface = mc->chunk_interface;
+		STARPU_ASSERT(data_interface);
 
 
 		_STARPU_TRACE_START_FREE(node, mc->size);
 		_STARPU_TRACE_START_FREE(node, mc->size);
-		mc->ops->free_data_on_node(interface, node);
+		mc->ops->free_data_on_node(data_interface, node);
 		_STARPU_TRACE_END_FREE(node);
 		_STARPU_TRACE_END_FREE(node);
 
 
 		if (handle)
 		if (handle)
@@ -417,7 +417,7 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
  * therefore not in the cache. */
  * therefore not in the cache. */
 static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_replicate, struct _starpu_mem_chunk *mc, unsigned is_already_in_mc_list)
 static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_replicate, struct _starpu_mem_chunk *mc, unsigned is_already_in_mc_list)
 {
 {
-	void *interface;
+	void *data_interface;
 
 
 	/* we found an appropriate mem chunk: so we get it out
 	/* we found an appropriate mem chunk: so we get it out
 	 * of the "to free" list, and reassign it to the new
 	 * of the "to free" list, and reassign it to the new
@@ -429,18 +429,18 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 		old_replicate->allocated = 0;
 		old_replicate->allocated = 0;
 		old_replicate->automatically_allocated = 0;
 		old_replicate->automatically_allocated = 0;
 		old_replicate->initialized = 0;
 		old_replicate->initialized = 0;
-		interface = old_replicate->data_interface;
+		data_interface = old_replicate->data_interface;
 	}
 	}
 	else
 	else
-		interface = mc->chunk_interface;
+		data_interface = mc->chunk_interface;
 
 
 	new_replicate->allocated = 1;
 	new_replicate->allocated = 1;
 	new_replicate->automatically_allocated = 1;
 	new_replicate->automatically_allocated = 1;
 	new_replicate->initialized = 0;
 	new_replicate->initialized = 0;
 
 
 	STARPU_ASSERT(new_replicate->data_interface);
 	STARPU_ASSERT(new_replicate->data_interface);
-	STARPU_ASSERT(interface);
-	memcpy(new_replicate->data_interface, interface, mc->size_interface);
+	STARPU_ASSERT(data_interface);
+	memcpy(new_replicate->data_interface, data_interface, mc->size_interface);
 
 
 	if (!old_replicate)
 	if (!old_replicate)
 	{
 	{

+ 4 - 0
src/drivers/opencl/driver_opencl.c

@@ -685,7 +685,11 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		int err;
 		int err;
 		cl_command_queue queue;
 		cl_command_queue queue;
 		starpu_opencl_get_queue(args->devid, &queue);
 		starpu_opencl_get_queue(args->devid, &queue);
+#ifdef HAVE_CLENQUEUEMARKERWITHWAITLIST
+		err = clEnqueueMarkerWithWaitList(queue, 0, NULL, &task_events[args->devid]);
+#else
 		err = clEnqueueMarker(queue, &task_events[args->devid]);
 		err = clEnqueueMarker(queue, &task_events[args->devid]);
+#endif
 		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
 		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 	}
 	else
 	else

+ 1 - 1
tests/perfmodels/user_base.c

@@ -29,7 +29,7 @@ size_t get_size_base(struct starpu_task *task, unsigned nimpl)
 	return 3;
 	return 3;
 };
 };
 
 
-size_t get_footprint(struct starpu_task *task)
+uint32_t get_footprint(struct starpu_task *task)
 {
 {
 	uint32_t orig = starpu_task_data_footprint(task);
 	uint32_t orig = starpu_task_data_footprint(task);
 	return starpu_hash_crc32c_be(42, orig);
 	return starpu_hash_crc32c_be(42, orig);