Browse Source

- merge trunk

Olivier Aumage 11 years ago
parent
commit
2c125dafe1

+ 4 - 0
configure.ac

@@ -853,6 +853,10 @@ if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
 	  enable_opencl=$have_valid_opencl
 	  enable_opencl=$have_valid_opencl
           ;;
           ;;
    esac
    esac
+   save_LIBS="$LIBS"
+   LIBS="$LIBS $STARPU_OPENCL_LDFLAGS"
+   AC_CHECK_FUNCS([clEnqueueMarkerWithWaitList])
+   LIBS="$save_LIBS"
 fi
 fi
 
 
 AC_MSG_CHECKING(whether OpenCL should be used)
 AC_MSG_CHECKING(whether OpenCL should be used)

+ 18 - 10
doc/doxygen/chapters/05check_list_performance.doxy

@@ -52,29 +52,37 @@ func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
 cudaStreamSynchronize(starpu_cuda_get_local_stream());
 cudaStreamSynchronize(starpu_cuda_get_local_stream());
 \endcode
 \endcode
 
 
+Unfortunately, some CUDA libraries do not have stream variants of
+kernels. That will lower the potential for overlapping.
+
 StarPU already does appropriate calls for the CUBLAS library.
 StarPU already does appropriate calls for the CUBLAS library.
 
 
 If the kernel can be made to only use this local stream or other self-allocated
 If the kernel can be made to only use this local stream or other self-allocated
 streams, i.e. the whole kernel submission can be made asynchronous, then
 streams, i.e. the whole kernel submission can be made asynchronous, then
-one should enable asynchronous execution of the kernel. This means setting
-the corresponding cuda_flags[] flag in the codelet and dropping the
-cudaStreamSynchronize() call at the end of the kernel. That way, StarPU will be
-able to pipeline submitting tasks to GPUs, instead of synchronizing at each
+one should enable asynchronous execution of the kernel.  That means setting
+the STARPU_CUDA_FLAG flag in cuda_flags[] in the codelet, and dropping the
+cudaStreamSynchronize() call at the end of the cuda_func function, so that it
+returns immediately after having queued the kernel to the local stream. That way, StarPU will be
+able to submit and complete data transfers while kernels are executing, instead of only at each
 kernel submission. The kernel just has to make sure that StarPU can use the
 kernel submission. The kernel just has to make sure that StarPU can use the
 local stream to synchronize with the kernel startup and completion.
 local stream to synchronize with the kernel startup and completion.
 
 
-Unfortunately, some CUDA libraries do not have stream variants of
-kernels. That will lower the potential for overlapping.
+Using the STARPU_CUDA_FLAG flag also permits enabling concurrent kernel
+execution, on cards which support it (Kepler and later, notably). This is
+enabled by setting the STARPU_NWORKER_PER_CUDA environment variable to the
+number of kernels to execute concurrently.  This is useful when kernels are
+small and do not feed the whole GPU with threads to run.
 
 
 \section OpenCL-specificOptimizations OpenCL-specific Optimizations
 \section OpenCL-specificOptimizations OpenCL-specific Optimizations
 
 
 If the kernel can be made to only use the StarPU-provided command queue or other self-allocated
 If the kernel can be made to only use the StarPU-provided command queue or other self-allocated
-streams, i.e. the whole kernel submission can be made asynchronous, then
+queues, i.e. the whole kernel submission can be made asynchronous, then
 one should enable asynchronous execution of the kernel. This means setting
 one should enable asynchronous execution of the kernel. This means setting
 the corresponding opencl_flags[] flag in the codelet and dropping the
 the corresponding opencl_flags[] flag in the codelet and dropping the
-clFinish() and starpu_opencl_collect_stats() calls at the end of the kernel.
-That way, StarPU will be able to pipeline submitting tasks to GPUs, instead of
-synchronizing at each kernel submission. The kernel just has to make sure
+clFinish() and starpu_opencl_collect_stats() calls at the end of the kernel, so
+that it returns immediately after having queued the kernel to the provided queue.
+That way, StarPU will be able to submit and complete data transfers while kernels are executing, instead of
+only at each kernel submission. The kernel just has to make sure
 that StarPU can use the command queue it has provided to synchronize with the
 that StarPU can use the command queue it has provided to synchronize with the
 kernel startup and completion.
 kernel startup and completion.
 
 

+ 2 - 2
doc/doxygen/chapters/10scheduling_context_hypervisor.doxy

@@ -1,7 +1,7 @@
 /*
 /*
  * This file is part of the StarPU Handbook.
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  * See the file version.doxy for copying conditions.
  */
  */
@@ -192,7 +192,7 @@ The <b>Throughput </b> strategy focuses on maximizing the throughput of the reso
 and resizes the contexts such that the machine is running at its maximum efficiency
 and resizes the contexts such that the machine is running at its maximum efficiency
 (maximum instant speed of the workers).
 (maximum instant speed of the workers).
 
 
-\section  Defining a new hypervisor policy
+\section DefiningANewHypervisorPolicy Defining A New Hypervisor Policy
 
 
 While Scheduling Context Hypervisor Plugin comes with a variety of
 While Scheduling Context Hypervisor Plugin comes with a variety of
 resizing policies (see \ref ResizingStrategies), it may sometimes be
 resizing policies (see \ref ResizingStrategies), it may sometimes be

+ 1 - 1
doc/doxygen/chapters/21simgrid.doxy

@@ -22,7 +22,7 @@ get the simulated time, it has to use starpu_timing_now() which returns the
 virtual timestamp in us.
 virtual timestamp in us.
 
 
 For some technical reason, the application's .c file which contains main() has
 For some technical reason, the application's .c file which contains main() has
-to be recompiled with starpu.h, which in the simgrid case will #define main()
+to be recompiled with starpu.h, which in the simgrid case will # define main()
 into starpu_main(), and it is libstarpu which will provide the real main() and
 into starpu_main(), and it is libstarpu which will provide the real main() and
 call the application's main().
 call the application's main().
 
 

+ 8 - 0
doc/doxygen/chapters/40environment_variables.doxy

@@ -43,6 +43,14 @@ environment variable \ref STARPU_WORKERS_CUDAID. By default, StarPU will
 create as many CUDA workers as there are CUDA devices.
 create as many CUDA workers as there are CUDA devices.
 </dd>
 </dd>
 
 
+<dt>STARPU_NWORKER_PER_CUDA</dt>
+<dd>
+\anchor STARPU_NWORKER_PER_CUDA
+\addindex __env__STARPU_NWORKER_PER_CUDA
+Specify the number of workers per CUDA device, and thus the number of kernels
+which will run concurrently on each device. The default value is 1.
+</dd>
+
 <dt>STARPU_NOPENCL</dt>
 <dt>STARPU_NOPENCL</dt>
 <dd>
 <dd>
 \anchor STARPU_NOPENCL
 \anchor STARPU_NOPENCL

+ 3 - 3
doc/doxygen/chapters/api/threads.doxy

@@ -152,7 +152,7 @@ This function initializes the thread attributes object pointed to by
 \p attr with default attribute values.
 \p attr with default attribute values.
 
 
 It does not do anything when the simulated performance mode is enabled
 It does not do anything when the simulated performance mode is enabled
-(\ref SimulatedPerformance).
+(\ref SimGridSupport).
 
 
 \fn int starpu_pthread_attr_destroy(starpu_pthread_attr_t *attr)
 \fn int starpu_pthread_attr_destroy(starpu_pthread_attr_t *attr)
 \ingroup API_Threads
 \ingroup API_Threads
@@ -161,7 +161,7 @@ required. Destroying a thread attributes object has no effect on
 threads that were created using that object.
 threads that were created using that object.
 
 
 It does not do anything when the simulated performance mode is enabled
 It does not do anything when the simulated performance mode is enabled
-(\ref SimulatedPerformance).
+(\ref SimGridSupport).
 
 
 \fn int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachstate)
 \fn int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachstate)
 \ingroup API_Threads
 \ingroup API_Threads
@@ -172,7 +172,7 @@ created using the thread attributes object \p attr will be created in
 a joinable or a detached state.
 a joinable or a detached state.
 
 
 It does not do anything when the simulated performance mode is enabled
 It does not do anything when the simulated performance mode is enabled
-(\ref SimulatedPerformance).
+(\ref SimGridSupport).
 
 
 \fn int starpu_pthread_mutex_init(starpu_pthread_mutex_t *mutex, const starpu_pthread_mutexattr_t *mutexattr)
 \fn int starpu_pthread_mutex_init(starpu_pthread_mutex_t *mutex, const starpu_pthread_mutexattr_t *mutexattr)
 \ingroup API_Threads
 \ingroup API_Threads

+ 3 - 3
doc/doxygen/doxygen.cfg

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
 # Copyright (C) 2009-2013  Université de Bordeaux 1
 # Copyright (C) 2009-2013  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011, 2012  Institut National de Recherche en Informatique et Automatique
 # Copyright (C) 2011, 2012  Institut National de Recherche en Informatique et Automatique
 #
 #
@@ -645,13 +645,13 @@ QUIET                  = YES
 # generated by doxygen. Possible values are YES and NO. If left blank
 # generated by doxygen. Possible values are YES and NO. If left blank
 # NO is used.
 # NO is used.
 
 
-WARNINGS               = YES
+WARNINGS               = NO
 
 
 # If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
 # If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
 # for undocumented members. If EXTRACT_ALL is set to YES then this flag will
 # for undocumented members. If EXTRACT_ALL is set to YES then this flag will
 # automatically be disabled.
 # automatically be disabled.
 
 
-WARN_IF_UNDOCUMENTED   = YES
+WARN_IF_UNDOCUMENTED   = NO
 
 
 # If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
 # If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
 # potential errors in the documentation, such as not documenting some
 # potential errors in the documentation, such as not documenting some

+ 2 - 1
include/starpu_perfmodel.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -107,6 +107,7 @@ struct starpu_perfmodel_per_arch
 
 
 enum starpu_perfmodel_type
 enum starpu_perfmodel_type
 {
 {
+        STARPU_PERFMODEL_INVALID=0,
 	STARPU_PER_ARCH,
 	STARPU_PER_ARCH,
 	STARPU_COMMON,
 	STARPU_COMMON,
 	STARPU_HISTORY_BASED,
 	STARPU_HISTORY_BASED,

+ 3 - 3
src/datawizard/data_request.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2009-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -482,7 +482,7 @@ int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, uns
 			break;
 			break;
 		}
 		}
 
 
-		*pushed++;
+		(*pushed)++;
 	}
 	}
 
 
 	while (!_starpu_data_request_list_empty(local_list))
 	while (!_starpu_data_request_list_empty(local_list))
@@ -570,7 +570,7 @@ void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc
 			break;
 			break;
 		}
 		}
 
 
-		*pushed++;
+		(*pushed)++;
 	}
 	}
 
 
 	while(!_starpu_data_request_list_empty(local_list))
 	while(!_starpu_data_request_list_empty(local_list))

+ 11 - 11
src/datawizard/memalloc.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2009-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -259,7 +259,7 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 	if (mc->automatically_allocated &&
 	if (mc->automatically_allocated &&
 		(!handle || replicate->refcnt == 0))
 		(!handle || replicate->refcnt == 0))
 	{
 	{
-		void *interface;
+		void *data_interface;
 
 
 		if (handle)
 		if (handle)
 			STARPU_ASSERT(replicate->allocated);
 			STARPU_ASSERT(replicate->allocated);
@@ -276,13 +276,13 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 #endif
 #endif
 
 
 		if (handle)
 		if (handle)
-			interface = replicate->data_interface;
+			data_interface = replicate->data_interface;
 		else
 		else
-			interface = mc->chunk_interface;
-		STARPU_ASSERT(interface);
+			data_interface = mc->chunk_interface;
+		STARPU_ASSERT(data_interface);
 
 
 		_STARPU_TRACE_START_FREE(node, mc->size);
 		_STARPU_TRACE_START_FREE(node, mc->size);
-		mc->ops->free_data_on_node(interface, node);
+		mc->ops->free_data_on_node(data_interface, node);
 		_STARPU_TRACE_END_FREE(node);
 		_STARPU_TRACE_END_FREE(node);
 
 
 		if (handle)
 		if (handle)
@@ -417,7 +417,7 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
  * therefore not in the cache. */
  * therefore not in the cache. */
 static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_replicate, struct _starpu_mem_chunk *mc, unsigned is_already_in_mc_list)
 static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_replicate, struct _starpu_mem_chunk *mc, unsigned is_already_in_mc_list)
 {
 {
-	void *interface;
+	void *data_interface;
 
 
 	/* we found an appropriate mem chunk: so we get it out
 	/* we found an appropriate mem chunk: so we get it out
 	 * of the "to free" list, and reassign it to the new
 	 * of the "to free" list, and reassign it to the new
@@ -429,18 +429,18 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 		old_replicate->allocated = 0;
 		old_replicate->allocated = 0;
 		old_replicate->automatically_allocated = 0;
 		old_replicate->automatically_allocated = 0;
 		old_replicate->initialized = 0;
 		old_replicate->initialized = 0;
-		interface = old_replicate->data_interface;
+		data_interface = old_replicate->data_interface;
 	}
 	}
 	else
 	else
-		interface = mc->chunk_interface;
+		data_interface = mc->chunk_interface;
 
 
 	new_replicate->allocated = 1;
 	new_replicate->allocated = 1;
 	new_replicate->automatically_allocated = 1;
 	new_replicate->automatically_allocated = 1;
 	new_replicate->initialized = 0;
 	new_replicate->initialized = 0;
 
 
 	STARPU_ASSERT(new_replicate->data_interface);
 	STARPU_ASSERT(new_replicate->data_interface);
-	STARPU_ASSERT(interface);
-	memcpy(new_replicate->data_interface, interface, mc->size_interface);
+	STARPU_ASSERT(data_interface);
+	memcpy(new_replicate->data_interface, data_interface, mc->size_interface);
 
 
 	if (!old_replicate)
 	if (!old_replicate)
 	{
 	{

+ 4 - 0
src/drivers/opencl/driver_opencl.c

@@ -685,7 +685,11 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		int err;
 		int err;
 		cl_command_queue queue;
 		cl_command_queue queue;
 		starpu_opencl_get_queue(args->devid, &queue);
 		starpu_opencl_get_queue(args->devid, &queue);
+#ifdef HAVE_CLENQUEUEMARKERWITHWAITLIST
+		err = clEnqueueMarkerWithWaitList(queue, 0, NULL, &task_events[args->devid]);
+#else
 		err = clEnqueueMarker(queue, &task_events[args->devid]);
 		err = clEnqueueMarker(queue, &task_events[args->devid]);
+#endif
 		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
 		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 	}
 	else
 	else

+ 1 - 1
tests/perfmodels/user_base.c

@@ -29,7 +29,7 @@ size_t get_size_base(struct starpu_task *task, unsigned nimpl)
 	return 3;
 	return 3;
 };
 };
 
 
-size_t get_footprint(struct starpu_task *task)
+uint32_t get_footprint(struct starpu_task *task)
 {
 {
 	uint32_t orig = starpu_task_data_footprint(task);
 	uint32_t orig = starpu_task_data_footprint(task);
 	return starpu_hash_crc32c_be(42, orig);
 	return starpu_hash_crc32c_be(42, orig);