|
@@ -9,7 +9,7 @@
|
|
|
@menu
|
|
|
* Initialization and Termination:: Initialization and Termination methods
|
|
|
* Workers' Properties:: Methods to enumerate workers' properties
|
|
|
-* Data Management:: Methods to manipulate data
|
|
|
+* Data Library:: Methods to manipulate data
|
|
|
* Data Interfaces::
|
|
|
* Data Partition::
|
|
|
* Codelets and Tasks:: Methods to construct tasks
|
|
@@ -36,27 +36,9 @@ Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}
|
|
|
indicates that no worker was available (so that StarPU was not initialized).
|
|
|
@end deftypefun
|
|
|
|
|
|
-@deftp {Data Type} {struct starpu_driver}
|
|
|
-@table @asis
|
|
|
-@item @code{enum starpu_archtype type}
|
|
|
-The type of the driver. Only STARPU_CPU_DRIVER, STARPU_CUDA_DRIVER and
|
|
|
-STARPU_OPENCL_DRIVER are currently supported.
|
|
|
-@item @code{union id} Anonymous union
|
|
|
-@table @asis
|
|
|
-@item @code{unsigned cpu_id}
|
|
|
-Should only be used if type is STARPU_CPU_WORKER.
|
|
|
-@item @code{unsigned cuda_id}
|
|
|
-Should only be used if type is STARPU_CUDA_WORKER.
|
|
|
-@item @code{cl_device_id opencl_id}
|
|
|
-Should only be used if type is STARPU_OPENCL_WORKER.
|
|
|
-@end table
|
|
|
-@end table
|
|
|
-@end deftp
|
|
|
-
|
|
|
-
|
|
|
@deftp {Data Type} {struct starpu_conf}
|
|
|
This structure is passed to the @code{starpu_init} function in order
|
|
|
-to configure StarPU. It has to be initialized with @code{starpu_conf_init}.
|
|
|
+to configure StarPU.
|
|
|
When the default value is used, StarPU automatically selects the number of
|
|
|
processing units and takes the default scheduling policy. The environment
|
|
|
variables overwrite the equivalent parameters.
|
|
@@ -72,7 +54,7 @@ if @code{sched_policy_name} is set.
|
|
|
|
|
|
@item @code{int ncpus} (default = -1)
|
|
|
This is the number of CPU cores that StarPU can use. This can also be
|
|
|
-specified with the @code{STARPU_NCPU} environment variable.
|
|
|
+specified with the @code{STARPU_NCPUS} environment variable.
|
|
|
|
|
|
@item @code{int ncuda} (default = -1)
|
|
|
This is the number of CUDA devices that StarPU can use. This can also
|
|
@@ -123,19 +105,11 @@ contains the logical identifiers of the OpenCL devices to be used.
|
|
|
|
|
|
@item @code{int calibrate} (default = 0)
|
|
|
If this flag is set, StarPU will calibrate the performance models when
|
|
|
-executing tasks. If this value is equal to @code{-1}, the default value is
|
|
|
-used. If the value is equal to @code{1}, it will force continuing
|
|
|
-calibration. If the value is equal to @code{2}, the existing performance
|
|
|
-models will be overwritten. This can also be specified with the
|
|
|
-@code{STARPU_CALIBRATE} environment variable.
|
|
|
-
|
|
|
-@item @code{int bus_calibrate} (default = 0)
|
|
|
-If this flag is set, StarPU will recalibrate the bus. If this value is equal
|
|
|
-to @code{-1}, the default value is used. This can also be specified with the
|
|
|
-@code{STARPU_BUS_CALIBRATE} environment variable.
|
|
|
+executing tasks. If this value is equal to -1, the default value is used. This
|
|
|
+can also be specified with the @code{STARPU_CALIBRATE} environment variable.
|
|
|
|
|
|
@item @code{int single_combined_worker} (default = 0)
|
|
|
-By default, StarPU executes parallel tasks concurrently.
|
|
|
+By default, StarPU executes parallel tasks concurrently.
|
|
|
Some parallel libraries (e.g. most OpenMP implementations) however do
|
|
|
not support concurrent calls to parallel code. In such case, setting this flag
|
|
|
makes StarPU only start one parallel task at a time.
|
|
@@ -143,46 +117,11 @@ This can also be specified with the @code{STARPU_SINGLE_COMBINED_WORKER} environ
|
|
|
|
|
|
@item @code{int disable_asynchronous_copy} (default = 0)
|
|
|
This flag should be set to 1 to disable asynchronous copies between
|
|
|
-CPUs and all accelerators. This can also be specified with the
|
|
|
+CPUs and accelerators. This can also be specified with the
|
|
|
@code{STARPU_DISABLE_ASYNCHRONOUS_COPY} environment variable.
|
|
|
The AMD implementation of OpenCL is known to
|
|
|
fail when copying data asynchronously. When using this implementation,
|
|
|
it is therefore necessary to disable asynchronous data transfers.
|
|
|
-This can also be specified at compilation time by giving to the
|
|
|
-configure script the option @code{--disable-asynchronous-copy}.
|
|
|
-
|
|
|
-@item @code{int disable_cuda_asynchronous_copy} (default = 0)
|
|
|
-This flag should be set to 1 to disable asynchronous copies between
|
|
|
-CPUs and CUDA accelerators. This can also be specified with the
|
|
|
-@code{STARPU_DISABLE_CUDA_ASYNCHRONOUS_COPY} environment variable.
|
|
|
-This can also be specified at compilation time by giving to the
|
|
|
-configure script the option @code{--disable-asynchronous-cuda-copy}.
|
|
|
-
|
|
|
-@item @code{int disable_opencl_asynchronous_copy} (default = 0)
|
|
|
-This flag should be set to 1 to disable asynchronous copies between
|
|
|
-CPUs and OpenCL accelerators. This can also be specified with the
|
|
|
-@code{STARPU_DISABLE_OPENCL_ASYNCHRONOUS_COPY} environment variable.
|
|
|
-The AMD implementation of OpenCL is known to
|
|
|
-fail when copying data asynchronously. When using this implementation,
|
|
|
-it is therefore necessary to disable asynchronous data transfers.
|
|
|
-This can also be specified at compilation time by giving to the
|
|
|
-configure script the option @code{--disable-asynchronous-opencl-copy}.
|
|
|
-
|
|
|
-@item @code{int *cuda_opengl_interoperability} (default = NULL)
|
|
|
-This can be set to an array of CUDA device identifiers for which
|
|
|
-@code{cudaGLSetGLDevice} should be called instead of @code{cudaSetDevice}. Its
|
|
|
-size is specified by the @code{n_cuda_opengl_interoperability} field below
|
|
|
-
|
|
|
-@item @code{int *n_cuda_opengl_interoperability} (default = 0)
|
|
|
-This has to be set to the size of the array pointed to by the
|
|
|
-@code{cuda_opengl_interoperability} field.
|
|
|
-
|
|
|
-@item @code{struct starpu_driver *not_launched_drivers}
|
|
|
-The drivers that should not be launched by StarPU.
|
|
|
-
|
|
|
-@item @code{unsigned nnot_launched_drivers}
|
|
|
-The number of StarPU drivers that should not be launched by StarPU.
|
|
|
-
|
|
|
@end table
|
|
|
@end deftp
|
|
|
|
|
@@ -192,7 +131,7 @@ with the default values. In case some configuration parameters are already
|
|
|
specified through environment variables, @code{starpu_conf_init} initializes
|
|
|
the fields of the structure according to the environment variables. For
|
|
|
instance if @code{STARPU_CALIBRATE} is set, its value is put in the
|
|
|
-@code{.calibrate} field of the structure passed as argument.
|
|
|
+@code{.calibrate} field of the structure passed as argument.
|
|
|
|
|
|
Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}
|
|
|
indicates that the argument was NULL.
|
|
@@ -209,16 +148,6 @@ Return 1 if asynchronous data transfers between CPU and accelerators
|
|
|
are disabled.
|
|
|
@end deftypefun
|
|
|
|
|
|
-@deftypefun int starpu_asynchronous_cuda_copy_disabled ()
|
|
|
-Return 1 if asynchronous data transfers between CPU and CUDA accelerators
|
|
|
-are disabled.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun int starpu_asynchronous_opencl_copy_disabled ()
|
|
|
-Return 1 if asynchronous data transfers between CPU and OpenCL accelerators
|
|
|
-are disabled.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
@node Workers' Properties
|
|
|
@section Workers' Properties
|
|
|
|
|
@@ -238,8 +167,8 @@ StarPU tasks). The returned value should be at most @code{STARPU_NMAXWORKERS}.
|
|
|
@end deftypefun
|
|
|
|
|
|
@deftypefun int starpu_worker_get_count_by_type ({enum starpu_archtype} @var{type})
|
|
|
-Returns the number of workers of the given @var{type}. A positive
|
|
|
-(or @code{NULL}) value is returned in case of success, @code{-EINVAL} indicates that
|
|
|
+Returns the number of workers of the given type indicated by the argument. A positive
|
|
|
+(or zero) value is returned in case of success, @code{-EINVAL} indicates that
|
|
|
the type is not valid otherwise.
|
|
|
@end deftypefun
|
|
|
|
|
@@ -335,12 +264,12 @@ this function should be used in the allocation function to determine
|
|
|
on which device the memory needs to be allocated.
|
|
|
@end deftypefun
|
|
|
|
|
|
-@node Data Management
|
|
|
-@section Data Management
|
|
|
+@node Data Library
|
|
|
+@section Data Library
|
|
|
|
|
|
@menu
|
|
|
-* Introduction to Data Management::
|
|
|
-* Basic Data Management API::
|
|
|
+* Introduction to Data Library::
|
|
|
+* Basic Data Library API::
|
|
|
* Access registered data from the application::
|
|
|
@end menu
|
|
|
|
|
@@ -349,7 +278,7 @@ This section describes the data management facilities provided by StarPU.
|
|
|
We show how to use existing data interfaces in @ref{Data Interfaces}, but developers can
|
|
|
design their own data interfaces if required.
|
|
|
|
|
|
-@node Introduction to Data Management
|
|
|
+@node Introduction to Data Library
|
|
|
@subsection Introduction
|
|
|
Data management is done at a high-level in StarPU: rather than accessing a mere
|
|
|
list of contiguous buffers, the tasks may manipulate data that are described by
|
|
@@ -377,8 +306,8 @@ to StarPU, the specified memory node indicates where the piece of data
|
|
|
initially resides (we also call this memory node the home node of a piece of
|
|
|
data).
|
|
|
|
|
|
-@node Basic Data Management API
|
|
|
-@subsection Basic Data Management API
|
|
|
+@node Basic Data Library API
|
|
|
+@subsection Basic Data Library API
|
|
|
|
|
|
@deftypefun int starpu_malloc (void **@var{A}, size_t @var{dim})
|
|
|
This function allocates data of the given size in main memory. It will also try to pin it in
|
|
@@ -468,10 +397,6 @@ access to the handle must be performed in write-only mode. Accessing an
|
|
|
invalidated data in read-mode results in undefined behaviour.
|
|
|
@end deftypefun
|
|
|
|
|
|
-@deftypefun void starpu_data_invalidate_submit (starpu_data_handle_t @var{handle})
|
|
|
-Submits invalidation of the data handle after completion of previously submitted tasks.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
@c TODO create a specific sections about user interaction with the DSM ?
|
|
|
|
|
|
@deftypefun void starpu_data_set_wt_mask (starpu_data_handle_t @var{handle}, uint32_t @var{wt_mask})
|
|
@@ -527,7 +452,7 @@ be consistent with the access mode specified in the @var{mode} argument.
|
|
|
access the piece of data anymore. Note that implicit data
|
|
|
dependencies are also enforced by @code{starpu_data_acquire}, i.e.
|
|
|
@code{starpu_data_acquire} will wait for all tasks scheduled to work on
|
|
|
-the data, unless they have been disabled explictly by calling
|
|
|
+the data, unless they have been disabled explicitly by calling
|
|
|
@code{starpu_data_set_default_sequential_consistency_flag} or
|
|
|
@code{starpu_data_set_sequential_consistency_flag}.
|
|
|
@code{starpu_data_acquire} is a blocking call, so that it cannot be called from
|
|
@@ -538,28 +463,18 @@ tasks or from their callbacks (in that case, @code{starpu_data_acquire} returns
|
|
|
|
|
|
@deftypefun int starpu_data_acquire_cb (starpu_data_handle_t @var{handle}, {enum starpu_access_mode} @var{mode}, void (*@var{callback})(void *), void *@var{arg})
|
|
|
@code{starpu_data_acquire_cb} is the asynchronous equivalent of
|
|
|
-@code{starpu_data_acquire}. When the data specified in the first argument is
|
|
|
+@code{starpu_data_acquire}. When the data specified in the first argument is
|
|
|
available in the appropriate access mode, the callback function is executed.
|
|
|
The application may access the requested data during the execution of this
|
|
|
callback. The callback function must call @code{starpu_data_release} once the
|
|
|
application does not need to access the piece of data anymore.
|
|
|
Note that implicit data dependencies are also enforced by
|
|
|
-@code{starpu_data_acquire_cb} in case they are not disabled.
|
|
|
+@code{starpu_data_acquire_cb} in case they are enabled.
|
|
|
Contrary to @code{starpu_data_acquire}, this function is non-blocking and may
|
|
|
be called from task callbacks. Upon successful completion, this function
|
|
|
returns 0.
|
|
|
@end deftypefun
|
|
|
|
|
|
-@deftypefun int starpu_data_acquire_on_node (starpu_data_handle_t @var{handle}, unsigned @var{node}, {enum starpu_access_mode} @var{mode})
|
|
|
-This is the same as @code{starpu_data_acquire}, except that the data will be
|
|
|
-available on the given memory node instead of main memory.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun int starpu_data_acquire_on_node_cb (starpu_data_handle_t @var{handle}, unsigned @var{node}, {enum starpu_access_mode} @var{mode}, void (*@var{callback})(void *), void *@var{arg})
|
|
|
-This is the same as @code{starpu_data_acquire_cb}, except that the data will be
|
|
|
-available on the given memory node instead of main memory.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
@defmac STARPU_DATA_ACQUIRE_CB (starpu_data_handle_t @var{handle}, {enum starpu_access_mode} @var{mode}, code)
|
|
|
@code{STARPU_DATA_ACQUIRE_CB} is the same as @code{starpu_data_acquire_cb},
|
|
|
except that the code to be executed in a callback is directly provided as a
|
|
@@ -663,12 +578,12 @@ starpu_block_data_register(&block_handle, 0, (uintptr_t)block,
|
|
|
@deftypefun void starpu_bcsr_data_register (starpu_data_handle_t *@var{handle}, uint32_t @var{home_node}, uint32_t @var{nnz}, uint32_t @var{nrow}, uintptr_t @var{nzval}, uint32_t *@var{colind}, uint32_t *@var{rowptr}, uint32_t @var{firstentry}, uint32_t @var{r}, uint32_t @var{c}, size_t @var{elemsize})
|
|
|
This variant of @code{starpu_data_register} uses the BCSR (Blocked
|
|
|
Compressed Sparse Row Representation) sparse matrix interface.
|
|
|
-Register the sparse matrix made of @var{nnz} non-zero blocks of elements of size
|
|
|
+Register the sparse matrix made of @var{nnz} non-zero values of size
|
|
|
@var{elemsize} stored in @var{nzval} and initializes @var{handle} to represent
|
|
|
it. Blocks have size @var{r} * @var{c}. @var{nrow} is the number of rows (in
|
|
|
-terms of blocks), @code{colind[i]} is the block-column index for block @code{i}
|
|
|
-in @code{nzval}, @code{rowptr[i]} is the block-index (in nzval) of the first block of row @code{i}.
|
|
|
-@var{firstentry} is the index of the first entry of the given arrays (usually 0
|
|
|
+terms of blocks), @var{colind} is the list of positions of the non-zero entries
|
|
|
+on the row, @var{rowptr} is the index (in nzval) of the first entry of the row.
|
|
|
+@var{firstentry} is the index of the first entry of the given arrays (usually 0
|
|
|
or 1).
|
|
|
@end deftypefun
|
|
|
|
|
@@ -732,23 +647,6 @@ if @var{handle}'s interface does not have data allocated locally
|
|
|
Return the unique identifier of the interface associated with the given @var{handle}.
|
|
|
@end deftypefun
|
|
|
|
|
|
-@deftypefun size_t starpu_handle_get_size (starpu_data_handle_t @var{handle})
|
|
|
-Return the size of the data associated with @var{handle}
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun int starpu_handle_pack_data (starpu_data_handle_t @var{handle}, {void **}@var{ptr})
|
|
|
-Allocates a buffer large enough at @var{ptr} and copy to the newly
|
|
|
-allocated buffer the data associated to @var{handle}. The interface of
|
|
|
-the data registered at @var{handle} must define a packing operation
|
|
|
-(@pxref{struct starpu_data_interface_ops}).
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun int starpu_handle_unpack_data (starpu_data_handle_t @var{handle}, {void *}@var{ptr})
|
|
|
-Copy in @var{handle} the data located at @var{ptr} as described by the
|
|
|
-interface of the data. The interface registered at @var{handle} must
|
|
|
-define a unpacking operation (@pxref{struct starpu_data_interface_ops}).
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
@node Accessing Variable Data Interfaces
|
|
|
@subsubsection Variable Data Interfaces
|
|
|
|
|
@@ -1110,7 +1008,7 @@ subdata according to the filter @var{f}, as shown in the following example:
|
|
|
@cartouche
|
|
|
@smallexample
|
|
|
struct starpu_data_filter f = @{
|
|
|
- .filter_func = starpu_block_filter_func,
|
|
|
+ .filter_func = starpu_vertical_block_filter_func,
|
|
|
.nchildren = nslicesx,
|
|
|
.get_nchildren = NULL,
|
|
|
.get_child_ops = NULL
|
|
@@ -1122,8 +1020,7 @@ starpu_data_partition(A_handle, &f);
|
|
|
|
|
|
@deftypefun void starpu_data_unpartition (starpu_data_handle_t @var{root_data}, uint32_t @var{gathering_node})
|
|
|
This unapplies one filter, thus unpartitioning the data. The pieces of data are
|
|
|
-collected back into one big piece in the @var{gathering_node} (usually 0). Tasks
|
|
|
-working on the partitioned data must be already finished when calling @code{starpu_data_unpartition}.
|
|
|
+collected back into one big piece in the @var{gathering_node} (usually 0).
|
|
|
@cartouche
|
|
|
@smallexample
|
|
|
starpu_data_unpartition(A_handle, 0);
|
|
@@ -1176,16 +1073,38 @@ starpu_data_filter.
|
|
|
@subsection Predefined filter functions
|
|
|
|
|
|
@menu
|
|
|
-* Partitioning Vector Data::
|
|
|
-* Partitioning Matrix Data::
|
|
|
-* Partitioning 3D Matrix Data::
|
|
|
* Partitioning BCSR Data::
|
|
|
+* Partitioning BLAS interface::
|
|
|
+* Partitioning Vector Data::
|
|
|
+* Partitioning Block Data::
|
|
|
@end menu
|
|
|
|
|
|
This section gives a partial list of the predefined partitioning functions.
|
|
|
Examples on how to use them are shown in @ref{Partitioning Data}. The complete
|
|
|
list can be found in @code{starpu_data_filters.h} .
|
|
|
|
|
|
+@node Partitioning BCSR Data
|
|
|
+@subsubsection Partitioning BCSR Data
|
|
|
+
|
|
|
+@deftypefun void starpu_canonical_block_filter_bcsr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
+This partitions a block-sparse matrix into dense matrices.
|
|
|
+@end deftypefun
|
|
|
+
|
|
|
+@deftypefun void starpu_vertical_block_filter_func_csr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
+This partitions a block-sparse matrix into vertical block-sparse matrices.
|
|
|
+@end deftypefun
|
|
|
+
|
|
|
+@node Partitioning BLAS interface
|
|
|
+@subsubsection Partitioning BLAS interface
|
|
|
+
|
|
|
+@deftypefun void starpu_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
+This partitions a dense Matrix into horizontal blocks.
|
|
|
+@end deftypefun
|
|
|
+
|
|
|
+@deftypefun void starpu_vertical_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
+This partitions a dense Matrix into vertical blocks.
|
|
|
+@end deftypefun
|
|
|
+
|
|
|
@node Partitioning Vector Data
|
|
|
@subsubsection Partitioning Vector Data
|
|
|
|
|
@@ -1195,18 +1114,6 @@ vector represented by @var{father_interface} once partitioned in
|
|
|
@var{nparts} chunks of equal size.
|
|
|
@end deftypefun
|
|
|
|
|
|
-@deftypefun void starpu_block_shadow_filter_func_vector (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
-Return in @code{*@var{child_interface}} the @var{id}th element of the
|
|
|
-vector represented by @var{father_interface} once partitioned in
|
|
|
-@var{nparts} chunks of equal size with a shadow border @code{filter_arg_ptr}, thus getting a vector of size (n-2*shadow)/nparts+2*shadow
|
|
|
-
|
|
|
-The @code{filter_arg_ptr} field must be the shadow size casted into @code{void*}.
|
|
|
-
|
|
|
-IMPORTANT: This can only be used for read-only access, as no coherency is
|
|
|
-enforced for the shadowed parts.
|
|
|
-
|
|
|
-A usage example is available in examples/filters/shadow.c
|
|
|
-@end deftypefun
|
|
|
|
|
|
@deftypefun void starpu_vector_list_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
Return in @code{*@var{child_interface}} the @var{id}th element of the
|
|
@@ -1227,107 +1134,11 @@ chunks of equal size, ignoring @var{nparts}. Thus, @var{id} must be
|
|
|
@end deftypefun
|
|
|
|
|
|
|
|
|
-@node Partitioning Matrix Data
|
|
|
-@subsubsection Partitioning Matrix Data
|
|
|
-
|
|
|
-@deftypefun void starpu_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
-This partitions a dense Matrix along the x dimension, thus getting (x/nparts,y)
|
|
|
-matrices. If nparts does not divide x, the last submatrix contains the
|
|
|
-remainder.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun void starpu_block_shadow_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
-This partitions a dense Matrix along the x dimension, with a shadow border
|
|
|
-@code{filter_arg_ptr}, thus getting ((x-2*shadow)/nparts+2*shadow,y)
|
|
|
-matrices. If nparts does not divide x-2*shadow, the last submatrix contains the
|
|
|
-remainder.
|
|
|
-
|
|
|
-IMPORTANT: This can only be used for read-only access, as no coherency is
|
|
|
-enforced for the shadowed parts.
|
|
|
-
|
|
|
-A usage example is available in examples/filters/shadow2d.c
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun void starpu_vertical_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
-This partitions a dense Matrix along the y dimension, thus getting (x,y/nparts)
|
|
|
-matrices. If nparts does not divide y, the last submatrix contains the
|
|
|
-remainder.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun void starpu_vertical_block_shadow_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
-This partitions a dense Matrix along the y dimension, with a shadow border
|
|
|
-@code{filter_arg_ptr}, thus getting (x,(y-2*shadow)/nparts+2*shadow)
|
|
|
-matrices. If nparts does not divide y-2*shadow, the last submatrix contains the
|
|
|
-remainder.
|
|
|
-
|
|
|
-IMPORTANT: This can only be used for read-only access, as no coherency is
|
|
|
-enforced for the shadowed parts.
|
|
|
-
|
|
|
-A usage example is available in examples/filters/shadow2d.c
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@node Partitioning 3D Matrix Data
|
|
|
-@subsubsection Partitioning 3D Matrix Data
|
|
|
-
|
|
|
-A usage example is available in examples/filters/shadow3d.c
|
|
|
+@node Partitioning Block Data
|
|
|
+@subsubsection Partitioning Block Data
|
|
|
|
|
|
@deftypefun void starpu_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
-This partitions a 3D matrix along the X dimension, thus getting (x/nparts,y,z)
|
|
|
-3D matrices. If nparts does not divide x, the last submatrix contains the
|
|
|
-remainder.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun void starpu_block_shadow_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
-This partitions a 3D matrix along the X dimension, with a shadow border
|
|
|
-@code{filter_arg_ptr}, thus getting ((x-2*shadow)/nparts+2*shadow,y,z) 3D
|
|
|
-matrices. If nparts does not divide x, the last submatrix contains the
|
|
|
-remainder.
|
|
|
-
|
|
|
-IMPORTANT: This can only be used for read-only access, as no coherency is
|
|
|
-enforced for the shadowed parts.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun void starpu_vertical_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
-This partitions a 3D matrix along the Y dimension, thus getting (x,y/nparts,z)
|
|
|
-3D matrices. If nparts does not divide y, the last submatrix contains the
|
|
|
-remainder.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun void starpu_vertical_block_shadow_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
-This partitions a 3D matrix along the Y dimension, with a shadow border
|
|
|
-@code{filter_arg_ptr}, thus getting (x,(y-2*shadow)/nparts+2*shadow,z) 3D
|
|
|
-matrices. If nparts does not divide y, the last submatrix contains the
|
|
|
-remainder.
|
|
|
-
|
|
|
-IMPORTANT: This can only be used for read-only access, as no coherency is
|
|
|
-enforced for the shadowed parts.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun void starpu_depth_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
-This partitions a 3D matrix along the Z dimension, thus getting (x,y,z/nparts)
|
|
|
-3D matrices. If nparts does not divide z, the last submatrix contains the
|
|
|
-remainder.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun void starpu_depth_block_shadow_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
-This partitions a 3D matrix along the Z dimension, with a shadow border
|
|
|
-@code{filter_arg_ptr}, thus getting (x,y,(z-2*shadow)/nparts+2*shadow)
|
|
|
-3D matrices. If nparts does not divide z, the last submatrix contains the
|
|
|
-remainder.
|
|
|
-
|
|
|
-IMPORTANT: This can only be used for read-only access, as no coherency is
|
|
|
-enforced for the shadowed parts.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@node Partitioning BCSR Data
|
|
|
-@subsubsection Partitioning BCSR Data
|
|
|
-
|
|
|
-@deftypefun void starpu_canonical_block_filter_bcsr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
-This partitions a block-sparse matrix into dense matrices.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun void starpu_vertical_block_filter_func_csr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
|
|
|
-This partitions a block-sparse matrix into vertical block-sparse matrices.
|
|
|
+This partitions a 3D matrix along the X axis.
|
|
|
@end deftypefun
|
|
|
|
|
|
@node Codelets and Tasks
|
|
@@ -1402,9 +1213,7 @@ always only define the field @code{opencl_funcs}.
|
|
|
|
|
|
@deftp {Data Type} {struct starpu_codelet}
|
|
|
The codelet structure describes a kernel that is possibly implemented on various
|
|
|
-targets. For compatibility, make sure to initialize the whole structure to zero,
|
|
|
-either by using explicit memset, or by letting the compiler implicitly do it in
|
|
|
-e.g. static storage case.
|
|
|
+targets. For compatibility, make sure to initialize the whole structure to zero.
|
|
|
|
|
|
@table @asis
|
|
|
@item @code{uint32_t where} (optional)
|
|
@@ -1420,9 +1229,7 @@ unset, its value will be automatically set based on the availability
|
|
|
of the @code{XXX_funcs} fields defined below.
|
|
|
|
|
|
@item @code{int (*can_execute)(unsigned workerid, struct starpu_task *task, unsigned nimpl)} (optional)
|
|
|
-Defines a function which should return 1 if the worker designated by
|
|
|
-@var{workerid} can execute the @var{nimpl}th implementation of the
|
|
|
-given @var{task}, 0 otherwise.
|
|
|
+Defines a function which should return 1 if the worker designated by @var{workerid} can execute the @var{nimpl}th implementation of the given @var{task}, 0 otherwise.
|
|
|
|
|
|
@item @code{enum starpu_codelet_type type} (optional)
|
|
|
The default is @code{STARPU_SEQ}, i.e. usual sequential implementation. Other
|
|
@@ -1504,13 +1311,11 @@ option when configuring StarPU.
|
|
|
|
|
|
@item @code{struct starpu_perfmodel *model} (optional)
|
|
|
This is a pointer to the task duration performance model associated to this
|
|
|
-codelet. This optional field is ignored when set to @code{NULL} or
|
|
|
-when its @code{symbol} field is not set.
|
|
|
+codelet. This optional field is ignored when set to @code{NULL}.
|
|
|
|
|
|
@item @code{struct starpu_perfmodel *power_model} (optional)
|
|
|
This is a pointer to the task power consumption performance model associated
|
|
|
-to this codelet. This optional field is ignored when set to
|
|
|
-@code{NULL} or when its @code{symbol} field is not set.
|
|
|
+to this codelet. This optional field is ignored when set to @code{NULL}.
|
|
|
In the case of parallel codelets, this has to account for all processing units
|
|
|
involved in the parallel execution.
|
|
|
|
|
@@ -1609,10 +1414,9 @@ codelets, where the @code{cl_arg} pointer is given as such.
|
|
|
@item @code{void (*callback_func)(void *)} (optional) (default: @code{NULL})
|
|
|
This is a function pointer of prototype @code{void (*f)(void *)} which
|
|
|
specifies a possible callback. If this pointer is non-null, the callback
|
|
|
-function is executed @emph{on the host} after the execution of the task. Tasks
|
|
|
-which depend on it might already be executing. The callback is passed the
|
|
|
-value contained in the @code{callback_arg} field. No callback is executed if the
|
|
|
-field is set to @code{NULL}.
|
|
|
+function is executed @emph{on the host} after the execution of the task. The
|
|
|
+callback is passed the value contained in the @code{callback_arg} field. No
|
|
|
+callback is executed if the field is set to @code{NULL}.
|
|
|
|
|
|
@item @code{void *callback_arg} (optional) (default: @code{NULL})
|
|
|
This is the pointer passed to the callback function. This field is ignored if
|
|
@@ -1716,7 +1520,7 @@ submitted if it has not been properly initialized.
|
|
|
Initialize @var{task} with default values. This function is implicitly
|
|
|
called by @code{starpu_task_create}. By default, tasks initialized with
|
|
|
@code{starpu_task_init} must be deinitialized explicitly with
|
|
|
-@code{starpu_task_clean}. Tasks can also be initialized statically,
|
|
|
+@code{starpu_task_deinit}. Tasks can also be initialized statically,
|
|
|
using @code{STARPU_TASK_INITIALIZER} defined below.
|
|
|
@end deftypefun
|
|
|
|
|
@@ -1737,14 +1541,11 @@ by the task have to be freed by calling
|
|
|
@code{starpu_task_destroy}.
|
|
|
@end deftypefun
|
|
|
|
|
|
-@deftypefun void starpu_task_clean ({struct starpu_task} *@var{task})
|
|
|
+@deftypefun void starpu_task_deinit ({struct starpu_task} *@var{task})
|
|
|
Release all the structures automatically allocated to execute @var{task}, but
|
|
|
-not the task structure itself and values set by the user remain unchanged.
|
|
|
-It is thus useful for statically allocated tasks for instance.
|
|
|
-It is also useful when the user wants to execute the same operation several
|
|
|
-times with as least overhead as possible.
|
|
|
-It is called automatically by @code{starpu_task_destroy}.
|
|
|
-It has to be called only after explicitly waiting for the task or after
|
|
|
+not the task structure itself. It is thus useful for statically allocated tasks
|
|
|
+for instance. It is called automatically by @code{starpu_task_destroy}. It
|
|
|
+has to be called only after explicitly waiting for the task or after
|
|
|
@code{starpu_shutdown} (waiting for the callback is not enough, since starpu
|
|
|
still manipulates the task after calling the callback).
|
|
|
@end deftypefun
|
|
@@ -1781,10 +1582,6 @@ function for instance.
|
|
|
In case of success, this function returns 0. A return value of @code{-ENODEV}
|
|
|
means that there is no worker able to process this task (e.g. there is no GPU
|
|
|
available and this task is only implemented for CUDA devices).
|
|
|
-
|
|
|
-starpu_task_submit() can be called from anywhere, including codelet
|
|
|
-functions and callbacks, provided that the @code{synchronous} field of the
|
|
|
-@code{starpu_task} structure is left to 0.
|
|
|
@end deftypefun
|
|
|
|
|
|
@deftypefun int starpu_task_wait_for_all (void)
|
|
@@ -1889,14 +1686,6 @@ This function is similar to @code{starpu_tag_wait} except that it blocks until
|
|
|
terminated.
|
|
|
@end deftypefun
|
|
|
|
|
|
-@deftypefun void starpu_tag_restart (starpu_tag_t @var{id})
|
|
|
-This function can be used to clear the "already notified" status
|
|
|
-of a tag which is not associated with a task. Before that, calling
|
|
|
-@code{starpu_tag_notify_from_apps} again will not notify the successors. After
|
|
|
-that, the next call to @code{starpu_tag_notify_from_apps} will notify the
|
|
|
-successors.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
@deftypefun void starpu_tag_remove (starpu_tag_t @var{id})
|
|
|
This function releases the resources associated to tag @var{id}. It can be
|
|
|
called once the corresponding task has been executed and when there is
|
|
@@ -1908,10 +1697,7 @@ This function explicitly unlocks tag @var{id}. It may be useful in the
|
|
|
case of applications which execute part of their computation outside StarPU
|
|
|
tasks (e.g. third-party libraries). It is also provided as a
|
|
|
convenient tool for the programmer, for instance to entirely construct the task
|
|
|
-DAG before actually giving StarPU the opportunity to execute the tasks. When
|
|
|
-called several times on the same tag, notification will be done only on first
|
|
|
-call, thus implementing "OR" dependencies, until the tag is restarted using
|
|
|
-@code{starpu_tag_restart}.
|
|
|
+DAG before actually giving StarPU the opportunity to execute the tasks.
|
|
|
@end deftypefun
|
|
|
|
|
|
@node Implicit Data Dependencies
|
|
@@ -1976,11 +1762,7 @@ The possible values are:
|
|
|
@anchor{struct starpu_perfmodel}
|
|
|
contains all information about a performance model. At least the
|
|
|
@code{type} and @code{symbol} fields have to be filled when defining a
|
|
|
-performance model for a codelet. For compatibility, make sure to initialize the
|
|
|
-whole structure to zero, either by using explicit memset, or by letting the
|
|
|
-compiler implicitly do it in e.g. static storage case.
|
|
|
-
|
|
|
-If not provided, other fields have to be zero.
|
|
|
+performance model for a codelet. If not provided, other fields have to be zero.
|
|
|
|
|
|
@table @asis
|
|
|
@item @code{type}
|
|
@@ -1995,8 +1777,7 @@ archs will be determined by multiplying by an arch-specific factor.
|
|
|
|
|
|
@item @code{const char *symbol}
|
|
|
is the symbol name for the performance model, which will be used as
|
|
|
-file name to store the model. It must be set otherwise the model will
|
|
|
-be ignored.
|
|
|
+file name to store the model.
|
|
|
|
|
|
@item @code{double (*cost_model)(struct starpu_buffer_descr *)}
|
|
|
This field is deprecated. Use instead the @code{cost_function} field.
|
|
@@ -2011,7 +1792,7 @@ Used by @code{STARPU_HISTORY_BASED} and
|
|
|
implementation number, and returns the size to be used as index for
|
|
|
history and regression.
|
|
|
|
|
|
-@item @code{struct starpu_perfmodel_per_arch per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS]}
|
|
|
+@item @code{struct starpu_per_arch_perfmodel per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS]}
|
|
|
Used by @code{STARPU_PER_ARCH}: array of @code{struct
|
|
|
starpu_per_arch_perfmodel} structures.
|
|
|
|
|
@@ -2028,7 +1809,7 @@ Lock to protect concurrency between loading from disk (W), updating the values
|
|
|
@end table
|
|
|
@end deftp
|
|
|
|
|
|
-@deftp {Data Type} {struct starpu_perfmodel_regression_model}
|
|
|
+@deftp {Data Type} {struct starpu_regression_model}
|
|
|
@table @asis
|
|
|
@item @code{double sumlny} sum of ln(measured)
|
|
|
@item @code{double sumlnx} sum of ln(size)
|
|
@@ -2045,7 +1826,7 @@ Lock to protect concurrency between loading from disk (W), updating the values
|
|
|
@end table
|
|
|
@end deftp
|
|
|
|
|
|
-@deftp {Data Type} {struct starpu_perfmodel_per_arch}
|
|
|
+@deftp {Data Type} {struct starpu_per_arch_perfmodel}
|
|
|
contains information about the performance model of a given arch.
|
|
|
|
|
|
@table @asis
|
|
@@ -2066,11 +1847,11 @@ case it depends on the architecture-specific implementation.
|
|
|
@item @code{struct starpu_htbl32_node *history}
|
|
|
The history of performance measurements.
|
|
|
|
|
|
-@item @code{struct starpu_perfmodel_history_list *list}
|
|
|
+@item @code{struct starpu_history_list *list}
|
|
|
Used by @code{STARPU_HISTORY_BASED} and @code{STARPU_NL_REGRESSION_BASED},
|
|
|
records all execution history measures.
|
|
|
|
|
|
-@item @code{struct starpu_perfmodel_regression_model regression}
|
|
|
+@item @code{struct starpu_regression_model regression}
|
|
|
Used by @code{STARPU_REGRESSION_BASED} and
|
|
|
@code{STARPU_NL_REGRESSION_BASED}, contains the estimated factors of the
|
|
|
regression.
|
|
@@ -2078,7 +1859,7 @@ regression.
|
|
|
@end table
|
|
|
@end deftp
|
|
|
|
|
|
-@deftypefun int starpu_perfmodel_load_symbol ({const char} *@var{symbol}, {struct starpu_perfmodel} *@var{model})
|
|
|
+@deftypefun int starpu_load_history_debug ({const char} *@var{symbol}, {struct starpu_perfmodel} *@var{model})
|
|
|
loads a given performance model. The @var{model} structure has to be completely zero, and will be filled with the information saved in @code{~/.starpu}.
|
|
|
@end deftypefun
|
|
|
|
|
@@ -2090,42 +1871,22 @@ returns the path to the debugging information for the performance model.
|
|
|
returns the architecture name for @var{arch}.
|
|
|
@end deftypefun
|
|
|
|
|
|
+@deftypefun void starpu_force_bus_sampling (void)
|
|
|
+forces sampling the bus performance model again.
|
|
|
+@end deftypefun
|
|
|
+
|
|
|
@deftypefun {enum starpu_perf_archtype} starpu_worker_get_perf_archtype (int @var{workerid})
|
|
|
returns the architecture type of a given worker.
|
|
|
@end deftypefun
|
|
|
|
|
|
-@deftypefun int starpu_perfmodel_list ({FILE *}@var{output})
|
|
|
+@deftypefun int starpu_list_models ({FILE *}@var{output})
|
|
|
prints a list of all performance models on @var{output}.
|
|
|
@end deftypefun
|
|
|
|
|
|
-@deftypefun void starpu_perfmodel_print ({struct starpu_perfmodel *}@var{model}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl}, {char *}@var{parameter}, {uint32_t *}@var{footprint}, {FILE *}@var{output})
|
|
|
-todo
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun int starpu_perfmodel_print_all ({struct starpu_perfmodel *}@var{model}, {char *}@var{arch}, {char *}@var{parameter}, {uint32_t *}@var{footprint}, {FILE *}@var{output})
|
|
|
-todo
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
@deftypefun void starpu_bus_print_bandwidth ({FILE *}@var{f})
|
|
|
prints a matrix of bus bandwidths on @var{f}.
|
|
|
@end deftypefun
|
|
|
|
|
|
-@deftypefun void starpu_bus_print_affinity ({FILE *}@var{f})
|
|
|
-prints the affinity devices on @var{f}.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun void starpu_topology_print ({FILE *}@var{f})
|
|
|
-prints a description of the topology on @var{f}.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun void starpu_perfmodel_update_history ({struct starpu_perfmodel *}@var{model}, {struct starpu_task *}@var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{cpuid}, unsigned @var{nimpl}, double @var{measured})
|
|
|
-This feeds the performance model @var{model} with an explicit measurement
|
|
|
-@var{measured}, in addition to measurements done by StarPU itself. This can be
|
|
|
-useful when the application already has an existing set of measurements done
|
|
|
-in good conditions, that StarPU could benefit from instead of doing on-line
|
|
|
-measurements. An example of use can be seen in @ref{Performance model example}.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
@node Profiling API
|
|
|
@section Profiling API
|
|
|
|
|
@@ -2345,23 +2106,6 @@ Calls starpu_cuda_report_error, passing the current function, file and line
|
|
|
position.
|
|
|
@end defmac
|
|
|
|
|
|
-@deftypefun int starpu_cuda_copy_async_sync ({void *}@var{src_ptr}, unsigned @var{src_node}, {void *}@var{dst_ptr}, unsigned @var{dst_node}, size_t @var{ssize}, cudaStream_t @var{stream}, {enum cudaMemcpyKind} @var{kind})
|
|
|
-Copy @var{ssize} bytes from the pointer @var{src_ptr} on
|
|
|
-@var{src_node} to the pointer @var{dst_ptr} on @var{dst_node}.
|
|
|
-The function first tries to copy the data asynchronously (unless
|
|
|
-@var{stream} is @code{NULL}). If the asynchronous copy fails or if
|
|
|
-@var{stream} is @code{NULL}, it copies the data synchronously.
|
|
|
-The function returns @code{-EAGAIN} if the asynchronous copy was
|
|
|
-successful. It returns 0 if the synchronous copy was successful, or
|
|
|
-fails otherwise.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun void starpu_cuda_set_device (int @var{devid})
|
|
|
-Calls @code{cudaSetDevice(devid)} or @code{cudaGLSetGLDevice(devid)}, according to
|
|
|
-whether @code{devid} is among the @code{cuda_opengl_interoperability} field of
|
|
|
-the @code{starpu_conf} structure.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
@deftypefun void starpu_helper_cublas_init (void)
|
|
|
This function initializes CUBLAS on every CUDA device.
|
|
|
The CUBLAS library must be initialized prior to any CUBLAS call. Calling
|
|
@@ -2431,23 +2175,8 @@ Return the computation kernel command queue of the current worker.
|
|
|
Sets the arguments of a given kernel. The list of arguments must be given as
|
|
|
(size_t @var{size_of_the_argument}, cl_mem * @var{pointer_to_the_argument}).
|
|
|
The last argument must be 0. Returns the number of arguments that were
|
|
|
-successfully set. In case of failure, returns the id of the argument
|
|
|
-that could not be set and @var{err} is set to the error returned by
|
|
|
-OpenCL. Otherwise, returns the number of arguments that were set.
|
|
|
-
|
|
|
-@cartouche
|
|
|
-@smallexample
|
|
|
-int n;
|
|
|
-cl_int err;
|
|
|
-cl_kernel kernel;
|
|
|
-n = starpu_opencl_set_kernel_args(&err, 2, &kernel,
|
|
|
- sizeof(foo), &foo,
|
|
|
- sizeof(bar), &bar,
|
|
|
- 0);
|
|
|
-if (n != 2)
|
|
|
- fprintf(stderr, "Error : %d\n", err);
|
|
|
-@end smallexample
|
|
|
-@end cartouche
|
|
|
+successfully set. In case of failure, @var{err} is set to the error returned by
|
|
|
+OpenCL.
|
|
|
@end deftypefun
|
|
|
|
|
|
@node Compiling OpenCL kernels
|
|
@@ -2483,43 +2212,6 @@ This function compiles an OpenCL source code stored in a string.
|
|
|
This function unloads an OpenCL compiled code.
|
|
|
@end deftypefun
|
|
|
|
|
|
-@deftypefun void starpu_opencl_load_program_source ({const char *}@var{source_file_name}, char *@var{located_file_name}, char *@var{located_dir_name}, char *@var{opencl_program_source})
|
|
|
-Store the contents of the file @var{source_file_name} in the buffer
|
|
|
-@var{opencl_program_source}. The file @var{source_file_name} can be
|
|
|
-located in the current directory, or in the directory specified by the
|
|
|
-environment variable @code{STARPU_OPENCL_PROGRAM_DIR}, or in the
|
|
|
-directory @code{share/starpu/opencl} of the installation directory of
|
|
|
-StarPU, or in the source directory of StarPU.
|
|
|
-When the file is found, @code{located_file_name} is the full name of
|
|
|
-the file as it has been located on the system, @code{located_dir_name}
|
|
|
-the directory where it has been located. Otherwise, they are both set
|
|
|
-to the empty string.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun int starpu_opencl_compile_opencl_from_file ({const char *}@var{source_file_name}, {const char*} @var{build_options})
|
|
|
-Compile the OpenCL kernel stored in the file @code{source_file_name}
|
|
|
-with the given options @code{build_options} and stores the result in
|
|
|
-the directory @code{$STARPU_HOME/.starpu/opencl} with the same
|
|
|
-filename as @code{source_file_name}. The compilation is done for every
|
|
|
-OpenCL device, and the filename is suffixed with the vendor id and the
|
|
|
-device id of the OpenCL device.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun int starpu_opencl_compile_opencl_from_string ({const char *}@var{opencl_program_source}, {const char *}@var{file_name}, {const char* }@var{build_options})
|
|
|
-Compile the OpenCL kernel in the string @code{opencl_program_source}
|
|
|
-with the given options @code{build_options} and stores the result in
|
|
|
-the directory @code{$STARPU_HOME/.starpu/opencl} with the filename
|
|
|
-@code{file_name}. The compilation is done for every
|
|
|
-OpenCL device, and the filename is suffixed with the vendor id and the
|
|
|
-device id of the OpenCL device.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun int starpu_opencl_load_binary_opencl ({const char *}@var{kernel_id}, {struct starpu_opencl_program *}@var{opencl_programs})
|
|
|
-Compile the binary OpenCL kernel identified with @var{kernel_id}. For every
|
|
|
-OpenCL device, the binary OpenCL kernel will be loaded from the file
|
|
|
-@code{$STARPU_HOME/.starpu/opencl/<kernel_id>.<device_type>.vendor_id_<vendor_id>_device_id_<device_id>}.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
@node Loading OpenCL kernels
|
|
|
@subsection Loading OpenCL kernels
|
|
|
|
|
@@ -2546,11 +2238,6 @@ collect statistics about the kernel execution (used cycles, consumed power).
|
|
|
@node OpenCL utilities
|
|
|
@subsection OpenCL utilities
|
|
|
|
|
|
-@deftypefun {const char *} starpu_opencl_error_string (cl_int @var{status})
|
|
|
-Return the error message in English corresponding to @var{status}, an
|
|
|
-OpenCL error code.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
@deftypefun void starpu_opencl_display_error ({const char *}@var{func}, {const char *}@var{file}, int @var{line}, {const char *}@var{msg}, cl_int @var{status})
|
|
|
Given a valid error @var{status}, prints the corresponding error message on
|
|
|
stdout, along with the given function name @var{func}, the given filename
|