@@ -10,6 +10,7 @@ @c -*-texinfo-*-
* Installing a Binary Package::
* Installing from Source::
* Setting up Your Own Code::
+* Benchmarking StarPU::
@end menu

@node Installing a Binary Package
@@ -287,3 +288,88 @@ so:
@example
$ STARPU_NCUDA=2 ./application
@end example
+
+@node Benchmarking StarPU
+@section Benchmarking StarPU
+
+Some interesting benchmarks are installed among the examples in
+@code{$prefix_dir/lib/starpu/examples/}. Make sure to try the various
+schedulers, for instance @code{STARPU_SCHED=dmda}.
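+
+For example, to run the @code{sgemm} benchmark described below with the
+@code{dmda} scheduler, one would use something along the lines of:
+
+@example
+$ STARPU_SCHED=dmda ./sgemm
+@end example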
+
+@menu
+* Task size overhead::
+* Data transfer latency::
+* Gemm::
+* Cholesky::
+* LU::
+@end menu
+
+@node Task size overhead
+@subsection Task size overhead
+
+This benchmark gives a glimpse into how big a task size needs to be for the
+StarPU overhead to be low enough. Run @code{tasks_size_overhead.sh}; it
+generates a plot of the speedup obtained with tasks of various sizes,
+depending on the number of CPUs being used.
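+
+A sketch of a typical invocation, assuming the script is run from the
+installed examples directory mentioned above:
+
+@example
+$ ./tasks_size_overhead.sh
+@end example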
+
+@node Data transfer latency
+@subsection Data transfer latency
+
+@code{local_pingpong} performs a ping-pong between the first two CUDA nodes
+and prints the measured latency.
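+
+For instance (assuming a machine with at least two CUDA devices):
+
+@example
+$ ./local_pingpong
+@end example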
+
+@node Gemm
+@subsection Matrix-matrix multiplication
+
+@code{sgemm} and @code{dgemm} perform a blocked matrix-matrix
+multiplication using BLAS and cuBLAS. They report the obtained GFlop/s.
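+
+For instance, one might compare a CPU-only run with a run using one GPU,
+reusing the @code{STARPU_NCUDA} variable shown earlier (a sketch):
+
+@example
+$ STARPU_NCUDA=0 ./dgemm    # CPUs only
+$ STARPU_NCUDA=1 ./dgemm    # one CUDA device enabled
+@end example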
+
+@node Cholesky
+@subsection Cholesky factorization
+
+The @code{cholesky*} programs perform a Cholesky factorization in single precision; the variants differ in the dependency primitives they use.
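+
+For instance (a sketch; exact program names may vary between versions):
+
+@example
+$ ./cholesky_implicit
+@end example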
+
+@node LU
+@subsection LU factorization
+
+The @code{lu*} programs perform an LU factorization; the variants differ in the dependency primitives they use.
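+
+Again a sketch; @code{lu_example_float} is one plausible variant name:
+
+@example
+$ ./lu_example_float
+@end example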