
mlr: completing R analysis and adapting examples (also fixing old paje_summary mistake)

Luka Stanisic 8 years ago
commit 87e578f7c0

+ 65 - 24
examples/basic_examples/mlr.c

@@ -16,20 +16,30 @@
  */
 
 /*
- * This examples demonstrates how to use multiple linear regression models.
-
-   The duration of the task test_mlr will
-   be computed using the following equation:
+ * This example demonstrates how to use multiple linear regression
+   models.
+
+   First, there is the cl_model_init codelet, for which we know the
+   parameters but not their exponents and relations. This task should
+   be benchmarked and analyzed to find the model, using the
+   "tools/starpu_mlr_analysis" script as a template. Before the model
+   is defined by the application developer, a default model is
+   automatically computed. This default model is a simple constant
+   (thus making the STARPU_MULTIPLE_REGRESSION_BASED model equivalent
+   to the history-based model).
+
+   For the second codelet (whose perfmodel is cl_model_final), it is
+   assumed that the analysis has already been performed and that the
+   duration of the task test_mlr will be computed using the following
+   equation:
 
    T = a + b * (M^2*N) + c * (N^3*K)
 
-   where M, N, K are the parameters of the task,
-   exponents are coming from cl.model->combinations[..][..] 
-   and finally a, b, c are coefficients
-   which mostly depend on the machine speed. 
+   where M, N, K are the parameters of the task, the exponents come
+   from model->combinations[..][..], and a, b, c are coefficients
+   which mostly depend on the machine speed.
    
-   These coefficients are going to be automatically computed	
-   using least square method.
+   These coefficients are going to be automatically computed using
+   the least squares method.
 
  */
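
A minimal R sketch (not part of this commit) of the least-squares fit that the comment above describes, assuming a data frame `df` with columns `Duration`, `M`, `N`, `K` as read by the analysis script added later in this commit:

```r
# Fit T = a + b*(M^2*N) + c*(N^3*K) by ordinary least squares.
model <- lm(Duration ~ I(M^2 * N) + I(N^3 * K), data = df)
summary(model)  # intercept ~ a, the two term coefficients ~ b and c
```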
 
@@ -69,21 +79,37 @@ void cpu_func(void *buffers[], void *cl_arg)
 /* ############################################ */
 /* Start of the part specific to multiple linear regression perfmodels */
 
-/* Defining perfmodel, number of parameters and their names  */
-
-/* Defining the equation for modeling duration of the task */
-/* Refer to the explanation and equation on the top of this file
-   to get more detailed explanation, here we have M^2*N and N^3*K */
+/* Defining the perfmodel, the number of parameters and their
+   names. Initially, the application developer only knows these
+   parameters. The execution of this codelet will generate traces
+   that can be analyzed using "tools/starpu_mlr_analysis" as a
+   template to obtain the parameter combinations and exponents.
+ */
 
 static const char * parameters_names[]	= {	"M",	"N",	"K", };
+
+static struct starpu_perfmodel cl_model_init = {
+	.type = STARPU_MULTIPLE_REGRESSION_BASED,
+	.symbol = "mlr_init",
+	.parameters = cl_params,
+	.nparameters = 3,
+	.parameters_names = parameters_names,
+};
+
+/* Defining the equation for modeling the duration of the task. The
+   parameter combinations and exponents are computed offline, for
+   example using the "tools/starpu_mlr_analysis" tool as a template.
+   Each row of combinations[][] holds the exponents of (M, N, K) for
+   one term: combi1 = {2, 1, 0} encodes M^2*N and combi2 = {0, 3, 1}
+   encodes N^3*K.
+ */
+
 static unsigned combi1 [3]		= {	2,	1,	0 };
 static unsigned combi2 [3]		= {	0,	3,	1 };
 
 static unsigned *combinations[] = { combi1, combi2 };
 
-static struct starpu_perfmodel cl_model = {
+static struct starpu_perfmodel cl_model_final = {
 	.type = STARPU_MULTIPLE_REGRESSION_BASED,
-	.symbol = "test_mlr",
+	.symbol = "mlr_final",
 	.parameters = cl_params,
 	.nparameters = 3,
 	.parameters_names = parameters_names,
@@ -91,16 +117,24 @@ static struct starpu_perfmodel cl_model = {
 	.combinations = combinations,
 };
 
-static struct starpu_codelet cl = {
+/* End of the part specific to multiple linear regression perfmodels */
+/* ############################################ */
+
+static struct starpu_codelet cl_init = {
 	.cpu_funcs = { cpu_func },
-	.cpu_funcs_name = { "mlr_codelet" },
+	.cpu_funcs_name = { "mlr_codelet_init" },
 	.nbuffers = 0,
-	.model = &cl_model,
+	.model = &cl_model_init,
 };
 
-/* End of the part specific to multiple linear regression perfmodels */
-/* ############################################ */
-	
+static struct starpu_codelet cl_final = {
+	.cpu_funcs = { cpu_func },
+	.cpu_funcs_name = { "mlr_codelet_final" },
+	.nbuffers = 0,
+	.model = &cl_model_final,
+};
+
+
 int main(int argc, char **argv)
 {
 	/* Initialization */
@@ -121,11 +155,18 @@ int main(int argc, char **argv)
 		k = (double) ((rand() % 10)+1);
 		
 		for(j=0; j < 42; j++)
-			starpu_insert_task(&cl,
+		{
+			starpu_insert_task(&cl_init,
+				   STARPU_VALUE, &m, sizeof(double),
+				   STARPU_VALUE, &n, sizeof(double),
+				   STARPU_VALUE, &k, sizeof(double),
+				   0);
+			starpu_insert_task(&cl_final,
 				   STARPU_VALUE, &m, sizeof(double),
 				   STARPU_VALUE, &n, sizeof(double),
 				   STARPU_VALUE, &k, sizeof(double),
 				   0);
+		}
 	}
 			  
 	starpu_shutdown();

+ 6 - 1
tools/Makefile.am

@@ -135,6 +135,7 @@ EXTRA_DIST =				\
 	dev/rename.sed			\
 	dev/rename.sh			\
 	perfmodels/README		\
+	perfmodels/sampling/codelets/tmp/mlr_init.out	 \
 	valgrind/hwloc.suppr		\
 	valgrind/libc.suppr		\
 	valgrind/libgomp.suppr		\
@@ -155,7 +156,7 @@ EXTRA_DIST =				\
 	msvc/starpu.sln			\
 	msvc/starpu/starpu.vcxproj
 
-CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log
+CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log figure/* mlr_*
 
 #####################################
 # What to install and what to check #
@@ -246,6 +247,10 @@ dist_bin_SCRIPTS +=			\
 	starpu_codelet_profile		\
 	starpu_paje_draw_histogram	\
 	starpu_paje_draw_histogram.R	\
+	starpu_paje_summary		\
+	starpu_paje_summary.Rmd		\
+	starpu_mlr_analysis		\
+	starpu_mlr_analysis.Rmd		\
 	starpu_paje_state_stats		\
 	starpu_trace_state_stats.py
 

File diff suppressed because it is too large
+ 1765 - 0
tools/perfmodels/sampling/codelets/tmp/mlr_init.out


+ 57 - 33
tools/starpu_mlr_analysis.Rmd

@@ -56,13 +56,22 @@ print_codelet <- function(reg,codelet){
 df<-read.csv(input_trace, header=TRUE)
 ```
 
-# Introduction
+# Multiple Linear Regression Model Example
 
-TODO
+## Introduction
+
+This document demonstrates the type of analysis needed to compute
+the multiple linear regression model of a task. It relies on input
+data benchmarked by StarPU (or any other tool, as long as it follows
+the same format). The input data used in this example is generated by
+the task "mlr_init" from "examples/basic_examples/mlr.c".
+
+This document can be used as a template for the analysis of any other
+task.
 
 ### How to compile
 
-    ./starpu_mlr_analysis .starpu/sampling/codelets/tmp/test_mlr.out
+    ./starpu_mlr_analysis .starpu/sampling/codelets/tmp/mlr_init.out
 
 ### Software dependencies
 
@@ -70,13 +79,16 @@ In order to run the analysis you need to have R installed:
 
     sudo apt-get install r-base 
 
-In order to compile this document, you need *knitr*. However, you can perfectly use the R code from this document without knitr in your own scripts. If you decided that you want to generate this document, then start R (e.g., from terminal) and install knitr package:
+In order to compile this document, you need *knitr* (although you can
+also use the R code from this document without knitr). If you decide
+that you want to generate this document, then start R (e.g., from a
+terminal) and install the knitr package:
 
     R> install.packages("knitr")
 
 No additional R packages are needed.
 
-# First glimpse
+## First glimpse at the data
 
 First, we show the relations between all parameters in a single plot.
 
@@ -87,12 +99,12 @@ plot(df)
 For this example, all three parameters M, N, K have some influence,
 but their relation is not easy to understand.
 
-In general, this type of plots can typically show if there is a group
-of parameters which are mutually perfectly correlated, in which case
-only a one parameter from the group should be kept for the further
-analysis. Additionally, plot can show the parameters that have a
-constant value, and since these cannot have an influence on the model,
-they should also be ignored.
+In general, this type of plot can typically show if there are
+outliers. It can also show if there is a group of parameters which are
+mutually perfectly correlated, in which case only one parameter from
+the group should be kept for further analysis. Additionally, the plot
+can show parameters that have a constant value, and since these
+cannot have an influence on the model, they should also be ignored.
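
A minimal sketch (not part of this commit) of how these checks can be made numerically rather than visually, assuming the same data frame `df` used by the script:

```r
cor(df[, c("M", "N", "K")])   # off-diagonal values near +/-1 flag redundant parameters
sapply(df[, c("M", "N", "K")], function(x) length(unique(x)))  # a result of 1 flags a constant parameter
```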
 
 However, making conclusions based solely on the visual analysis can be
 treacherous and it is better to rely on the statistical tools. The
@@ -102,7 +114,7 @@ parameters. Therefore, this initial visual look should only be used to
 get a basic idea about the model, but all the parameters should be
 kept for now.
 
-# Initial model
+## Initial model
 
 At this point, an initial model is computed, using all the parameters,
 but not taking into account their exponents or the relations between
@@ -127,14 +139,13 @@ are not common to the multiple linear regression analysis and R tools,
 we suggest to the R documentation. Some explanations are also provided
 in the following article https://hal.inria.fr/hal-01180272.
        
-In this example, all parameters M, N, K are all very
-important. However, it is not clear if there are some relations
-between them or if some of these parameters should be used with an
-exponent. Moreover, adjusted R^2 value is not extremelly high and we
-hope we can get a better one. Thus, we proceed to the more advanced
-analysis.
+In this example, the parameters M, N, K are all very important.
+However, it is not clear if there are relations between them or if
+some of these parameters should be used with an exponent. Moreover,
+the adjusted R^2 value is not extremely high and we hope we can get a
+better one. Thus, we proceed to a more advanced analysis.
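
For reference, a minimal sketch of what such an initial fit looks like (the exact chunk is outside this diff, so the formula below is assumed):

```r
model1 <- lm(data = df, Duration ~ M + N + K)  # all parameters, no exponents or relations
summary(model1)
```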
 
-# Refining the model
+## Refining the model
 
 Now, we can seek for the relations between the parameters. Note that
 trying all the possible combinations for the cases with a huge number
@@ -148,9 +159,8 @@ model2 <- lm(data=df, Duration ~ M*N*K)
 summary(model2)
 ```
 
-This model is more accurate, as the R^2 value increased. Now when some
-relations are observed, we can try some of these parameters with the
-exponents.
+This model is more accurate, as the R^2 value increased. We can also
+try some of these parameters with exponents.
 
 ```{r Model3}
 model3 <- lm(data=df, Duration ~ I(M^2)+I(M^3)+I(N^2)+I(N^3)+I(K^2)+I(K^3))
@@ -158,17 +168,19 @@ summary(model3)
 ```
 
 It seems like some parameters are important. Now we combine these and
-try to find the optimal combination.
+try to find the optimal combination (here we go directly to the final
+solution, although this process typically takes several iterations of
+trying different combinations).
 
 ```{r Model4}
 model4 <- lm(data=df, Duration ~ I(M^2):N+I(N^3):K)
 summary(model4)
 ```
 
-Depending on the machine characteristics and the variability of
-benchmarks, this may be the best model.
+This seems to be the most accurate model, with a high R^2 value. We
+can proceed to its validation.
 
-# Validation
+## Validation
 
 Once the model has been computed, we should validate it. Apart from
 the low adjusted R^2 value, the model weakness can also be observed
@@ -189,7 +201,7 @@ which is typical for a single experiment run with a homogeneous
 data. The fact that there is some variability is common, as executing
 exactly the same code on a real machine will always have slightly
 different duration. However, having a huge variability means that the
-benchmarks were very noisy, thus having an accurate models from them
+benchmarks were very noisy, thus deriving accurate models from them
 will be hard.
 
 Plot on the right may show that the residuals do not follow the normal
@@ -198,19 +210,31 @@ predictive power.
 
 If we are not satisfied with the accuracy of the observed models, we
 should go back to the previous section and try to find a better
-one. In some cases, the benchmarked data will just be too noisy and
-they should be redesigned and run again.
+one. In some cases, the benchmarked data is just too noisy or the
+choice of parameters is not appropriate, and thus the experiments
+should be redesigned and rerun.
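
A minimal sketch (not part of this commit; the actual plotting chunk is outside this diff) of the two diagnostic plots this section refers to, assuming the standard `plot.lm` diagnostics:

```r
par(mfrow = c(1, 2))
plot(model4, which = 1)  # residuals vs fitted values ("plot on the left")
plot(model4, which = 2)  # normal Q-Q plot of the residuals ("plot on the right")
```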
 
 When we are finally satisfied with the model accuracy, we should
 modify our task code, so that StarPU knows which parameters
 combinations are used in the model.
 
-# Generating C code
+## Generating C code
 
-This is a simple helper to generate C code which should be copied to
-the task description in your application. Make sure that the generated
-code correctly corresponds to computed model.
+Depending on the way the task codelet is programmed, this section may
+be useful. This is a simple helper to generate the C code for the
+parameter combinations, which should be copied into the task
+description in the application. The function generating the code is
+not very robust, so make sure that the generated code correctly
+corresponds to the computed model (for example, parameters are
+considered in alphabetical order).
 
 ```{r Code}
 print_codelet(model4, "mlr_cl")
 ```
+
+## Conclusion
+
+We have computed the model for our benchmarked data using multiple
+linear regression. After encoding this model into the task code,
+StarPU will be able to automatically compute the coefficients and use
+the model to predict task duration.

+ 24 - 13
tools/starpu_mlr_analysis.in

@@ -2,8 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2014  Université Joseph Fourier
-# Copyright (C) 2014-2015  Université Bordeaux
+# Copyright (C) 2016  Inria
 # 
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -21,9 +20,10 @@
 set -e # fail fast
 
 # File names
-basename="$PWD"
+SOURCE_DIR=@abs_srcdir@
+
 outputfile="mlr_analysis.html"
-analysis_script="$(dirname $(which $0))/starpu_mlr_analysis.Rmd"
+analysis_script="$SOURCE_DIR/starpu_mlr_analysis.Rmd"
 
 # Command line arguments
 inputfile=""
@@ -31,13 +31,14 @@ inputfile=""
 help_script()
 {
 cat << EOF
-Give statistical analysis of the paje trace
+Give an example of the trace analysis for computing a multiple linear regression model
 
 Options:
    -h      Show this message
 
 Examples:
 $0 .starpu/sampling/codelets/tmp/test_mlr.out
+$0 
 
 Report bugs to <@PACKAGE_BUGREPORT@>
 EOF
@@ -48,13 +49,13 @@ if [ "$1" = "--version" ] ; then
     exit 0
 fi
 
-if [ "$1" = "-h" ] || [ "$1" = "--help" ] || [ "$1" = "" ] ; then
+if [ "$1" = "-h" ] || [ "$1" = "--help" ] ; then
     help_script
     exit 0
 fi
 
 while getopts "h" opt; do
-  case $opt in
+    case $opt in
     \?)
       echo "Invalid option: -$OPTARG"
       help_script
@@ -66,12 +67,22 @@ done
 # Reading files that need to be analyzed
 shift $((OPTIND - 1))
 inputfile=$1
-# Error if there is no input files specified
-# if [[ $# != 1]]; then
-#     echo "Error!"
-#     help_script
-#     exit 2
-# fi
+if [[ $# -lt 1 ]]; then
+    inputfile="$SOURCE_DIR/perfmodels/sampling/codelets/tmp/mlr_init.out"
+else
+    # Error if there is more than one input file
+    if [[ $# -gt 1 ]]; then
+	echo "Error!"
+	help_script
+	exit 2
+    fi
+fi
+
+if [ ! -s "$inputfile" ]
+then
+	echo "Error: file $inputfile does not exist or is empty!"
+	exit 5
+fi
 
 #####################################
 # Running analysis file to get actual results

+ 3 - 2
tools/starpu_paje_summary.in

@@ -21,9 +21,10 @@
 set -e # fail fast
 
 # File names
-basename="$PWD"
+SOURCE_DIR=@abs_srcdir@
+
 outputfile="summary.html"
-analysis_script="$(dirname $(which $0))/starpu_paje_summary.Rmd"
+analysis_script="$SOURCE_DIR/starpu_paje_summary.Rmd"
 analysis_input=""
 
 # Command line arguments