
mlr: completing R analysis and adapting examples (also fixing old paje_summary mistake)

Luka Stanisic 8 years ago
parent
commit
87e578f7c0

+ 65 - 24
examples/basic_examples/mlr.c

@@ -16,20 +16,30 @@
  */
 
 /*
- * This examples demonstrates how to use multiple linear regression models.
-
-   The duration of the task test_mlr will
-   be computed using the following equation:
+ * This example demonstrates how to use multiple linear regression
+   models.
+
+   First, there is the cl_model_init codelet, for which we know the
+   parameters but not their exponents and relations. This task should
+   be benchmarked and analyzed to find the model, using the
+   "tools/starpu_mlr_analysis" script as a template. Until the model
+   is defined by the application developer, a default model is
+   computed automatically. This default model is a simple constant
+   (thus making the STARPU_MULTIPLE_REGRESSION_BASED model equal to
+   the history-based model).
+
+   For the second codelet (cl_model_final), it is assumed that the
+   analysis has already been performed and that the duration of the
+   task test_mlr will be computed using the following equation:
 
    T = a + b * (M^2*N) + c * (N^3*K)
 
-   where M, N, K are the parameters of the task,
-   exponents are coming from cl.model->combinations[..][..] 
-   and finally a, b, c are coefficients
-   which mostly depend on the machine speed. 
+   where M, N, K are the parameters of the task, the exponents come
+   from model->combinations[..][..], and finally a, b, c are
+   coefficients which mostly depend on the machine speed.
    
-   These coefficients are going to be automatically computed	
-   using least square method.
+   These coefficients are going to be automatically computed using
+   the least squares method.
 
  */
 
@@ -69,21 +79,37 @@ void cpu_func(void *buffers[], void *cl_arg)
 /* ############################################ */
 /* Start of the part specific to multiple linear regression perfmodels */
 
-/* Defining perfmodel, number of parameters and their names  */
-
-/* Defining the equation for modeling duration of the task */
-/* Refer to the explanation and equation on the top of this file
-   to get more detailed explanation, here we have M^2*N and N^3*K */
+/* Defining the perfmodel, the number of parameters and their names.
+   Initially, the application developer only knows these parameters.
+   The execution of this codelet will generate traces that can be
+   analyzed, using "tools/starpu_mlr_analysis" as a template, to
+   obtain the parameter combinations and exponents.
+ */
 
 static const char * parameters_names[]	= {	"M",	"N",	"K", };
+
+static struct starpu_perfmodel cl_model_init = {
+	.type = STARPU_MULTIPLE_REGRESSION_BASED,
+	.symbol = "mlr_init",
+	.parameters = cl_params,
+	.nparameters = 3,
+	.parameters_names = parameters_names,
+};
+
+/* Defining the equation for modeling the duration of the task. The
+   parameter combinations and exponents are computed offline, for
+   example using the "tools/starpu_mlr_analysis" tool as a
+   template.
+ */
+
 static unsigned combi1 [3]		= {	2,	1,	0 };
 static unsigned combi2 [3]		= {	0,	3,	1 };
 
 static unsigned *combinations[] = { combi1, combi2 };
 
-static struct starpu_perfmodel cl_model = {
+static struct starpu_perfmodel cl_model_final = {
 	.type = STARPU_MULTIPLE_REGRESSION_BASED,
-	.symbol = "test_mlr",
+	.symbol = "mlr_final",
 	.parameters = cl_params,
 	.nparameters = 3,
 	.parameters_names = parameters_names,
@@ -91,16 +117,24 @@ static struct starpu_perfmodel cl_model = {
 	.combinations = combinations,
 };
 
-static struct starpu_codelet cl = {
+/* End of the part specific to multiple linear regression perfmodels */
+/* ############################################ */
+
+static struct starpu_codelet cl_init = {
 	.cpu_funcs = { cpu_func },
-	.cpu_funcs_name = { "mlr_codelet" },
+	.cpu_funcs_name = { "mlr_codelet_init" },
 	.nbuffers = 0,
-	.model = &cl_model,
+	.model = &cl_model_init,
 };
 
-/* End of the part specific to multiple linear regression perfmodels */
-/* ############################################ */
-	
+static struct starpu_codelet cl_final = {
+	.cpu_funcs = { cpu_func },
+	.cpu_funcs_name = { "mlr_codelet_final" },
+	.nbuffers = 0,
+	.model = &cl_model_final,
+};
+
+
 int main(int argc, char **argv)
 {
 	/* Initialization */
@@ -121,11 +155,18 @@ int main(int argc, char **argv)
 		k = (double) ((rand() % 10)+1);
 		
 		for(j=0; j < 42; j++)
-			starpu_insert_task(&cl,
+		{
+			starpu_insert_task(&cl_init,
+				   STARPU_VALUE, &m, sizeof(double),
+				   STARPU_VALUE, &n, sizeof(double),
+				   STARPU_VALUE, &k, sizeof(double),
+				   0);
+			starpu_insert_task(&cl_final,
 				   STARPU_VALUE, &m, sizeof(double),
 				   STARPU_VALUE, &n, sizeof(double),
 				   STARPU_VALUE, &k, sizeof(double),
 				   0);
+		}
 	}
 			  
 	starpu_shutdown();
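
The hunks above reference cl_params and cpu_func without showing their bodies. As an aside for readers following the example from this diff alone, here is a minimal, hypothetical sketch of how those two pieces typically look; the exact prototype of the .parameters callback and the unpacking calls are assumptions based on the StarPU API, and the file in the repository is authoritative.

```c
#include <starpu.h>

/* Assumed callback: copies the task's scalar arguments into the
 * parameters array, in the same order as parameters_names
 * ("M", "N", "K"), so that the multiple-regression perfmodel can
 * record them for each execution. */
static void cl_params(struct starpu_task *task, double *parameters)
{
	double m, n, k;

	starpu_codelet_unpack_args(task->cl_arg, &m, &n, &k);
	parameters[0] = m;
	parameters[1] = n;
	parameters[2] = k;
}

/* Kernel body (signature taken from the hunk header above): unpacks
 * the same arguments; the only requirement for the example is that its
 * duration depends on M, N and K. */
void cpu_func(void *buffers[], void *cl_arg)
{
	double m, n, k;

	(void)buffers;
	starpu_codelet_unpack_args(cl_arg, &m, &n, &k);
	/* ... computation whose cost follows M^2*N and N^3*K ... */
}
```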

+ 6 - 1
tools/Makefile.am

@@ -135,6 +135,7 @@ EXTRA_DIST =				\
 	dev/rename.sed			\
 	dev/rename.sh			\
 	perfmodels/README		\
+	perfmodels/sampling/codelets/tmp/mlr_init.out	 \
 	valgrind/hwloc.suppr		\
 	valgrind/libc.suppr		\
 	valgrind/libgomp.suppr		\
@@ -155,7 +156,7 @@ EXTRA_DIST =				\
 	msvc/starpu.sln			\
 	msvc/starpu/starpu.vcxproj
 
-CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log
+CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log figure/* mlr_*
 
 #####################################
 # What to install and what to check #
@@ -246,6 +247,10 @@ dist_bin_SCRIPTS +=			\
 	starpu_codelet_profile		\
 	starpu_paje_draw_histogram	\
 	starpu_paje_draw_histogram.R	\
+	starpu_paje_summary		\
+	starpu_paje_summary.Rmd		\
+	starpu_mlr_analysis		\
+	starpu_mlr_analysis.Rmd		\
 	starpu_paje_state_stats		\
 	starpu_trace_state_stats.py
 

File diff suppressed because it is too large
+ 1765 - 0
tools/perfmodels/sampling/codelets/tmp/mlr_init.out


+ 57 - 33
tools/starpu_mlr_analysis.Rmd

@@ -56,13 +56,22 @@ print_codelet <- function(reg,codelet){
 df<-read.csv(input_trace, header=TRUE)
 ```
 
-# Introduction
+# Multiple Linear Regression Model Example
 
-TODO
+## Introduction
+
+This document demonstrates the type of analysis needed to compute the
+multiple linear regression model of a task. It relies on input data
+benchmarked by StarPU (or by any other tool, as long as it follows the
+same format). The input data used in this example is generated by the
+task "mlr_init" from "examples/basic_examples/mlr.c".
+
+This document can be used as a template for the analysis of any other
+task.
 
 ### How to compile
 
-    ./starpu_mlr_analysis .starpu/sampling/codelets/tmp/test_mlr.out
+    ./starpu_mlr_analysis .starpu/sampling/codelets/tmp/mlr_init.out
 
 ### Software dependencies
 
@@ -70,13 +79,16 @@ In order to run the analysis you need to have R installed:
 
     sudo apt-get install r-base 
 
-In order to compile this document, you need *knitr*. However, you can perfectly use the R code from this document without knitr in your own scripts. If you decided that you want to generate this document, then start R (e.g., from terminal) and install knitr package:
+In order to compile this document, you need *knitr* (although you can
+perfectly well use the R code from this document without knitr). If
+you decide to generate this document, start R (e.g., from a terminal)
+and install the knitr package:
 
     R> install.packages("knitr")
 
 No additional R packages are needed.
 
-# First glimpse
+## First glimpse at the data
 
 First, we show the relations between all parameters in a single plot.
 
@@ -87,12 +99,12 @@ plot(df)
 For this example, all three parameters M, N, K have some influence,
 but their relation is not easy to understand.
 
-In general, this type of plots can typically show if there is a group
-of parameters which are mutually perfectly correlated, in which case
-only a one parameter from the group should be kept for the further
-analysis. Additionally, plot can show the parameters that have a
-constant value, and since these cannot have an influence on the model,
-they should also be ignored.
+In general, this type of plot can typically show if there are
+outliers. It can also show if there is a group of parameters which are
+mutually perfectly correlated, in which case only one parameter from
+the group should be kept for further analysis. Additionally, the plot
+can show parameters that have a constant value; since these cannot
+have an influence on the model, they should also be ignored.
 
 However, making conclusions based solely on the visual analysis can be
 treacherous and it is better to rely on the statistical tools. The
@@ -102,7 +114,7 @@ parameters. Therefore, this initial visual look should only be used to
 get a basic idea about the model, but all the parameters should be
 kept for now.
 
-# Initial model
+## Initial model
 
 At this point, an initial model is computed, using all the parameters,
 but not taking into account their exponents or the relations between
@@ -127,14 +139,13 @@ are not common to the multiple linear regression analysis and R tools,
 we suggest to the R documentation. Some explanations are also provided
 in the following article https://hal.inria.fr/hal-01180272.
        
-In this example, all parameters M, N, K are all very
-important. However, it is not clear if there are some relations
-between them or if some of these parameters should be used with an
-exponent. Moreover, adjusted R^2 value is not extremelly high and we
-hope we can get a better one. Thus, we proceed to the more advanced
-analysis.
+In this example, all parameters M, N, K are very important. However,
+it is not clear if there are some relations between them or if some of
+these parameters should be used with an exponent. Moreover, the
+adjusted R^2 value is not extremely high and we hope we can get a
+better one. Thus, we proceed to the more advanced analysis.
 
-# Refining the model
+## Refining the model
 
 Now, we can seek for the relations between the parameters. Note that
 trying all the possible combinations for the cases with a huge number
@@ -148,9 +159,8 @@ model2 <- lm(data=df, Duration ~ M*N*K)
 summary(model2)
 ```
 
-This model is more accurate, as the R^2 value increased. Now when some
-relations are observed, we can try some of these parameters with the
-exponents.
+This model is more accurate, as the R^2 value increased. We can also
+try some of these parameters with exponents.
 
 ```{r Model3}
 model3 <- lm(data=df, Duration ~ I(M^2)+I(M^3)+I(N^2)+I(N^3)+I(K^2)+I(K^3))
@@ -158,17 +168,19 @@ summary(model3)
 ```
 
 It seems like some parameters are important. Now we combine these and
-try to find the optimal combination.
+try to find the optimal combination (here we go directly to the final
+solution, although this process typically takes several iterations of
+trying different combinations).
 
 ```{r Model4}
 model4 <- lm(data=df, Duration ~ I(M^2):N+I(N^3):K)
 summary(model4)
 ```
 
-Depending on the machine characteristics and the variability of
-benchmarks, this may be the best model.
+This seems to be the most accurate model, with a high R^2 value. We
+can proceed to its validation.
 
-# Validation
+## Validation
 
 Once the model has been computed, we should validate it. Apart from
 the low adjusted R^2 value, the model weakness can also be observed
@@ -189,7 +201,7 @@ which is typical for a single experiment run with a homogeneous
 data. The fact that there is some variability is common, as executing
 exactly the same code on a real machine will always have slightly
 different duration. However, having a huge variability means that the
-benchmarks were very noisy, thus having an accurate models from them
+benchmarks were very noisy, thus deriving an accurate model from them
 will be hard.
 
 Plot on the right may show that the residuals do not follow the normal
@@ -198,19 +210,31 @@ predictive power.
 
 If we are not satisfied with the accuracy of the observed models, we
 should go back to the previous section and try to find a better
-one. In some cases, the benchmarked data will just be too noisy and
-they should be redesigned and run again.
+one. In some cases, the benchmarked data is just too noisy or the
+choice of parameters is not appropriate, and thus the experiments
+should be redesigned and rerun.
 
 When we are finally satisfied with the model accuracy, we should
 modify our task code, so that StarPU knows which parameters
 combinations are used in the model.
 
-# Generating C code
+## Generating C code
 
-This is a simple helper to generate C code which should be copied to
-the task description in your application. Make sure that the generated
-code correctly corresponds to computed model.
+Depending on the way the task codelet is programmed, this section may
+be somewhat useful. This is a simple helper that generates C code for
+the parameter combinations, which should be copied to the task
+description in the application. The function generating the code is
+not very robust, so make sure that the generated code correctly
+corresponds to the computed model (for example, parameters are
+considered in alphabetical order).
 
 ```{r Code}
 print_codelet(model4, "mlr_cl")
 ```
+
+## Conclusion
+
+We have computed the model for our benchmarked data using multiple
+linear regression. After encoding this model into the task code,
+StarPU will be able to automatically compute the coefficients and use
+the model to predict task duration.
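
To spell out the correspondence between the R analysis above and the C example changed in this commit (it is implied by the two files rather than stated in either), the final model model4, `Duration ~ I(M^2):N + I(N^3):K`, is exactly the equation used in mlr.c:

```latex
% model4: Duration ~ I(M^2):N + I(N^3):K, i.e.
\[ T \;=\; a \;+\; b \, M^{2} N \;+\; c \, N^{3} K \]
```

Over the parameter order (M, N, K), the two terms are encoded in mlr.c as the exponent combinations {2, 1, 0} and {0, 3, 1}; the coefficients a, b, c are the values StarPU fits at runtime using least squares.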

+ 24 - 13
tools/starpu_mlr_analysis.in

@@ -2,8 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2014  Université Joseph Fourier
-# Copyright (C) 2014-2015  Université Bordeaux
+# Copyright (C) 2016  Inria
 # 
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -21,9 +20,10 @@
 set -e # fail fast
 
 # File names
-basename="$PWD"
+SOURCE_DIR=@abs_srcdir@
+
 outputfile="mlr_analysis.html"
-analysis_script="$(dirname $(which $0))/starpu_mlr_analysis.Rmd"
+analysis_script="$SOURCE_DIR/starpu_mlr_analysis.Rmd"
 
 # Command line arguments
 inputfile=""
@@ -31,13 +31,14 @@ inputfile=""
 help_script()
 {
 cat << EOF
-Give statistical analysis of the paje trace
+Give an example of trace analysis for computing a multiple linear regression model
 
 Options:
    -h      Show this message
 
 Examples:
 $0 .starpu/sampling/codelets/tmp/test_mlr.out
+$0 
 
 Report bugs to <@PACKAGE_BUGREPORT@>
 EOF
@@ -48,13 +49,13 @@ if [ "$1" = "--version" ] ; then
     exit 0
 fi
 
-if [ "$1" = "-h" ] || [ "$1" = "--help" ] || [ "$1" = "" ] ; then
+if [ "$1" = "-h" ] || [ "$1" = "--help" ] ; then
     help_script
     exit 0
 fi
 
 while getopts "h" opt; do
-  case $opt in
+    case $opt in
     \?)
       echo "Invalid option: -$OPTARG"
       help_script
@@ -66,12 +67,22 @@ done
 # Reading files that need to be analyzed
 shift $((OPTIND - 1))
 inputfile=$1
-# Error if there is no input files specified
-# if [[ $# != 1]]; then
-#     echo "Error!"
-#     help_script
-#     exit 2
-# fi
+if [[ $# -lt 1 ]]; then
+    inputfile="$SOURCE_DIR/perfmodels/sampling/codelets/tmp/mlr_init.out"
+else
+    # Error if there is more than one input file
+    if [[ $# -gt 1 ]]; then
+	echo "Error!"
+	help_script
+	exit 2
+    fi
+fi
+
+if [ ! -s "$inputfile" ]
+then
+	echo "Error: file $inputfile does not exist!"
+	exit 5
+fi
 
 #####################################
 # Running analysis file to get actual results

+ 3 - 2
tools/starpu_paje_summary.in

@@ -21,9 +21,10 @@
 set -e # fail fast
 
 # File names
-basename="$PWD"
+SOURCE_DIR=@abs_srcdir@
+
 outputfile="summary.html"
-analysis_script="$(dirname $(which $0))/starpu_paje_summary.Rmd"
+analysis_script="$SOURCE_DIR/starpu_paje_summary.Rmd"
 analysis_input=""
 
 # Command line arguments