Browse Source

mlr: started R analysis for finding the right parameters combination

Luka Stanisic 8 years ago
parent
commit
3e4ac00ff8
3 changed files with 298 additions and 0 deletions
  1. 2 0
      configure.ac
  2. 216 0
      tools/starpu_mlr_analysis.Rmd
  3. 80 0
      tools/starpu_mlr_analysis.in

+ 2 - 0
configure.ac

@@ -2863,6 +2863,7 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tools/starpu_paje_draw_histogram
   chmod +x tools/starpu_paje_state_stats
   chmod +x tools/starpu_paje_summary
+  chmod +x tools/starpu_mlr_analysis
   chmod +x tools/starpu_paje_sort
   chmod +x tools/starpu_smpirun
   chmod +x doc/doxygen/doxygen_filter.sh
@@ -2920,6 +2921,7 @@ AC_OUTPUT([
 	tools/starpu_paje_draw_histogram
 	tools/starpu_paje_state_stats
 	tools/starpu_paje_summary
+	tools/starpu_mlr_analysis
 	tools/starpu_paje_sort
 	tools/starpu_smpirun
 	socl/Makefile

+ 216 - 0
tools/starpu_mlr_analysis.Rmd

@@ -0,0 +1,216 @@
+
+```{r Setup, echo=FALSE}
+opts_chunk$set(echo=FALSE)
+```
+
+```{r Load_R_files_and_functions}
+# Print, on standard output, the C code that declares the parameter
+# combinations of a fitted linear model for a StarPU codelet.
+#
+# reg:     model fitted with lm(); each term is expected to be a product
+#          (":") of plain variables and/or I(<variable>^<exponent>)
+# codelet: name of the C codelet variable used as prefix of every line
+#
+# The generated combinations[c][p] entry is the exponent of parameter p
+# (parameters sorted alphabetically) in term c of the model.
+print_codelet <- function(reg,codelet){
+   # The header is a single C comment: it is opened on the second line and
+   # only closed after the adjusted R-squared value
+   cat(paste("/* ############################################ */", "\n"))
+   cat(paste("/*\t Automatically generated code", "\n"))
+   cat(paste("\t Check for potential errors and be sure parameter value are written in good order (alphabetical one by default)", "\n"))
+   cat(paste("\t Adjusted R-squared: ", summary(reg)$adj.r.squared, "*/\n\n"))
+
+   # One combination per estimated coefficient, the intercept excluded
+   ncomb <- reg$rank - 1
+   cat(paste0("\t ", codelet, ".model->ncombinations = ", ncomb, ";\n"))
+
+   cat(paste0("\t ", codelet, ".model->combinations = (unsigned **) malloc(", codelet, ".model->ncombinations*sizeof(unsigned *))", ";\n\n"))
+
+   cat(paste0("\t if (", codelet, ".model->combinations)", "\n", "\t {\n"))
+   cat(paste0("\t   for (unsigned i = 0; i < ", codelet, ".model->ncombinations; i++)", "\n", "\t   {\n"))
+   cat(paste0("\t     ", codelet, ".model->combinations[i] = (unsigned *) malloc(", codelet, ".model->nparameters*sizeof(unsigned))", ";\n", "\t   }\n", "\t }\n\n"))
+
+   # Computing combinations
+   # The "factors" matrix has one row per variable and one column per term.
+   # A non-zero entry (1 or 2, depending on the contrasts coding) means the
+   # variable takes part in the term, so it is reduced to a 0/1 indicator.
+   factors <- attr(reg$terms, "factors")
+   df <- data.frame((factors > 0) * 1)
+   df$Params <- row.names(df)
+   # The first row is the response variable, not a parameter
+   df <- df[-1, , drop = FALSE]
+
+   # Rows named I(<param>^<exponent>) are scaled by the exponent and renamed
+   # to the bare parameter. The pattern is anchored so that a parameter whose
+   # name merely contains the letter "I" is left untouched.
+   power_re <- "^I\\((.*?)\\^(.*?)\\)$"
+   is_term <- names(df) != "Params"
+   for (i in seq_len(nrow(df)))
+   {
+     name <- df$Params[i]
+     if (grepl(power_re, name, perl = TRUE))
+     {
+        exponent <- suppressWarnings(as.numeric(sub(power_re, "\\2", name, perl = TRUE)))
+        if (is.na(exponent))
+           warning("print_codelet: cannot read a numeric exponent in '", name, "'", call. = FALSE)
+        df[i, is_term] <- df[i, is_term] * exponent
+        df$Params[i] <- sub(power_re, "\\1", name, perl = TRUE)
+     }
+   }
+   # Sum the exponents of rows referring to the same parameter; aggregate()
+   # also orders the result alphabetically by parameter name
+   df <- aggregate(. ~ Params, df, sum)
+
+   # Column 1 holds the parameter names, columns 2.. hold one term each
+   for (j in seq_len(ncol(df))[-1])
+   {
+     for (i in seq_len(nrow(df)))
+     {
+       cat(paste0("\t ", codelet, ".model->combinations[", j-2, "][", i-1, "] = ", as.numeric(df[i,j]), ";\n"))
+     }
+   }
+
+   cat(paste("/* ############################################ */", "\n"))
+}
+
+df<-read.csv(input_trace, header=TRUE)
+```
+
+# Introduction
+
+TODO
+
+### How to compile
+
+    ./starpu_mlr_analysis .starpu/sampling/codelets/tmp/test_mlr.out
+
+### Software dependencies
+
+In order to run the analysis you need to have R installed:
+
+    sudo apt-get install r-base 
+
+In order to compile this document, you need *knitr*. However, you can perfectly well use the R code from this document in your own scripts without knitr. If you want to generate this document, start R (e.g., from a terminal) and install the knitr package:
+
+    R> install.packages("knitr")
+
+No additional R packages are needed.
+
+# First glimpse
+
+First, we show the relations between all parameters in a single plot.
+
+```{r InitPlot}
+plot(df)
+```
+
+For this example, all three parameters M, N, K have some influence,
+but their relation is not easy to understand.
+
+In general, this type of plot can show whether there is a group
+of parameters which are mutually perfectly correlated, in which case
+only one parameter from the group should be kept for further
+analysis. Additionally, the plot can show the parameters that have a
+constant value, and since these cannot have an influence on the model,
+they should also be ignored.
+
+However, making conclusions based solely on the visual analysis can be
+treacherous and it is better to rely on the statistical tools. The
+multiple linear regression methods used in the following sections will
+also be able to detect and ignore these irrelevant
+parameters. Therefore, this initial visual look should only be used to
+get a basic idea about the model, but all the parameters should be
+kept for now.
+
+# Initial model
+
+At this point, an initial model is computed, using all the parameters,
+but not taking into account their exponents or the relations between
+them.
+
+```{r Model1}
+model1 <- lm(data=df, Duration ~ M+N+K)
+summary(model1)
+```
+
+For each parameter and the constant in the first column, an estimation
+of the corresponding coefficient is provided along with the 95%
+confidence interval. If there are any parameters with NA value, which
+suggests that the parameters are correlated to another parameter or
+that their value is constant, these parameters should not be used in
+the following model computations. The stars in the last column
+indicate the significance of each parameter. However, having maximum
+three stars for each parameter does not necessarily mean that the
+model is perfect and we should always inspect the adjusted R^2 value
+(the closer it is to 1, the better the model is). Users who are not
+familiar with multiple linear regression analysis and the R tools
+should consult the R documentation. Some explanations are also provided
+in the following article: https://hal.inria.fr/hal-01180272.
+       
+In this example, the parameters M, N and K are all very
+important. However, it is not clear if there are some relations
+between them or if some of these parameters should be used with an
+exponent. Moreover, the adjusted R^2 value is not extremely high and we
+hope we can get a better one. Thus, we proceed to a more advanced
+analysis.
+
+# Refining the model
+
+Now, we can look for relations between the parameters. Note that
+trying all the possible combinations for the cases with a huge number
+of parameters can take prohibitively long. Thus, it may be better to first
+get rid of the parameters which seem to have very small influence
+(typically the ones with no stars from the table in the previous
+section).
+
+```{r Model2}
+model2 <- lm(data=df, Duration ~ M*N*K)
+summary(model2)
+```
+
+This model is more accurate, as the R^2 value increased. Now that some
+relations have been observed, we can try some of these parameters with
+exponents.
+
+```{r Model3}
+model3 <- lm(data=df, Duration ~ I(M^2)+I(M^3)+I(N^2)+I(N^3)+I(K^2)+I(K^3))
+summary(model3)
+```
+
+It seems like some parameters are important. Now we combine these and
+try to find the optimal combination.
+
+```{r Model4}
+model4 <- lm(data=df, Duration ~ I(M^2):N+I(N^3):K)
+summary(model4)
+```
+
+Depending on the machine characteristics and the variability of
+benchmarks, this may be the best model.
+
+# Validation
+
+Once the model has been computed, we should validate it. Apart from
+a low adjusted R^2 value, the weaknesses of the model can be observed
+even better by inspecting the residuals. The results in the two
+following plots (and thus the accuracy of the model) will greatly
+depend on the variability of the measurements and the design of experiments.
+
+```{r Validation}
+par(mfrow=c(1,2))
+plot(model4, which=c(1:2))
+```
+
+Generally speaking, if there are some structures on the left plot,
+this can indicate that there are certain phenomena not explained by
+the model. Many points on the same horizontal line represent
+repetitive occurrences of the task with the same parameter values,
+which is typical for a single experiment run with a homogeneous
+data. The fact that there is some variability is common, as executing
+exactly the same code on a real machine will always have slightly
+different duration. However, having a huge variability means that the
+benchmarks were very noisy, thus deriving accurate models from them
+will be hard.
+
+The plot on the right may show that the residuals do not follow the normal
+distribution. Therefore, such a model would overall have limited
+predictive power.
+
+If we are not satisfied with the accuracy of the observed models, we
+should go back to the previous section and try to find a better
+one. In some cases, the benchmark data will just be too noisy, and
+the benchmarks should be redesigned and run again.
+
+When we are finally satisfied with the model accuracy, we should
+modify our task code, so that StarPU knows which parameter
+combinations are used in the model.
+
+# Generating C code
+
+This is a simple helper to generate C code which should be copied to
+the task description in your application. Make sure that the generated
+code correctly corresponds to the computed model.
+
+```{r Code}
+print_codelet(model4, "mlr_cl")
+```

+ 80 - 0
tools/starpu_mlr_analysis.in

@@ -0,0 +1,80 @@
+#!/bin/bash
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+# 
+# Copyright (C) 2014  Université Joseph Fourier
+# Copyright (C) 2014-2015  Université Bordeaux
+# 
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+# 
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# 
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Script for giving statistical analysis of the paje trace
+
+set -e # fail fast
+
+# File names
+basename="$PWD" # NOTE(review): not referenced anywhere else in this script
+outputfile="mlr_analysis.html"
+# The R Markdown report is looked up in the directory of this script; the
+# nested quoting keeps paths containing spaces in one piece
+analysis_script="$(dirname "$(which "$0")")/starpu_mlr_analysis.Rmd"
+
+# Command line arguments
+inputfile=""
+
+# Print the usage message on standard output.
+help_script()
+{
+cat << EOF
+Give statistical analysis of a multiple linear regression (mlr) trace
+
+Usage: $0 [-h] <input_file>
+
+Options:
+   -h      Show this message
+
+Examples:
+$0 .starpu/sampling/codelets/tmp/test_mlr.out
+
+Report bugs to <@PACKAGE_BUGREPORT@>
+EOF
+}
+
+# Name the script was invoked as, reported by --version
+PROGNAME=$0
+
+if [ "$1" = "--version" ] ; then
+    echo "$PROGNAME (@PACKAGE_NAME@) @PACKAGE_VERSION@"
+    exit 0
+fi
+
+# Calling the script without any argument is treated as a request for help
+if [ "$1" = "-h" ] || [ "$1" = "--help" ] || [ "$1" = "" ] ; then
+    help_script
+    exit 0
+fi
+
+# -h has already been handled above, so only unknown options are left to
+# reject. The leading ':' selects the silent mode of getopts, which is what
+# stores the offending option character in OPTARG.
+while getopts ":h" opt; do
+  case $opt in
+    \?)
+      echo "Invalid option: -$OPTARG"
+      help_script
+      exit 3
+      ;;
+  esac
+done
+
+# Reading files that need to be analyzed
+shift $((OPTIND - 1))
+inputfile=$1
+# Error if there is no input file specified
+if [ -z "$inputfile" ]; then
+    echo "Error: no input file specified"
+    help_script
+    exit 2
+fi
+
+#####################################
+# Running analysis file to get actual results
+# The input path is pasted into the R source between single quotes, so a
+# path that itself contains a single quote is not supported.
+Rscript -e "library(knitr); input_trace = '$inputfile' ; outputhtml='$outputfile';\
+            outputRmd = gsub('.html\$','.Rmd',outputhtml);\
+            knit('$analysis_script',output=outputRmd); knitr::knit2html(outputRmd)"