Corentin Salingue лет назад: 9
Родитель
Сommit
0dbf966aef
100 измененных файлов с 22841 добавлено и 64 удалено
  1. 1 0
      AUTHORS
  2. 2 0
      ChangeLog
  3. 8 1
      Makefile.am
  4. 96 18
      configure.ac
  5. 3 1
      doc/doxygen/Makefile.am
  6. 43 1
      doc/doxygen/chapters/370_online_performance_tools.doxy
  7. 96 0
      doc/doxygen/chapters/390_faq.doxy
  8. 11 0
      doc/doxygen/chapters/450_native_fortran_support.doxy
  9. 2 1
      doc/doxygen/chapters/470_simgrid.doxy
  10. 30 1
      doc/doxygen/chapters/api/data_management.doxy
  11. 27 4
      doc/doxygen/chapters/api/performance_model.doxy
  12. 2 2
      doc/tutorial/hello_world.c
  13. 20 0
      examples/Makefile.am
  14. 37 0
      examples/cpp/Makefile.add_vectors
  15. 37 0
      examples/cpp/Makefile.add_vectors_cpp11
  16. 148 0
      examples/cpp/add_vectors.cpp
  17. 155 0
      examples/cpp/add_vectors_cpp11.cpp
  18. 1 0
      examples/heat/dw_factolu_grain.c
  19. 172 0
      examples/mlr/mlr.c
  20. 3 2
      examples/ppm_downscaler/yuv_downscaler.c
  21. 4 1
      examples/sched_ctx/parallel_tasks_reuse_handle.c
  22. 4 0
      examples/scheduler/dummy_sched.c
  23. 1 1
      examples/scheduler/heteroprio_test.c
  24. 7 6
      examples/spmv/dw_block_spmv.c
  25. 7 7
      examples/spmv/matrix_market/mmio.c
  26. 1 0
      examples/spmv/matrix_market/mmio.h
  27. 29 2
      examples/stencil/Makefile.am
  28. 4 0
      examples/stencil/README
  29. 444 0
      examples/stencil/implicit-stencil-blocks.c
  30. 769 0
      examples/stencil/implicit-stencil-kernels.c
  31. 204 0
      examples/stencil/implicit-stencil-tasks.c
  32. 387 0
      examples/stencil/implicit-stencil.c
  33. 157 0
      examples/stencil/implicit-stencil.h
  34. 1 1
      examples/stencil/stencil-blocks.c
  35. 5 5
      examples/worker_collections/worker_tree_example.c
  36. 1 1
      include/starpu.h
  37. 2 0
      include/starpu_config.h.in
  38. 4 0
      include/starpu_data.h
  39. 17 1
      include/starpu_perfmodel.h
  40. 4 0
      include/starpu_profiling.h
  41. 3 3
      include/starpu_util.h
  42. 562 0
      m4/ax_cxx_compile_stdcxx.m4
  43. 25 5
      mic-configure
  44. 30 0
      min-dgels/Makefile.in
  45. 160 0
      min-dgels/additional/blaswrap.h
  46. 7262 0
      min-dgels/additional/clapack.h
  47. 21 0
      min-dgels/additional/d_lg10.c
  48. 18 0
      min-dgels/additional/d_sign.c
  49. 107 0
      min-dgels/additional/dcopy.c
  50. 157 0
      min-dgels/additional/dgelq2.c
  51. 251 0
      min-dgels/additional/dgelqf.c
  52. 515 0
      min-dgels/additional/dgels.c
  53. 389 0
      min-dgels/additional/dgemm.c
  54. 312 0
      min-dgels/additional/dgemv.c
  55. 161 0
      min-dgels/additional/dgeqr2.c
  56. 252 0
      min-dgels/additional/dgeqrf.c
  57. 194 0
      min-dgels/additional/dger.c
  58. 52 0
      min-dgels/additional/disnan.c
  59. 72 0
      min-dgels/additional/dlabad.c
  60. 58 0
      min-dgels/additional/dlaisnan.c
  61. 1001 0
      min-dgels/additional/dlamch.c
  62. 199 0
      min-dgels/additional/dlange.c
  63. 73 0
      min-dgels/additional/dlapy2.c
  64. 193 0
      min-dgels/additional/dlarf.c
  65. 774 0
      min-dgels/additional/dlarfb.c
  66. 170 0
      min-dgels/additional/dlarfg.c
  67. 192 0
      min-dgels/additional/dlarfp.c
  68. 325 0
      min-dgels/additional/dlarft.c
  69. 354 0
      min-dgels/additional/dlascl.c
  70. 152 0
      min-dgels/additional/dlaset.c
  71. 116 0
      min-dgels/additional/dlassq.c
  72. 95 0
      min-dgels/additional/dnrm2.c
  73. 235 0
      min-dgels/additional/dorm2r.c
  74. 231 0
      min-dgels/additional/dorml2.c
  75. 334 0
      min-dgels/additional/dormlq.c
  76. 327 0
      min-dgels/additional/dormqr.c
  77. 96 0
      min-dgels/additional/dscal.c
  78. 453 0
      min-dgels/additional/dtrmm.c
  79. 345 0
      min-dgels/additional/dtrmv.c
  80. 490 0
      min-dgels/additional/dtrsm.c
  81. 183 0
      min-dgels/additional/dtrtrs.c
  82. 223 0
      min-dgels/additional/f2c.h
  83. 141 0
      min-dgels/additional/fio.h
  84. 530 0
      min-dgels/additional/fmt.c
  85. 105 0
      min-dgels/additional/fmt.h
  86. 166 0
      min-dgels/additional/ieeeck.c
  87. 88 0
      min-dgels/additional/iladlc.c
  88. 90 0
      min-dgels/additional/iladlr.c
  89. 654 0
      min-dgels/additional/ilaenv.c
  90. 282 0
      min-dgels/additional/iparmq.c
  91. 117 0
      min-dgels/additional/lsame.c
  92. 8 0
      min-dgels/additional/mindgels.h
  93. 41 0
      min-dgels/additional/pow_di.c
  94. 86 0
      min-dgels/additional/s_cat.c
  95. 66 0
      min-dgels/additional/sysdep1.h
  96. 78 0
      min-dgels/additional/wsfe.c
  97. 65 0
      min-dgels/additional/xerbla.c
  98. 115 0
      min-dgels/base/BLAS/SRC/Makefile
  99. 101 0
      min-dgels/base/BLAS/SRC/dasum.c
  100. 0 0
      min-dgels/base/BLAS/SRC/daxpy.c

+ 1 - 0
AUTHORS

@@ -4,6 +4,7 @@ Olivier Aumage <olivier.aumage@inria.fr>
 William Braik <wbraik@gmail.com>
 Berenger Bramas <berenger.bramas@inria.fr>
 Alfredo Buttari <alfredo.buttari@enseeiht.fr>
+Adrien Cassagne <adrien.cassagne@inria.fr>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
 Nicolas Collin <nicolas.collin@inria.fr>
 Ludovic Courtès <ludovic.courtes@inria.fr>

+ 2 - 0
ChangeLog

@@ -22,6 +22,7 @@ New features:
   * New scheduler with heterogeneous priorities
   * Support priorities for data transfers.
   * Add support for Ayudame version 2.x debugging library.
+  * Add support for multiple linear regression performance models
 
 Small features:
   * Scheduling contexts may now be associated a user data pointer at creation
@@ -40,6 +41,7 @@ New features:
   * Add starpu_tasks_rec_complete tool to add estimation times in tasks.rec
     files.
   * Add STARPU_FXT_TRACE environment variable.
+  * Add starpu_data_set_user_data and starpu_data_get_user_data.
 
 StarPU 1.2.0 (svn revision 18521)
 ==============================================

+ 8 - 1
Makefile.am

@@ -19,7 +19,14 @@
 ACLOCAL_AMFLAGS=-I m4
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
-SUBDIRS = src
+SUBDIRS = 
+
+if STARPU_USE_MIN_DGELS
+SUBDIRS += min-dgels
+endif
+
+SUBDIRS += src
+
 SUBDIRS += tools
 
 if BUILD_TESTS

+ 96 - 18
configure.ac

@@ -81,6 +81,14 @@ AC_PROG_FC
 AC_CHECK_PROGS(PROG_STAT,gstat stat)
 AC_CHECK_PROGS(PROG_DATE,gdate date)
 AC_OPENMP
+#c++11 detection
+AX_CXX_COMPILE_STDCXX(11,noext,optional)
+
+AC_SUBST([STARPU_HAVE_CXX11], [test "$HAVE_CXX11" -eq 1])
+AM_CONDITIONAL([STARPU_HAVE_CXX11], [test "$HAVE_CXX11" -eq 1])
+if test $HAVE_CXX11 -eq 1; then
+  AC_DEFINE(STARPU_HAVE_CXX11, [1], [compiler supports cxx11])
+fi
 
 if test x$enable_perf_debug = xyes; then
     enable_shared=no
@@ -581,6 +589,7 @@ AC_DEFUN([STARPU_CHECK_CUDA],
 	    AC_MSG_WARN(['nvcc' not found, disabling CUDA])
 	    have_valid_cuda=no
 	else
+	    # This is for very old cuda, to enable the use of double etc.
 	    AC_MSG_CHECKING(whether nvcc supports sm_13 architecture)
 	    OLD_NVCCFLAGS="$NVCCFLAGS"
 	    NVCCFLAGS="$NVCCFLAGS -arch sm_13"
@@ -593,6 +602,21 @@ AC_DEFUN([STARPU_CHECK_CUDA],
 		AC_MSG_RESULT(no)
 		NVCCFLAGS="$OLD_NVCCFLAGS"
 	    fi
+
+	    # This is for recent cuda, which complains if we don't actually set an arch!?
+	    AC_MSG_CHECKING(whether nvcc supports -Wno-deprecated-gpu-targets)
+	    OLD_NVCCFLAGS="$NVCCFLAGS"
+	    NVCCFLAGS="$NVCCFLAGS -Wno-deprecated-gpu-targets"
+	    echo "int main(int argc, char **argv) { return 0;}" > cuda_test.cu
+	    $NVCC $NVCCFLAGS -c cuda_test.cu >/dev/null 2>&1
+	    if test $? -eq 0
+	    then
+		AC_MSG_RESULT(yes)
+	    else
+		AC_MSG_RESULT(no)
+		NVCCFLAGS="$OLD_NVCCFLAGS"
+	    fi
+
 	    rm -f cuda_test*
 	fi
 
@@ -1147,6 +1171,46 @@ fi
 
 ###############################################################################
 #                                                                             #
+#			 Multiple linear regression			      #
+#                                                                             #
+###############################################################################
+AC_ARG_ENABLE(mlr, [AS_HELP_STRING([--disable-mlr],
+			[Disable multiple linear regression models])],
+			enable_mlr=$enableval, enable_mlr=yes)
+
+AC_MSG_CHECKING(whether multiple linear regression models are disabled)
+if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
+   	AC_MSG_RESULT(no)
+	install_min_dgels=no
+   	STARPU_SEARCH_LIBS(LAPACK,[dgels_],[lapack],use_system_lapack=yes,,)
+	if test x$use_system_lapack = xyes; then
+	        AC_DEFINE(STARPU_MLR_MODEL, [1], [use reflapack library])
+		LDFLAGS="-llapack $LDFLAGS"
+	else
+		AC_MSG_CHECKING(whether min-dgels is linked)
+		if test x"$DGELS_LIBS" != x; then
+		   	AC_MSG_RESULT(yes)
+        		AC_DEFINE(STARPU_MLR_MODEL, [1], [use user defined library])
+			AC_ARG_VAR([DGELS_LIBS], [linker flags for lapack dgels])
+		else
+			AC_MSG_RESULT(no)
+			AC_MSG_CHECKING(min-dgels source)
+			cp -r $srcdir/min-dgels $PWD/
+			AC_MSG_RESULT(yes)
+			DGELS_LIBS="-Wl,--start-group $STARPU_BUILD_DIR/min-dgels/build/*.a -Wl,--end-group"
+			AC_DEFINE(STARPU_MLR_MODEL, [1], [use user defined library])
+			AC_ARG_VAR([DGELS_LIBS], [linker flags for lapack dgels])
+			install_min_dgels=yes
+		fi
+	fi
+else
+ 	AC_MSG_RESULT(yes)
+	install_min_dgels=no
+fi
+AM_CONDITIONAL(STARPU_USE_MIN_DGELS, test x$install_min_dgels = xyes)
+
+###############################################################################
+#                                                                             #
 #                                 MIC settings                                #
 #                                                                             #
 ###############################################################################
@@ -1604,8 +1668,8 @@ if test x$use_fxt = xyes; then
 	LIBS="$save_LIBS"
 	save_CFLAGS="$CFLAGS"
 	CFLAGS="$CFLAGS $FXT_CFLAGS"
-	AC_CHECK_DECLS([enable_fut_flush])
-	AC_CHECK_DECLS([fut_set_filename])
+	AC_CHECK_DECLS([enable_fut_flush], [], [], [[#include <fut.h>]])
+	AC_CHECK_DECLS([fut_set_filename], [], [], [[#include <fut.h>]])
 	CFLAGS="$save_CFLAGS"
 
 	if test x$enable_simgrid = xyes -a x$enable_shared = xno ; then
@@ -2296,6 +2360,7 @@ AC_SUBST([pkglibdir])
 AC_ARG_ENABLE(fortran, [AS_HELP_STRING([--disable-fortran],
 			[disable build of fortran examples])],
 			enable_build_fortran=$enableval, enable_build_fortran=yes)
+use_mpi_fort=no
 if test "x$FC" != "x"; then
 	if $FC --version|grep -q 'GNU Fortran'; then
 		gfortran_fc_version=`$FC --version|head -1|sed 's/.*)//;s/^.*\([[0-9]][[0-9]]*\)\.\([[0-9]][[0-9]]*\)\.\([[0-9]][[0-9]]*\).*/\1.\2.\3/'`
@@ -2316,7 +2381,15 @@ if test "x$FC" != "x"; then
 				enable_build_fortran="no"
 			fi
 		else
-			AC_MSG_WARN(Fortran compiler has not been tested for StarPU native Fortran support)
+			if $FC -qversion 2>&1|grep -q 'IBM XL Fortran'; then
+				xlf_fc_version=`$FC -V 2>&1 |tail -1|sed 's/.*Version: //'`
+				xlf_maj_version=`echo $xlf_fc_version|cut -d. -f1`
+
+				AC_MSG_WARN([IBM Fortran compiler $xlf_fc_version not validated with the native StarPU Fortran API, Fortran examples will not be built])
+				enable_build_fortran="no"
+			else
+				AC_MSG_WARN(Fortran compiler has not been tested for StarPU native Fortran support)
+			fi
 		fi
 	fi
 	if test "x$enable_build_fortran" = "xyes" ; then
@@ -2935,7 +3008,7 @@ AM_CONDITIONAL(BUILD_DOC, [test x$enable_build_doc != xno])
 ###############################################################################
 
 # these are the flags needed for linking libstarpu (and thus also for static linking)
-LIBSTARPU_LDFLAGS="$HWLOC_LIBS $FXT_LIBS $STARPU_COI_LDFLAGS $STARPU_SCIF_LDFLAGS $STARPU_RCCE_LDFLAGS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LIBS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS"
+LIBSTARPU_LDFLAGS="$HWLOC_LIBS $FXT_LIBS $STARPU_COI_LDFLAGS $STARPU_SCIF_LDFLAGS $STARPU_RCCE_LDFLAGS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LIBS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS $DGELS_LIBS"
 AC_SUBST([LIBSTARPU_LDFLAGS])
 
 LIBSTARPU_LINK=libstarpu-$STARPU_EFFECTIVE_VERSION.la
@@ -2965,11 +3038,13 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tools/starpu_paje_draw_histogram
   chmod +x tools/starpu_paje_state_stats
   chmod +x tools/starpu_paje_summary
+  chmod +x tools/starpu_mlr_analysis
   chmod +x tools/starpu_paje_sort
   chmod +x tools/starpu_smpirun
   chmod +x doc/doxygen/doxygen_filter.sh
   mkdir -p tests/microbenchs
   test -e tests/microbenchs/tasks_size_overhead.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/tasks_size_overhead.sh tests/microbenchs/
+  test -e tests/microbenchs/tasks_size_overhead_sched.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/tasks_size_overhead_sched.sh tests/microbenchs/
   test -e tests/microbenchs/tasks_size_overhead_scheds.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/tasks_size_overhead_scheds.sh tests/microbenchs/
   test -e tests/microbenchs/tasks_size_overhead.gp || ln -sf $ac_abs_top_srcdir/tests/microbenchs/tasks_size_overhead.gp tests/microbenchs/
   test -e tests/microbenchs/microbench.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/microbench.sh tests/microbenchs/
@@ -3022,6 +3097,7 @@ AC_OUTPUT([
 	tools/starpu_paje_draw_histogram
 	tools/starpu_paje_state_stats
 	tools/starpu_paje_summary
+	tools/starpu_mlr_analysis
 	tools/starpu_paje_sort
 	tools/starpu_smpirun
 	socl/Makefile
@@ -3073,6 +3149,7 @@ AC_OUTPUT([
 	doc/doxygen/doxygen-config.cfg
 	doc/doxygen/doxygen_filter.sh
 	tools/msvc/starpu_var.bat
+	min-dgels/Makefile
 ])
 
 AC_MSG_NOTICE([
@@ -3112,20 +3189,21 @@ AC_MSG_NOTICE([
         Examples:          $enable_build_examples
 
 	StarPU Extensions:
-	       StarPU MPI enabled:                          $build_mpi_lib
-	       MPI test suite:                              $running_mpi_check
-	       Master-Slave MPI enabled:                    $use_mpi_master_slave
-	       FFT Support:                                 $fft_support
-	       GCC plug-in:                                 $build_gcc_plugin
-	       GCC plug-in test suite (requires GNU Guile): $run_gcc_plugin_test_suite
-	       OpenMP runtime support enabled:              $enable_openmp
-	       SOCL enabled:                                $build_socl
-               SOCL test suite:                             $run_socl_check
-               Scheduler Hypervisor:                        $build_sc_hypervisor
-               simgrid enabled:                             $enable_simgrid
-               ayudame enabled:                             $ayu_msg
-	       Native fortran support:                      $enable_build_fortran
-	       Native MPI fortran support:                  $use_mpi_fort
+	       StarPU MPI enabled:                            $build_mpi_lib
+	       MPI test suite:                                $running_mpi_check
+	       Master-Slave MPI enabled:                      $use_mpi_master_slave
+	       FFT Support:                                   $fft_support
+	       GCC plug-in:                                   $build_gcc_plugin
+	       GCC plug-in test suite (requires GNU Guile):   $run_gcc_plugin_test_suite
+	       OpenMP runtime support enabled:                $enable_openmp
+	       SOCL enabled:                                  $build_socl
+               SOCL test suite:                               $run_socl_check
+               Scheduler Hypervisor:                          $build_sc_hypervisor
+               simgrid enabled:                               $enable_simgrid
+               ayudame enabled:                               $ayu_msg
+	       Native fortran support:                        $enable_build_fortran
+	       Native MPI fortran support:                    $use_mpi_fort
+	       Support for multiple linear regression models: $install_min_dgels
 ])
 
 if test "$build_socl" = "yes" -a "$run_socl_check" = "no" ; then

+ 3 - 1
doc/doxygen/Makefile.am

@@ -162,6 +162,7 @@ chapters/version.html: $(chapters)
 		if test -f $$f ; then $(RM) $$f ; fi ;\
 	done
 
+if BUILD_DOC
 EXTRA_DIST	= 					\
 	$(chapters) 					\
 	chapters/version.sty				\
@@ -209,6 +210,7 @@ EXTRA_DIST	= 					\
 	doxygen.cfg 					\
 	refman.tex					\
 	$(DOX_HTML_DIR)
+endif
 
 dox_inputs = $(DOX_CONFIG) 				\
 	$(chapters) 					\
@@ -277,7 +279,7 @@ $(DOX_TAG): $(dox_inputs)
 	@cat $(top_srcdir)/doc/doxygen/refman.tex >> $(DOX_LATEX_DIR)/refman.tex
 
 if BUILD_DOC
-dist_pdf_DATA = $(DOX_PDF)
+EXTRA_DIST += $(DOX_PDF)
 
 $(DOX_PDF): $(DOX_TAG) refman.tex
 	@cp $(top_srcdir)/doc/doxygen/chapters/version.sty $(DOX_LATEX_DIR)

+ 43 - 1
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -2,7 +2,7 @@
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
- * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2011, 2012, 2016 INRIA
  * See the file version.doxy for copying conditions.
  */
 
@@ -397,6 +397,48 @@ performance model to perform scheduling, without using regression.
 </li>
 
 <li>
+
+Another type of model is ::STARPU_MULTIPLE_REGRESSION_BASED, which
+is based on multiple linear regression. In this model, the user
+defines both the relevant parameters and the equation for computing the
+task duration.
+
+
+\f[
+T_{kernel} = a + b(M^{\alpha_1} * N^{\beta_1} * K^{\gamma_1}) + c(M^{\alpha_2} * N^{\beta_2} * K^{\gamma_2}) + ...
+\f]
+
+
+\f$M, N, K\f$ are the parameters of the task, added at the task
+creation. These need to be extracted by the <c>cl_perf_func</c>
+function, which should be defined by the user. \f$\alpha, \beta,
+\gamma\f$ are the exponents defined by the user in
+<c>model->combinations</c> table. Finally, coefficients \f$a, b, c\f$
+are computed automatically by the StarPU at the end of the execution, using least
+squares method of the <c>dgels_</c> LAPACK function.
+
+<c>examples/mlr/mlr.c</c> example provides more details on
+the usage of ::STARPU_MULTIPLE_REGRESSION_BASED models.
+
+Coefficients computation is done at the end of the execution, and the
+results are stored in standard codelet perfmodel files. Additional
+files containing the duration of task together with the value of each
+parameter are stored in <c>.starpu/sampling/codelets/tmp/</c>
+directory. These files are reused when \ref STARPU_CALIBRATE
+environment variable is set to <c>1</c>, to recompute coefficients
+based on the current, but also on the previous
+executions. Additionally, when multiple linear regression models are
+disabled (using "--disable-mlr" configuration option) or when the
+<c>model->combinations</c> are not defined, StarPU will still write
+output files into <c>.starpu/sampling/codelets/tmp/</c> to allow
+performing an analysis. This analysis typically aims at finding the
+most appropriate equation for the codelet and
+<c>tools/starpu_mlr_analysis</c> script provides an example of how to
+perform such study.
+
+</li>
+
+<li>
 Provided as an estimation from the application itself (model type
 ::STARPU_COMMON and field starpu_perfmodel::cost_function),
 see for instance

+ 96 - 0
doc/doxygen/chapters/390_faq.doxy

@@ -226,4 +226,100 @@ starpu_resume();
 starpu_shutdown();
 \endcode
 
+\section CUDADrivers StarPU does not see my CUDA device
+
+First make sure that CUDA is properly running outside StarPU: build and
+run the following program with -lcudart:
+
+\code{.c}
+#include <stdio.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+int main(void) {
+	int n, i, version;
+	cudaError_t err;
+
+	err = cudaGetDeviceCount(&n);
+	if (err)
+	{
+		fprintf(stderr,"cuda error %d\n", err);
+		exit(1);
+	}
+	cudaDriverGetVersion(&version);
+	printf("driver version %d\n", version);
+	cudaRuntimeGetVersion(&version);
+	printf("runtime version %d\n", version);
+	printf("\n");
+
+	for (i = 0; i < n; i++) {
+		struct cudaDeviceProp props;
+		printf("CUDA%d\n", i);
+		err = cudaGetDeviceProperties(&props, i);
+		if (err)
+		{
+			fprintf(stderr,"cuda error %d\n", err);
+			continue;
+		}
+		printf("%s\n", props.name);
+		printf("%0.3f GB\n", (float) props.totalGlobalMem / (1<<30));
+		printf("%u MP\n", props.multiProcessorCount);
+		printf("\n");
+	}
+	return 0;
+}
+\endcode
+
+If that program does not find your device, the problem is not at the StarPU
+level, but the CUDA drivers, check the documentation of your CUDA
+setup.
+
+\section OpenCLDrivers StarPU does not see my OpenCL device
+
+First make sure that OpenCL is properly running outside StarPU: build and
+run the following program with -lOpenCL:
+
+\code{.c}
+#include <CL/cl.h>
+#include <stdio.h>
+#include <assert.h>
+
+int main(void) {
+    cl_device_id did[16];
+    cl_int err;
+    cl_platform_id pid, pids[16];
+    cl_uint nbplat, nb;
+    char buf[128];
+    size_t size;
+    int i, j;
+
+    err = clGetPlatformIDs(sizeof(pids)/sizeof(pids[0]), pids, &nbplat);
+    assert(err == CL_SUCCESS);
+    printf("%u platforms\n", nbplat);
+    for (j = 0; j < nbplat; j++) {
+        pid = pids[j];
+        printf("    platform %d\n", j);
+        err = clGetPlatformInfo(pid, CL_PLATFORM_VERSION, sizeof(buf)-1, buf, &size);
+        assert(err == CL_SUCCESS);
+        buf[size] = 0;
+        printf("        platform version %s\n", buf);
+
+        err = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, sizeof(did)/sizeof(did[0]), did, &nb);
+        assert(err == CL_SUCCESS);
+        printf("%d devices\n", nb);
+        for (i = 0; i < nb; i++) {
+            err = clGetDeviceInfo(did[i], CL_DEVICE_VERSION, sizeof(buf)-1, buf, &size);
+            buf[size] = 0;
+            printf("    device %d version %s\n", i, buf);
+        }
+    }
+
+    return 0;
+}
+\endcode
+
+If that program does not find your device, the problem is not at the StarPU
+level, but the OpenCL drivers, check the documentation of your OpenCL
+implementation.
+
 */

+ 11 - 0
doc/doxygen/chapters/450_native_fortran_support.doxy

@@ -30,6 +30,9 @@ standard. It has currently been tested successfully with GNU GFortran 4.9,
 GFortran 5.x, GFortran 6.x and the Intel Fortran Compiler >= 2016. It is known
 not to work with GNU GFortran < 4.9, Intel Fortran Compiler < 2016.
 
+See Section \ref OldFortran on information on how to write StarPU
+Fortran code with older compilers.
+
 \subsection Configuration Configuration
 
 The Native Fortran API is enabled and its companion
@@ -209,6 +212,14 @@ with StarPU using C marshalling functions as exemplified in StarPU's
 <c>examples/fortran/</c> and <c>examples/fortran90/</c> example
 directories, though the process will be less convenient.
 
+Basically, the main FORTRAN code calls some C wrapper functions to
+submit tasks to StarPU. Then, when StarPU starts a task, another C
+wrapper function calls the FORTRAN routine for the task.
+
+Note that this marshalled FORTRAN support remains available even
+when specifying configure option \ref disable-fortran "--disable-fortran"
+(which only disables StarPU's native Fortran layer).
+
 \subsection APIMIX Valid API Mixes and Language Mixes
 
 Mixing uses of

+ 2 - 1
doc/doxygen/chapters/470_simgrid.doxy

@@ -61,7 +61,8 @@ run several times before the model is calibrated.
 \section Simulation Simulation
 
 Then, recompile StarPU, passing \ref enable-simgrid "--enable-simgrid"
-to <c>./configure</c>.
+to <c>./configure</c>. Make sure to keep all other <c>./configure</c> options
+the same, and notably options such as <c>--enable-maxcudadev</c>.
 
 \verbatim
 $ ./configure --enable-simgrid

+ 30 - 1
doc/doxygen/chapters/api/data_management.doxy

@@ -243,6 +243,17 @@ done with the codelet \p redux_cl.
 \ingroup API_Data_Management
 todo
 
+\fn void starpu_data_set_user_data(starpu_data_handle_t handle, void* user_data)
+\ingroup API_Data_Management
+This sets the "user_data" field for the \p handle to \p user_data . It can
+then be retrieved with starpu_data_get_user_data. \p user_data can be any
+application-defined value, for instance a pointer to an object-oriented
+container for the data.
+
+\fn void *starpu_data_get_user_data(starpu_data_handle_t handle)
+\ingroup API_Data_Management
+This retrieves the "user_data" field previously set for the \p handle .
+
 @name Access registered data from the application
 \ingroup API_Data_Management
 
@@ -291,22 +302,39 @@ Similarly to starpu_data_acquire_cb(), this function is
 non-blocking and may be called from task callbacks. Upon successful
 completion, this function returns 0.
 
+\def STARPU_ACQUIRE_NO_NODE
+\ingroup API_Data_Management
+This macro can be used to acquire data, but not require it to be available on a given node, only enforce R/W dependencies.
+This can for instance be used to wait for tasks which produce the data, but without requesting a fetch to the main memory.
+
+\def STARPU_ACQUIRE_ALL_NODES
+\ingroup API_Data_Management
+This is the same as STARPU_ACQUIRE_NO_NODE, but will lock the data on all nodes, preventing them from being evicted for instance.
+This is mostly useful inside starpu only.
+
 \fn int starpu_data_acquire_on_node(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode)
 \ingroup API_Data_Management
 This is the same as starpu_data_acquire(), except that the data
-will be available on the given memory node instead of main memory.
+will be available on the given memory node instead of main
+memory.
+STARPU_ACQUIRE_NO_NODE and STARPU_ACQUIRE_ALL_NODES can be used instead of an
+explicit node number.
 
 \fn int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
 \ingroup API_Data_Management
 This is the same as starpu_data_acquire_cb(), except that the
 data will be available on the given memory node instead of main
 memory.
+STARPU_ACQUIRE_NO_NODE and STARPU_ACQUIRE_ALL_NODES can be used instead of an
+explicit node number.
 
 \fn int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
 \ingroup API_Data_Management
 This is the same as starpu_data_acquire_cb_sequential_consistency(), except that the
 data will be available on the given memory node instead of main
 memory.
+STARPU_ACQUIRE_NO_NODE and STARPU_ACQUIRE_ALL_NODES can be used instead of an
+explicit node number.
 
 \def STARPU_DATA_ACQUIRE_CB(handle, mode, code)
 \ingroup API_Data_Management
@@ -327,6 +355,7 @@ starpu_data_acquire_cb().
 \ingroup API_Data_Management
 This is the same as starpu_data_release(), except that the data
 will be available on the given memory \p node instead of main memory.
+The \p node parameter must be exactly the same as the corresponding starpu_data_acquire_on_node* call.
 
 \fn starpu_arbiter_t starpu_arbiter_create(void)
 \ingroup API_Data_Management

+ 27 - 4
doc/doxygen/chapters/api/performance_model.doxy

@@ -2,7 +2,7 @@
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
- * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2011, 2012, 2016 INRIA
  * See the file version.doxy for copying conditions.
  */
 
@@ -26,6 +26,9 @@ Automatic linear regression-based cost model  (alpha * size ^ beta)
 \var starpu_perfmodel_type::STARPU_NL_REGRESSION_BASED
 \ingroup API_Performance_Model
 Automatic non-linear regression-based cost model (a * size ^ b + c)
+\var starpu_perfmodel_type::STARPU_MULTIPLE_REGRESSION_BASED
+\ingroup API_Performance_Model
+Automatic multiple linear regression-based cost model. Application provides parameters, their combinations and exponents
 
 \struct starpu_perfmodel_device
 todo
@@ -60,6 +63,8 @@ is the type of performance model
 ::STARPU_NL_REGRESSION_BASED: No other fields needs to be provided,
 this is purely history-based.
 </li>
+<li> ::STARPU_MULTIPLE_REGRESSION_BASED: Need to provide fields starpu_perfmodel::nparameters (number of different parameters),  starpu_perfmodel::ncombinations (number of parameters combinations-tuples) and table starpu_perfmodel::combinations which defines exponents of the equation. Function cl_perf_func also needs to define how to extract parameters from the task.
+</li>
 <li> ::STARPU_PER_ARCH: either field starpu_perfmodel::arch_cost_function has to be
 filled with a function that returns the cost in micro-seconds on the arch given
 as parameter, or field starpu_perfmodel::per_arch has to be
@@ -99,6 +104,18 @@ Whether the performance model is already loaded from the disk.
 todo
 \var starpu_perfmodel_state_t starpu_perfmodel::state
 \private
+\var const char ** starpu_perfmodel::parameters_names
+\private
+Names of parameters used for multiple linear regression models (M, N, K)
+\var unsigned starpu_perfmodel::nparameters
+\private
+Number of parameters used for multiple linear regression models
+\var unsigned ** starpu_perfmodel::combinations
+\private
+Table of combinations of parameters (and the exponents) used for multiple linear regression models
+\var unsigned starpu_perfmodel::ncombinations
+\private
+Number of combination of parameters used for multiple linear regression models
 
 
 \struct starpu_perfmodel_regression_model
@@ -132,6 +149,12 @@ estimated = a size ^b + c
 whether the non-linear regression model is valid (i.e. enough measures)
 \var unsigned starpu_perfmodel_regression_model::nsample
 number of sample values for non-linear regression
+\var double starpu_perfmodel_regression_model::coeff[]
+list of computed coefficients for multiple linear regression model
+\var double starpu_perfmodel_regression_model::ncoeff
+number of coefficients for multiple linear regression model
+\var double starpu_perfmodel_regression_model::multi_valid
+whether the multiple linear regression model is valid
 
 \struct starpu_perfmodel_per_arch
 contains information about the performance model of a given
@@ -150,12 +173,12 @@ depends on the architecture-specific implementation.
 The history of performance measurements.
 \var struct starpu_perfmodel_history_list *starpu_perfmodel_per_arch::list
 \private
-Used by ::STARPU_HISTORY_BASED and ::STARPU_NL_REGRESSION_BASED,
+Used by ::STARPU_HISTORY_BASED, ::STARPU_NL_REGRESSION_BASED and ::STARPU_MULTIPLE_REGRESSION_BASED,
 records all execution history measures.
 \var struct starpu_perfmodel_regression_model starpu_perfmodel_per_arch::regression
 \private
-Used by ::STARPU_REGRESSION_BASED and
-::STARPU_NL_REGRESSION_BASED, contains the estimated factors of the
+Used by ::STARPU_REGRESSION_BASED, 
+::STARPU_NL_REGRESSION_BASED and ::STARPU_MULTIPLE_REGRESSION_BASED, contains the estimated factors of the
 regression.
 
 \struct starpu_perfmodel_history_list

+ 2 - 2
doc/tutorial/hello_world.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2011  Université de Bordeaux
- * Copyright (C) 2010-2011, 2013  CNRS
+ * Copyright (C) 2010-2011, 2013, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -38,7 +38,7 @@ struct starpu_codelet cl =
 
 void callback_func(void *callback_arg)
 {
-	printf("Callback function (arg %x)\n", callback_arg);
+	printf("Callback function (arg %p)\n", callback_arg);
 }
 
 int main(int argc, char **argv)

+ 20 - 0
examples/Makefile.am

@@ -199,7 +199,9 @@ STARPU_EXAMPLES +=				\
 	basic_examples/variable			\
 	basic_examples/multiformat              \
 	basic_examples/dynamic_handles          \
+	mlr/mlr					\
 	cpp/incrementer_cpp			\
+	cpp/add_vectors				\
 	filters/fvector				\
 	filters/fblock				\
 	filters/fmatrix				\
@@ -228,12 +230,18 @@ STARPU_EXAMPLES +=				\
 	worker_collections/worker_tree_example  \
 	reductions/dot_product			\
 	reductions/minmax_reduction
+
 endif
 
 if !STARPU_SIMGRID
 STARPU_EXAMPLES +=				\
 	scheduler/dummy_sched
 
+if STARPU_HAVE_CXX11
+STARPU_EXAMPLES +=	\
+	cpp/add_vectors_cpp11
+endif
+
 if STARPU_HAVE_F77
 if STARPU_HAVE_F77_H
 STARPU_EXAMPLES +=				\
@@ -837,6 +845,18 @@ cpp_incrementer_cpp_SOURCES += \
 	incrementer/incrementer_kernels_opencl.c
 endif
 
+###########################
+# C++ Add vectors example #
+###########################
+
+cpp_add_vectors_SOURCES	=	\
+	cpp/add_vectors.cpp
+	
+if STARPU_HAVE_CXX11
+cpp_add_vectors_cpp11_SOURCES	=	\
+	cpp/add_vectors_cpp11.cpp
+endif
+
 #######################
 # Incrementer example #
 #######################

+ 37 - 0
examples/cpp/Makefile.add_vectors

@@ -0,0 +1,37 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2016  Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+PROG = add_vectors
+
+SRCCXX = add_vectors.cpp
+
+CC = g++
+
+CFLAGS = -g -DPRINT_OUTPUT $(shell pkg-config --cflags starpu-1.3)
+LDLIBS =  $(shell pkg-config --libs starpu-1.3)
+
+OBJS = $(SRCCXX:%.cpp=%.o)
+
+.phony: all clean
+all: $(PROG)
+
+$(PROG): $(OBJS)
+	$(CC) $(LDFLAGS) -o $@ $^ $(LDLIBS)
+
+%.o: %.cpp
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+clean:
+	rm -fv *.o $(PROG)

+ 37 - 0
examples/cpp/Makefile.add_vectors_cpp11

@@ -0,0 +1,37 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2016  Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+PROG = add_vectors_cpp11
+
+SRCCXX = add_vectors_cpp11.cpp
+
+CC = g++
+
+CFLAGS = -g -std=c++11 -DPRINT_OUTPUT $(shell pkg-config --cflags starpu-1.3)
+LDLIBS =  $(shell pkg-config --libs starpu-1.3)
+
+OBJS = $(SRCCXX:%.cpp=%.o)
+
+.phony: all clean
+all: $(PROG)
+
+$(PROG): $(OBJS)
+	$(CC) $(LDFLAGS) -o $@ $^ $(LDLIBS)
+
+%.o: %.cpp
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+clean:
+	rm -fv *.o $(PROG)

+ 148 - 0
examples/cpp/add_vectors.cpp

@@ -0,0 +1,148 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2011, 2013-2015  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2012 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This is a small example of a C++ program using starpu.  We here just
+ * add two std::vector without copying them (0 copy).
+ */
+
+#include <cassert>
+#include <vector>
+
+#ifdef PRINT_OUTPUT
+#include <iostream>
+#endif
+
+#include <starpu.h>
+
+void cpu_kernel_add_vectors(void *buffers[], void *cl_arg)
+{
+	// get the current task
+	starpu_task* task = starpu_task_get_current();
+
+	// get the user data (pointers to the vec_A, vec_B, vec_C std::vector)
+	void* u_data0 = starpu_data_get_user_data(task->handles[0]); assert(u_data0);
+	void* u_data1 = starpu_data_get_user_data(task->handles[1]); assert(u_data1);
+	void* u_data2 = starpu_data_get_user_data(task->handles[2]); assert(u_data2);
+
+	// cast void* in std::vector<char>*
+	std::vector<char>* vec_A = static_cast<std::vector<char>*>(u_data0);
+	std::vector<char>* vec_B = static_cast<std::vector<char>*>(u_data1);
+	std::vector<char>* vec_C = static_cast<std::vector<char>*>(u_data2);
+
+	// all the std::vector have to have the same size
+	assert(vec_A->size() == vec_B->size() && vec_B->size() == vec_C->size());
+
+	// performs the vector addition (vec_C[] = vec_A[] + vec_B[])
+	for (size_t i = 0; i < vec_C->size(); i++)
+		(*vec_C)[i] = (*vec_A)[i] + (*vec_B)[i];
+}
+
+#define VEC_SIZE 1024
+
+int main(int argc, char **argv)
+{
+	std::vector<char> vec_A(VEC_SIZE, 2); // all the vector is initialized to 2
+	std::vector<char> vec_B(VEC_SIZE, 3); // all the vector is initialized to 3
+	std::vector<char> vec_C(VEC_SIZE, 0); // all the vector is initialized to 0
+
+	// initialize StarPU with default configuration
+	int ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	// StarPU data registering
+	starpu_data_handle_t spu_vec_A;
+	starpu_data_handle_t spu_vec_B;
+	starpu_data_handle_t spu_vec_C;
+
+	// give the data of the vector to StarPU (C array)
+	starpu_vector_data_register(&spu_vec_A, STARPU_MAIN_RAM, (uintptr_t)&vec_A[0], vec_A.size(), sizeof(char));
+	starpu_vector_data_register(&spu_vec_B, STARPU_MAIN_RAM, (uintptr_t)&vec_B[0], vec_B.size(), sizeof(char));
+	starpu_vector_data_register(&spu_vec_C, STARPU_MAIN_RAM, (uintptr_t)&vec_C[0], vec_C.size(), sizeof(char));
+
+	// pass the pointer to the C++ vector object to StarPU
+	starpu_data_set_user_data(spu_vec_A, (void*)&vec_A);
+	starpu_data_set_user_data(spu_vec_B, (void*)&vec_B);
+	starpu_data_set_user_data(spu_vec_C, (void*)&vec_C);
+
+	// create the StarPU codelet
+	starpu_codelet cl;
+	starpu_codelet_init(&cl);
+	cl.cpu_funcs     [0] = cpu_kernel_add_vectors;
+	cl.cpu_funcs_name[0] = "cpu_kernel_add_vectors";
+	cl.nbuffers          = 3;
+	cl.modes         [0] = STARPU_R;
+	cl.modes         [1] = STARPU_R;
+	cl.modes         [2] = STARPU_W;
+	cl.name              = "add_vectors";
+
+	// submit a new StarPU task to execute
+	ret = starpu_task_insert(&cl,
+	                         STARPU_R, spu_vec_A,
+	                         STARPU_R, spu_vec_B,
+	                         STARPU_W, spu_vec_C,
+	                         0);
+	if (ret == -ENODEV)
+	{
+		// StarPU data unregistering
+		starpu_data_unregister(spu_vec_C);
+		starpu_data_unregister(spu_vec_B);
+		starpu_data_unregister(spu_vec_A);
+
+		// terminate StarPU, no task can be submitted after
+		starpu_shutdown();
+
+		return 77;
+	}
+
+	STARPU_CHECK_RETURN_VALUE(ret, "task_submit::add_vectors");
+
+	// wait the task
+	starpu_task_wait_for_all();
+
+	// StarPU data unregistering
+	starpu_data_unregister(spu_vec_C);
+	starpu_data_unregister(spu_vec_B);
+	starpu_data_unregister(spu_vec_A);
+
+	// terminate StarPU, no task can be submitted after
+	starpu_shutdown();
+
+	// check results
+	bool fail = false;
+	int i = 0;
+	while (!fail && i < VEC_SIZE)
+		fail = vec_C[i++] != 5;
+
+	if (fail)
+	{
+#ifdef PRINT_OUTPUT
+		std::cout << "Example failed..." << std::endl;
+#endif
+		return EXIT_FAILURE;
+	}
+	else
+	{
+#ifdef PRINT_OUTPUT
+		std::cout << "Example successfully passed!" << std::endl;
+#endif
+		return EXIT_SUCCESS;
+	}
+}

+ 155 - 0
examples/cpp/add_vectors_cpp11.cpp

@@ -0,0 +1,155 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2011, 2013-2015  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2012 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This is a small example of a C++ program using starpu.  We here just
+ * add two std::vector without copying them (0 copy).
+ */
+
+#include <cassert>
+#include <vector>
+
+#ifdef PRINT_OUTPUT
+#include <iostream>
+#endif
+
+#include <starpu.h>
+#if !defined(STARPU_HAVE_CXX11)
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+void cpu_kernel_add_vectors(void *buffers[], void *cl_arg)
+{
+	// get the current task
+	auto task = starpu_task_get_current();
+
+	// get the user data (pointers to the vec_A, vec_B, vec_C std::vector)
+	auto u_data0 = starpu_data_get_user_data(task->handles[0]); assert(u_data0);
+	auto u_data1 = starpu_data_get_user_data(task->handles[1]); assert(u_data1);
+	auto u_data2 = starpu_data_get_user_data(task->handles[2]); assert(u_data2);
+
+	// cast void* in std::vector<char>*
+	auto vec_A = static_cast<std::vector<char>*>(u_data0);
+	auto vec_B = static_cast<std::vector<char>*>(u_data1);
+	auto vec_C = static_cast<std::vector<char>*>(u_data2);
+
+	// all the std::vector have to have the same size
+	assert(vec_A->size() == vec_B->size() && vec_B->size() == vec_C->size());
+
+	// performs the vector addition (vec_C[] = vec_A[] + vec_B[])
+	for (size_t i = 0; i < vec_C->size(); i++)
+		(*vec_C)[i] = (*vec_A)[i] + (*vec_B)[i];
+}
+
+int main(int argc, char **argv)
+{
+	constexpr int vec_size = 1024;
+
+	std::vector<char> vec_A(vec_size, 2); // all the vector is initialized to 2
+	std::vector<char> vec_B(vec_size, 3); // all the vector is initialized to 3
+	std::vector<char> vec_C(vec_size, 0); // all the vector is initialized to 0
+
+	// initialize StarPU with default configuration
+	auto ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	// StarPU data registering
+	starpu_data_handle_t spu_vec_A;
+	starpu_data_handle_t spu_vec_B;
+	starpu_data_handle_t spu_vec_C;
+
+	// give the data of the vector to StarPU (C array)
+	starpu_vector_data_register(&spu_vec_A, STARPU_MAIN_RAM, (uintptr_t)vec_A.data(), vec_A.size(), sizeof(char));
+	starpu_vector_data_register(&spu_vec_B, STARPU_MAIN_RAM, (uintptr_t)vec_B.data(), vec_B.size(), sizeof(char));
+	starpu_vector_data_register(&spu_vec_C, STARPU_MAIN_RAM, (uintptr_t)vec_C.data(), vec_C.size(), sizeof(char));
+
+	// pass the pointer to the C++ vector object to StarPU
+	starpu_data_set_user_data(spu_vec_A, (void*)&vec_A);
+	starpu_data_set_user_data(spu_vec_B, (void*)&vec_B);
+	starpu_data_set_user_data(spu_vec_C, (void*)&vec_C);
+
+	// create the StarPU codelet
+	starpu_codelet cl;
+	starpu_codelet_init(&cl);
+	cl.cpu_funcs     [0] = cpu_kernel_add_vectors;
+	cl.cpu_funcs_name[0] = "cpu_kernel_add_vectors";
+	cl.nbuffers          = 3;
+	cl.modes         [0] = STARPU_R;
+	cl.modes         [1] = STARPU_R;
+	cl.modes         [2] = STARPU_W;
+	cl.name              = "add_vectors";
+
+	// submit a new StarPU task to execute
+	ret = starpu_task_insert(&cl,
+	                         STARPU_R, spu_vec_A,
+	                         STARPU_R, spu_vec_B,
+	                         STARPU_W, spu_vec_C,
+	                         0);
+
+	if (ret == -ENODEV)
+	{
+		// StarPU data unregistering
+		starpu_data_unregister(spu_vec_C);
+		starpu_data_unregister(spu_vec_B);
+		starpu_data_unregister(spu_vec_A);
+
+		// terminate StarPU, no task can be submitted after
+		starpu_shutdown();
+
+		return 77;
+	}
+
+	STARPU_CHECK_RETURN_VALUE(ret, "task_submit::add_vectors");
+
+	// wait the task
+	starpu_task_wait_for_all();
+
+	// StarPU data unregistering
+	starpu_data_unregister(spu_vec_C);
+	starpu_data_unregister(spu_vec_B);
+	starpu_data_unregister(spu_vec_A);
+
+	// terminate StarPU, no task can be submitted after
+	starpu_shutdown();
+
+	// check results
+	auto fail = false;
+	auto i = 0;
+	while (!fail && i < vec_size)
+		fail = vec_C[i++] != 5;
+
+	if (fail)
+	{
+#ifdef PRINT_OUTPUT
+		std::cout << "Example failed..." << std::endl;
+#endif
+		return EXIT_FAILURE;
+	}
+	else
+	{
+#ifdef PRINT_OUTPUT
+		std::cout << "Example successfully passed!" << std::endl;
+#endif
+		return EXIT_SUCCESS;
+	}
+}
+#endif

+ 1 - 0
examples/heat/dw_factolu_grain.c

@@ -379,5 +379,6 @@ void dw_factoLU_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks,
 
 #ifdef CHECK_RESULTS
 	compare_A_LU(Asaved, matA, size, ld);
+	free(Asaved);
 #endif
 }

+ 172 - 0
examples/mlr/mlr.c

@@ -0,0 +1,172 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016 Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This examples demonstrates how to use multiple linear regression
+   models.
+
+   First, there is mlr_codelet__init codelet for which we know the
+   parameters, but not the their exponents and relations. This tasks
+   should be benchmarked and analyzed to find the model, using
+   "tools/starpu_mlr_analysis" script as a template.
+
+   For the second (codelet cl_model_final), it is assumed that the
+   analysis has already been performed and that the duration of the
+   codelet mlr_codelet_final will be computed using the following
+   equation:
+
+   T = a + b * (M^2*N) + c * (N^3*K)
+
+   where M, N, K are the parameters of the task, exponents are coming
+   from model->combinations[..][..]  and finally a, b, c are
+   coefficients which mostly depend on the machine speed.
+   
+   These coefficients are going to be automatically computed using
+   least square method.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <starpu.h>
+
+static long sum;
+
+/* Performance function of the task, which is in this case very simple, as the parameter values just need to be written in the array "parameters" */
+static void cl_params(struct starpu_task *task, double *parameters)
+{
+	int m, n, k;
+	starpu_codelet_unpack_args(task->cl_arg, &m, &n, &k);
+	parameters[0] = m;
+	parameters[1] = n;
+	parameters[2] = k;
+}
+
+/* Function of the task that will be executed. In this case running dummy cycles, just to make sure task duration is significant */
+void cpu_func(void *buffers[], void *cl_arg)
+{
+	long i;
+	int m,n,k;
+	starpu_codelet_unpack_args(cl_arg,
+			     	  &m,
+     			     	  &n,
+     			     	  &k);
+	
+	for(i=0; i < (long) (m*m*n); i++)
+		sum+=i;
+
+	for(i=0; i < (long) (n*n*n*k); i++)
+		sum+=i;
+}
+
+/* ############################################ */
+/* Start of the part specific to multiple linear regression perfmodels */
+
+/* Defining perfmodel, number of parameters and their names Initially
+   application developer only knows these parameters. The execution of
+   this codelet will generate traces that can be analyzed using
+   "tools/starpu_mlr_analysis" as a template to obtain the parameters
+   combinations and exponents.
+ */
+
+static const char * parameters_names[]	= {	"M",	"N",	"K", };
+
+static struct starpu_perfmodel cl_model_init = {
+	.type = STARPU_MULTIPLE_REGRESSION_BASED,
+	.symbol = "mlr_init",
+	.parameters = cl_params,
+	.nparameters = 3,
+	.parameters_names = parameters_names,
+};
+
+/* Defining the equation for modeling duration of the task. The
+   parameters combinations and exponents are computed externally
+   offline, for example using "tools/starpu_mlr_analysis" tool as a
+   template.
+ */
+
+static unsigned combi1 [3]		= {	2,	1,	0 };
+static unsigned combi2 [3]		= {	0,	3,	1 };
+
+static unsigned *combinations[] = { combi1, combi2 };
+
+static struct starpu_perfmodel cl_model_final = {
+	.type = STARPU_MULTIPLE_REGRESSION_BASED,
+	.symbol = "mlr_final",
+	.parameters = cl_params,
+	.nparameters = 3,
+	.parameters_names = parameters_names,
+	.ncombinations = 2,
+	.combinations = combinations,
+};
+
+/* End of the part specific to multiple linear regression perfmodels */
+/* ############################################ */
+
+static struct starpu_codelet cl_init = {
+	.cpu_funcs = { cpu_func },
+	.cpu_funcs_name = { "mlr_codelet_init" },
+	.nbuffers = 0,
+	.model = &cl_model_init,
+};
+
+static struct starpu_codelet cl_final = {
+	.cpu_funcs = { cpu_func },
+	.cpu_funcs_name = { "mlr_codelet_final" },
+	.nbuffers = 0,
+	.model = &cl_model_final,
+};
+
+
+int main(int argc, char **argv)
+{
+	/* Initialization */
+	unsigned i,j;
+	int ret;
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	
+	sum=0;
+	int m,n,k;
+
+        /* Giving pseudo-random values to the M,N,K parameters and inserting tasks */
+	for(i=0; i < 42; i++)
+	{
+		m = (int) ((rand() % 10)+1);
+		n = (int) ((rand() % 10)+1);
+		k = (int) ((rand() % 10)+1);
+		
+		for(j=0; j < 42; j++)
+		{
+			starpu_insert_task(&cl_init,
+				   STARPU_VALUE, &m, sizeof(int),
+				   STARPU_VALUE, &n, sizeof(int),
+				   STARPU_VALUE, &k, sizeof(int),
+				   0);
+			starpu_insert_task(&cl_final,
+				   STARPU_VALUE, &m, sizeof(int),
+				   STARPU_VALUE, &n, sizeof(int),
+				   STARPU_VALUE, &k, sizeof(int),
+				   0);
+		}
+	}
+			  
+	starpu_shutdown();
+
+	return 0;
+}

+ 3 - 2
examples/ppm_downscaler/yuv_downscaler.c

@@ -112,6 +112,7 @@ static struct starpu_data_filter filter_uv =
 int main(int argc, char **argv)
 {
 	int ret;
+	size_t sret;
 
 	assert(HEIGHT % (2*BLOCK_HEIGHT) == 0);
 	assert(HEIGHT % FACTOR == 0);
@@ -146,8 +147,8 @@ int main(int argc, char **argv)
 	FILE *f_out = fopen(filename_out, "w+");
 	assert(f_out);
 
-	ret = fread(yuv_in_buffer, FRAMESIZE, nframes, f_in);
-	assert(ret == nframes);
+	sret = fread(yuv_in_buffer, FRAMESIZE, nframes, f_in);
+	assert(sret == nframes);
 
 	starpu_data_handle_t *frame_y_handle = (starpu_data_handle_t *)  calloc(nframes, sizeof(starpu_data_handle_t));
 	starpu_data_handle_t *frame_u_handle = (starpu_data_handle_t *)  calloc(nframes, sizeof(starpu_data_handle_t));

+ 4 - 1
examples/sched_ctx/parallel_tasks_reuse_handle.c

@@ -68,6 +68,8 @@ static struct starpu_codelet init_parallel_worker_cl=
 void parallel_task_init_one_context(unsigned * context_id)
 {
 	struct starpu_task * t;
+	int ret;
+
 	t = starpu_task_build(&init_parallel_worker_cl,
 			      STARPU_SCHED_CTX, *context_id,
 			      0);
@@ -78,7 +80,8 @@ void parallel_task_init_one_context(unsigned * context_id)
 	t->prologue_callback_pop_arg=context_id;
 	t->prologue_callback_pop_arg_free=0;
 
-	starpu_task_submit(t);
+	ret = starpu_task_submit(t);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 }
 
 struct context main_context;

+ 4 - 0
examples/scheduler/dummy_sched.c

@@ -156,6 +156,10 @@ int main(int argc, char **argv)
 	int ret;
 	struct starpu_conf conf;
 
+#ifdef STARPU_HAVE_UNSETENV
+	unsetenv("STARPU_SCHED");
+#endif
+
 	starpu_conf_init(&conf);
 	conf.sched_policy = &dummy_sched_policy,
 	ret = starpu_init(&conf);

+ 1 - 1
examples/scheduler/heteroprio_test.c

@@ -117,7 +117,7 @@ int main(int argc, char** argv)
 
 	ncpus = starpu_cpu_worker_get_count();
 	nopencls = starpu_opencl_worker_get_count();
-	FPRINTF(stderr, "Worker = %d\n",  starpu_worker_get_count());
+	FPRINTF(stderr, "Worker = %u\n",  starpu_worker_get_count());
 	FPRINTF(stderr, "Worker CPU = %d\n", ncpus);
 	FPRINTF(stderr, "Worker OpenCL = %d\n", nopencls);
 	if (ncpus + nopencls == 0)

+ 7 - 6
examples/spmv/dw_block_spmv.c

@@ -63,7 +63,7 @@ void create_data(void)
 
 	/* declare the corresponding block CSR to the runtime */
 	starpu_bcsr_data_register(&sparse_matrix, STARPU_MAIN_RAM, bcsr_matrix->nnz_blocks, bcsr_matrix->nrows_blocks,
-	                (uintptr_t)bcsr_matrix->val, bcsr_matrix->colind, bcsr_matrix->rowptr, 
+	                (uintptr_t)bcsr_matrix->val, bcsr_matrix->colind, bcsr_matrix->rowptr,
 			0, bcsr_matrix->r, bcsr_matrix->c, sizeof(float));
 
 	size = c*r*starpu_bcsr_get_nnz(sparse_matrix);
@@ -125,7 +125,7 @@ unsigned get_bcsr_nchildren(STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f
   return (unsigned)starpu_bcsr_get_nnz(handle);
 }
 
-struct starpu_data_interface_ops *get_bcsr_child_ops(STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, STARPU_ATTRIBUTE_UNUSED unsigned child) 
+struct starpu_data_interface_ops *get_bcsr_child_ops(STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, STARPU_ATTRIBUTE_UNUSED unsigned child)
 {
   return &starpu_interface_matrix_ops;
 }
@@ -145,7 +145,7 @@ void call_filters(void)
 	vector_in_f.nchildren  = size/c;
 	vector_in_f.get_nchildren  = NULL;
 	vector_in_f.get_child_ops  = NULL;
-	
+
 	vector_out_f.filter_func = starpu_vector_filter_block;
 	vector_out_f.nchildren  = size/r;
 	vector_out_f.get_nchildren  = NULL;
@@ -178,8 +178,8 @@ void launch_spmv_codelets(void)
 	int ret;
 
 	/* we call one codelet per block */
-	unsigned nblocks = starpu_bcsr_get_nnz(sparse_matrix); 
-	unsigned nrows = starpu_bcsr_get_nrow(sparse_matrix); 
+	unsigned nblocks = starpu_bcsr_get_nnz(sparse_matrix);
+	unsigned nrows = starpu_bcsr_get_nrow(sparse_matrix);
 
 	remainingtasks = NSPMV*nblocks;
 	totaltasks = remainingtasks;
@@ -235,7 +235,7 @@ void launch_spmv_codelets(void)
 				task->handles[1] = starpu_data_get_sub_data(vector_in, 1, i);
 				task->handles[2] = starpu_data_get_sub_data(vector_out, 1, j);
 
-				/* all tasks in the same row are dependant so that we don't wait too much for data 
+				/* all tasks in the same row are dependant so that we don't wait too much for data
 				 * we need to wait on the previous task if we are not the first task of a row */
 				if (index != rowptr[row & ~0x3])
 				{
@@ -272,6 +272,7 @@ void launch_spmv_codelets(void)
 	}
 
 	printf("end of task submission (there was %u chains for %u tasks : ratio %u tasks per chain) !\n", nchains, totaltasks, totaltasks/nchains);
+	free(is_entry_tab);
 }
 
 void init_problem(void)

+ 7 - 7
examples/spmv/matrix_market/mmio.c

@@ -103,12 +103,12 @@ int mm_is_valid(MM_typecode matcode)
 
 int mm_read_banner(FILE *f, MM_typecode *matcode)
 {
-	char line[MM_MAX_LINE_LENGTH];
-	char banner[MM_MAX_TOKEN_LENGTH];
-	char mtx[MM_MAX_TOKEN_LENGTH];
-	char crd[MM_MAX_TOKEN_LENGTH];
-	char data_type[MM_MAX_TOKEN_LENGTH];
-	char storage_scheme[MM_MAX_TOKEN_LENGTH];
+	char line[MM_MAX_LINE_LENGTH+1];
+	char banner[MM_MAX_TOKEN_LENGTH+1];
+	char mtx[MM_MAX_TOKEN_LENGTH+1];
+	char crd[MM_MAX_TOKEN_LENGTH+1];
+	char data_type[MM_MAX_TOKEN_LENGTH+1];
+	char storage_scheme[MM_MAX_TOKEN_LENGTH+1];
 	char *p;
 
 	mm_clear_typecode(matcode);
@@ -116,7 +116,7 @@ int mm_read_banner(FILE *f, MM_typecode *matcode)
 	if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
 		return MM_PREMATURE_EOF;
 
-	if (sscanf(line, "%MM_MAX_TOKEN_LENGTHs %MM_MAX_TOKEN_LENGTHs %MM_MAX_TOKEN_LENGTHs %MM_MAX_TOKEN_LENGTHs %MM_MAX_TOKEN_LENGTHs", banner, mtx, crd, data_type, storage_scheme) != 5)
+	if (sscanf(line, "%"MM_MAX_TOKEN_LENGTH_S"s %"MM_MAX_TOKEN_LENGTH_S"s %"MM_MAX_TOKEN_LENGTH_S"s %"MM_MAX_TOKEN_LENGTH_S"s %"MM_MAX_TOKEN_LENGTH_S"s", banner, mtx, crd, data_type, storage_scheme) != 5)
 		return MM_PREMATURE_EOF;
 
 	for (p=mtx; *p!='\0'; *p=tolower(*p),p++);  /* convert to lower case */

+ 1 - 0
examples/spmv/matrix_market/mmio.h

@@ -27,6 +27,7 @@
 #define MM_MAX_LINE_LENGTH 1025
 #define MatrixMarketBanner "%%MatrixMarket"
 #define MM_MAX_TOKEN_LENGTH 64
+#define MM_MAX_TOKEN_LENGTH_S "64"
 
 typedef char MM_typecode[4];
 

+ 29 - 2
examples/stencil/Makefile.am

@@ -93,12 +93,14 @@ endif
 ###################
 
 STARPU_EXAMPLES =				\
-	stencil
+	stencil					\
+	implicit_stencil
 
 examplebindir = $(libdir)/starpu/examples/stencil
 
 examplebin_PROGRAMS =				\
-	stencil
+	stencil					\
+	implicit_stencil
 
 stencil_SOURCES =				\
 	life.c					\
@@ -109,6 +111,7 @@ stencil_SOURCES =				\
 
 noinst_HEADERS =				\
 	stencil.h				\
+	implicit-stencil.h			\
 	shadow.h
 
 if STARPU_USE_CUDA
@@ -123,6 +126,30 @@ stencil_SOURCES +=				\
 	shadow_opencl.c
 endif
 
+implicit_stencil_SOURCES =			\
+	life.c					\
+	implicit-stencil-kernels.c		\
+	implicit-stencil-tasks.c		\
+	implicit-stencil-blocks.c		\
+	implicit-stencil.c
+
+noinst_HEADERS =				\
+	stencil.h				\
+	implicit-stencil.h			\
+	shadow.h
+
+if STARPU_USE_CUDA
+implicit_stencil_SOURCES +=			\
+	life_cuda.cu				\
+	shadow.cu
+endif
+
+if STARPU_USE_OPENCL
+implicit_stencil_SOURCES +=			\
+	life_opencl.c				\
+	shadow_opencl.c
+endif
+
 outs =						\
 	0.5.out					\
 	0.out					\

+ 4 - 0
examples/stencil/README

@@ -42,3 +42,7 @@ penalty ratio), run make pics or make view to get pictures.
 mpi.out: results on MPI.
 
 results: a few results
+
+You can also use the implicit distributed flavour of this application (e.g.
+with communications between processes automatically inferred by StarPU-MPI),
+which is called implicit_stencil.

+ 444 - 0
examples/stencil/implicit-stencil-blocks.c

@@ -0,0 +1,444 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013-2016  Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "implicit-stencil.h"
+#include <math.h>
+
+/* Manage block and tags allocation */
+
+static struct block_description *blocks;
+static unsigned sizex, sizey, sizez;
+static unsigned nbz;
+static unsigned *block_sizes_z;
+
+/*
+ *	Tags for various codelet completion
+ */
+
+/*
+ * common tag format:
+ */
+static starpu_tag_t tag_common(int z, int dir, int type)
+{
+	return (((((starpu_tag_t)type) << 4) | ((dir+1)/2)) << 32)|(starpu_tag_t)z;
+}
+
+/* Completion of last update tasks */
+starpu_tag_t TAG_FINISH(int z)
+{
+	z = (z + nbz)%nbz;
+
+	starpu_tag_t tag = tag_common(z, 0, 1);
+	return tag;
+}
+
+/* Completion of the save codelet for MPI send/recv */
+starpu_tag_t TAG_START(int z, int dir)
+{
+	z = (z + nbz)%nbz;
+
+	starpu_tag_t tag = tag_common(z, dir, 2);
+	return tag;
+}
+
+/*
+ * common MPI tag format:
+ */
+static int mpi_tag_common(int z, int dir, int layer_or_boundary, int buffer)
+{
+	return (z<<12) | (layer_or_boundary << 8) | ((((1+dir)/2))<<4) | buffer;
+}
+
+int MPI_TAG_LAYERS(int z, int buffer)
+{
+	z = (z + nbz)%nbz;
+
+    /* No direction for layers ; layer is 0 */
+	int tag = mpi_tag_common(z, 0, 0, buffer);
+
+	return tag;
+}
+
+int MPI_TAG_BOUNDARIES(int z, int dir, int buffer)
+{
+	z = (z + nbz)%nbz;
+
+	int tag = mpi_tag_common(z, dir, 1, buffer);
+
+	return tag;
+}
+
+
+/*
+ *	Block descriptors
+ */
+
+/* Compute the size of the different blocks */
+static void compute_block_sizes(void)
+{
+	block_sizes_z = (unsigned *) malloc(nbz*sizeof(unsigned));
+	STARPU_ASSERT(block_sizes_z);
+
+	/* Perhaps the last chunk is smaller */
+	unsigned default_block_size = (sizez+nbz-1)/nbz;
+	unsigned remaining = sizez;
+
+	unsigned b;
+	for (b = 0; b < nbz; b++)
+	{
+		block_sizes_z[b] = MIN(default_block_size, remaining);
+		remaining -= block_sizes_z[b];
+	}
+
+	STARPU_ASSERT(remaining == 0);
+}
+
+unsigned get_block_size(int bz)
+{
+	return block_sizes_z[bz];
+}
+
+struct block_description *get_block_description(int z)
+{
+	z = (z + nbz)%nbz;
+
+	STARPU_ASSERT(&blocks[z]);
+
+	return &blocks[z];
+}
+
+int get_block_mpi_node(int z)
+{
+	z = (z + nbz)%nbz;
+	return blocks[z].mpi_node;
+}
+
+void create_blocks_array(unsigned _sizex, unsigned _sizey, unsigned _sizez, unsigned _nbz)
+{
+	/* Store the parameters */
+	nbz = _nbz;
+	sizex = _sizex;
+	sizey = _sizey;
+	sizez = _sizez;
+
+	/* Create a grid of block descriptors */
+	blocks = (struct block_description *) calloc(nbz, sizeof(struct block_description));
+	STARPU_ASSERT(blocks);
+
+	/* What is the size of the different blocks ? */
+	compute_block_sizes();
+
+	unsigned bz;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description * block =
+				get_block_description(bz);
+
+		/* Which block is it ? */
+		block->bz = bz;
+
+		/* For simplicity, we store which are the neighbours blocks */
+		block->boundary_blocks[B] = get_block_description((bz-1+nbz)%nbz);
+		block->boundary_blocks[T] = get_block_description((bz+1)%nbz);
+	}
+}
+
+void free_blocks_array()
+{
+	free(blocks);
+	free(block_sizes_z);
+}
+
+/*
+ *	Initialization of the blocks
+ */
+
+void assign_blocks_to_workers(int rank)
+{
+	unsigned bz;
+
+	/* NB: perhaps we could count a GPU as multiple workers */
+
+	/* how many workers are there ? */
+	/*unsigned nworkers = starpu_worker_get_count();*/
+
+	/* how many blocks are on that MPI node ? */
+	unsigned nblocks = 0;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block =
+				get_block_description(bz);
+
+		if (block->mpi_node == rank)
+			nblocks++;
+	}
+
+	/* how many blocks per worker ? */
+	/*unsigned nblocks_per_worker = (nblocks + nworkers - 1)/nworkers;*/
+
+	/* we now attribute up to nblocks_per_worker blocks per workers */
+	unsigned attributed = 0;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block =
+				get_block_description(bz);
+
+		if (block->mpi_node == rank)
+		{
+			unsigned workerid;
+			/* Manage initial block distribution between CPU and GPU */
+		#if 0
+			#if 1
+			/* GPUs then CPUs */
+			if (attributed < 3*18)
+				workerid = attributed / 18;
+			else
+				workerid = 3+ (attributed - 3*18) / 2;
+			#else
+			/* GPUs interleaved with CPUs */
+			if ((attributed % 20) <= 1)
+				workerid = 3 + attributed / 20;
+			else if (attributed < 60)
+				workerid = attributed / 20;
+			else
+				workerid = (attributed - 60)/2 + 6;
+			#endif
+		#else
+			/* Only GPUS */
+			workerid = (attributed / 21) % 3;
+		#endif
+			/*= attributed/nblocks_per_worker;*/
+
+			block->preferred_worker = workerid;
+
+			attributed++;
+		}
+	}
+}
+
+
+
+void assign_blocks_to_mpi_nodes(int world_size)
+{
+	unsigned nzblocks_per_process = (nbz + world_size - 1) / world_size;
+
+	unsigned bz;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block =
+				get_block_description(bz);
+
+		block->mpi_node = bz / nzblocks_per_process;
+	}
+}
+
+static size_t allocated = 0;
+
+static void allocate_block_on_node(starpu_data_handle_t *handleptr, TYPE **ptr, unsigned nx, unsigned ny, unsigned nz)
+{
+	int ret;
+	size_t block_size = nx*ny*nz*sizeof(TYPE);
+
+	/* Allocate memory */
+#if 1
+	ret = starpu_malloc_flags((void **)ptr, block_size, STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	STARPU_ASSERT(ret == 0);
+#else
+	*ptr = malloc(block_size);
+	STARPU_ASSERT(*ptr);
+#endif
+
+	allocated += block_size;
+
+//#ifndef STARPU_SIMGRID
+//	/* Fill the blocks with 0 */
+//	memset(*ptr, 0, block_size);
+//#endif
+
+	/* Register it to StarPU */
+	starpu_block_data_register(handleptr, STARPU_MAIN_RAM, (uintptr_t)*ptr, nx, nx*ny, nx, ny, nz, sizeof(TYPE));
+}
+
+static void free_block_on_node(starpu_data_handle_t handleptr, unsigned nx, unsigned ny, unsigned nz)
+{
+	void *ptr = (void *) starpu_block_get_local_ptr(handleptr);
+	size_t block_size = nx*ny*nz*sizeof(TYPE);
+	starpu_data_unregister(handleptr);
+	starpu_free_flags(ptr, block_size, STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+}
+
+void display_memory_consumption(int rank, double time)
+{
+	FPRINTF(stderr, "%lu B of memory were allocated on node %d in %f ms\n", (unsigned long)allocated, rank, time/1000);
+}
+
+void allocate_memory_on_node(int rank)
+{
+	unsigned bz;
+
+	/* Correctly allocate and declare all data handles to StarPU. */
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block = get_block_description(bz);
+		int node = block->mpi_node;
+		unsigned size_bz = block_sizes_z[bz];
+
+		if (node == rank)
+		{
+			/* Main blocks */
+			allocate_block_on_node(&block->layers_handle[0], &block->layers[0],
+					       (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
+			allocate_block_on_node(&block->layers_handle[1], &block->layers[1],
+					       (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
+
+			/* Boundary blocks : Top */
+			allocate_block_on_node(&block->boundaries_handle[T][0], &block->boundaries[T][0],
+					       (sizex + 2*K), (sizey + 2*K), K);
+			allocate_block_on_node(&block->boundaries_handle[T][1], &block->boundaries[T][1],
+					       (sizex + 2*K), (sizey + 2*K), K);
+
+			/* Boundary blocks : Bottom */
+			allocate_block_on_node(&block->boundaries_handle[B][0], &block->boundaries[B][0],
+					       (sizex + 2*K), (sizey + 2*K), K);
+			allocate_block_on_node(&block->boundaries_handle[B][1], &block->boundaries[B][1],
+					       (sizex + 2*K), (sizey + 2*K), K);
+		}
+		/* Register void blocks to StarPU, that StarPU-MPI will request to
+		 * neighbour nodes if needed for the local computation */
+		else
+		{
+			/* Main blocks */
+			starpu_block_data_register(&block->layers_handle[0], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K), sizeof(TYPE));
+			starpu_block_data_register(&block->layers_handle[1], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K), sizeof(TYPE));
+
+			/* Boundary blocks : Top */
+			starpu_block_data_register(&block->boundaries_handle[T][0], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));
+			starpu_block_data_register(&block->boundaries_handle[T][1], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));
+
+			/* Boundary blocks : Bottom */
+			starpu_block_data_register(&block->boundaries_handle[B][0], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));
+			starpu_block_data_register(&block->boundaries_handle[B][1], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));
+		}
+
+#ifdef STARPU_USE_MPI
+		/* Register all data to StarPU-MPI, even the ones that are not
+		 * allocated on the local node. */
+
+		/* Main blocks */
+		starpu_mpi_data_register(block->layers_handle[0], MPI_TAG_LAYERS(bz, 0), node);
+		starpu_mpi_data_register(block->layers_handle[1], MPI_TAG_LAYERS(bz, 1), node);
+
+		/* Boundary blocks : Top */
+		starpu_mpi_data_register(block->boundaries_handle[T][0], MPI_TAG_BOUNDARIES(bz, T, 0), node);
+		starpu_mpi_data_register(block->boundaries_handle[T][1], MPI_TAG_BOUNDARIES(bz, T, 1), node);
+
+		/* Boundary blocks : Bottom */
+		starpu_mpi_data_register(block->boundaries_handle[B][0], MPI_TAG_BOUNDARIES(bz, B, 0), node);
+		starpu_mpi_data_register(block->boundaries_handle[B][1], MPI_TAG_BOUNDARIES(bz, B, 1), node);
+#endif
+	}
+
+	/* Initialize all the data in parallel */
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block = get_block_description(bz);
+		int node = block->mpi_node;
+
+		if (node == rank)
+		{
+			/* Set all the data to 0 */
+			create_task_memset(sizex, sizey, bz);
+
+			/* Initialize the first layer with some random data */
+			create_task_initlayer(sizex, sizey, bz);
+		}
+	}
+	starpu_task_wait_for_all();
+}
+
+void free_memory_on_node(int rank)
+{
+	unsigned bz;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block = get_block_description(bz);
+
+		int node = block->mpi_node;
+
+		/* Main blocks */
+		if (node == rank)
+		{
+			free_block_on_node(block->layers_handle[0], (sizex + 2*K), (sizey + 2*K), K);
+			free_block_on_node(block->layers_handle[1], (sizex + 2*K), (sizey + 2*K), K);
+		}
+        else
+        {
+            starpu_data_unregister(block->layers_handle[0]);
+            starpu_data_unregister(block->layers_handle[1]);
+        }
+
+		/* Boundary blocks : Top */
+		if (node == rank)
+		{
+			free_block_on_node(block->boundaries_handle[T][0], (sizex + 2*K), (sizey + 2*K), K);
+			free_block_on_node(block->boundaries_handle[T][1], (sizex + 2*K), (sizey + 2*K), K);
+		}
+        else
+        {
+            starpu_data_unregister(block->boundaries_handle[T][0]);
+            starpu_data_unregister(block->boundaries_handle[T][1]);
+        }
+
+		/* Boundary blocks : Bottom */
+		if (node == rank)
+		{
+			free_block_on_node(block->boundaries_handle[B][0], (sizex + 2*K), (sizey + 2*K), K);
+			free_block_on_node(block->boundaries_handle[B][1], (sizex + 2*K), (sizey + 2*K), K);
+		}
+        else
+        {
+            starpu_data_unregister(block->boundaries_handle[B][0]);
+            starpu_data_unregister(block->boundaries_handle[B][1]);
+        }
+	}
+}
+
+/* check how many cells are alive */
+void check(int rank)
+{
+	unsigned bz;
+	for (bz = 0; bz < nbz; bz++)
+	{
+		struct block_description *block = get_block_description(bz);
+
+		int node = block->mpi_node;
+
+		/* Main blocks */
+		if (node == rank)
+		{
+			unsigned size_bz = block_sizes_z[bz];
+#ifdef LIFE
+			unsigned x, y, z;
+			unsigned sum = 0;
+			for (x = 0; x < sizex; x++)
+				for (y = 0; y < sizey; y++)
+					for (z = 0; z < size_bz; z++)
+						sum += block->layers[0][(K+x)+(K+y)*(sizex + 2*K)+(K+z)*(sizex+2*K)*(sizey+2*K)];
+			printf("block %u got %u/%u alive\n", bz, sum, sizex*sizey*size_bz);
+#endif
+		}
+	}
+}

+ 769 - 0
examples/stencil/implicit-stencil-kernels.c

@@ -0,0 +1,769 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2015  Université de Bordeaux
+ * Copyright (C) 2012, 2013, 2016  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "implicit-stencil.h"
+
+/* Computation Kernels */
+
+/*
+ * There are three codeletets:
+ *
+ * - cl_update, which takes a block and the boundaries of its neighbours, loads
+ *   the boundaries into the block and perform some update loops:
+ *
+ *     comp. buffer      save. buffers        comp. buffer        save. buffers        comp. buffer
+ *   |     ...     |                                                                                      
+ *   |             | +------------------+ +------------------+                                            
+ *   |     #N+1    | | #N+1 bottom copy====>#N+1 bottom copy |                                            
+ *   +-------------+ +------------------+ +------------------+                                            
+ *   | #N top copy | |   #N top copy    | |                  |                                            
+ *   +-------------+ +------------------+ |                  |                                            
+ *                                        | #N               |                                            
+ *                                                 ...                                                    
+ *                                        |                  | +----------------+ +----------------------+
+ *                                        |                  | | #N bottom copy | | block #N bottom copy |
+ * ^                                      +------------------+ +----------------+ +----------------------+
+ * |                                      | #N-1 top copy   <====#N-1 top copy  | |  block #N-1          |
+ * |                                      +------------------+ +----------------+ |                      |
+ * Z                                                                                     ...
+ *
+ * - save_cl_top, which take a block and its top boundary, and saves the top of
+ *   the block into the boundary (to be given as bottom of the neighbour above
+ *   this block).
+ *
+ *     comp. buffer      save. buffers        comp. buffer        save. buffers        comp. buffer
+ *   |     ...     |                                                                                      
+ *   |             | +------------------+ +------------------+                                            
+ *   |     #N+1    | | #N+1 bottom copy | | #N+1 bottom copy |                                            
+ *   +-------------+ +------------------+ +------------------+                                            
+ *   | #N top copy | |   #N top copy   <====                 |                                            
+ *   +-------------+ +------------------+ |..................|                                            
+ *                                        | #N               |                                            
+ *                                                 ...                                                    
+ *                                        |                  | +----------------+ +----------------------+
+ *                                        |                  | | #N bottom copy | | block #N bottom copy |
+ * ^                                      +------------------+ +----------------+ +----------------------+
+ * |                                      | #N-1 top copy    | | #N-1 top copy  | |  block #N-1          |
+ * |                                      +------------------+ +----------------+ |                      |
+ * Z                                                                                     ...
+ *
+ * - save_cl_bottom, same for the bottom
+ *     comp. buffer      save. buffers        comp. buffer        save. buffers        comp. buffer
+ *   |     ...     |                                                                                      
+ *   |             | +------------------+ +------------------+                                            
+ *   |     #N+1    | | #N+1 bottom copy | | #N+1 bottom copy |                                            
+ *   +-------------+ +------------------+ +------------------+                                            
+ *   | #N top copy | |   #N top copy    | |                  |                                            
+ *   +-------------+ +------------------+ |                  |                                            
+ *                                        | #N               |                                            
+ *                                                 ...                                                    
+ *                                        |..................| +----------------+ +----------------------+
+ *                                        |                 ====>#N bottom copy | | block #N bottom copy |
+ * ^                                      +------------------+ +----------------+ +----------------------+
+ * |                                      | #N-1 top copy    | | #N-1 top copy  | |  block #N-1          |
+ * |                                      +------------------+ +----------------+ |                      |
+ * Z                                                                                     ...
+ *
+ * The idea is that the computation buffers thus don't have to move, only their
+ * boundaries are copied to buffers that do move (be it CPU/GPU, GPU/GPU or via
+ * MPI)
+ *
+ * For each of the buffers above, there are two (0/1) buffers to make new/old switch costless.
+ */
+
+#if 0
+# define DEBUG(fmt, ...) fprintf(stderr,fmt,##__VA_ARGS__)
+#else
+# define DEBUG(fmt, ...) (void) 0
+#endif
+
+/* Record which GPU ran which block, for nice pictures */
+int who_runs_what_len;
+int *who_runs_what;
+int *who_runs_what_index;
+double *last_tick;
+
+/* Achieved iterations */
+static int achieved_iter;
+
+/* Record how many updates each worker performed */
+unsigned update_per_worker[STARPU_NMAXWORKERS];
+
+static void record_who_runs_what(struct block_description *block)
+{
+	double now, now2, diff, delta = get_ticks() * 1000;
+	int workerid = starpu_worker_get_id_check();
+
+	now = starpu_timing_now();
+	now2 = now - start;
+	diff = now2 - last_tick[block->bz];
+	while (diff >= delta)
+	{
+		last_tick[block->bz] += delta;
+		diff = now2 - last_tick[block->bz];
+		if (who_runs_what_index[block->bz] < who_runs_what_len)
+			who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1;
+	}
+
+	if (who_runs_what_index[block->bz] < who_runs_what_len)
+		who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);
+}
+
+static void check_load(struct starpu_block_interface *block, struct starpu_block_interface *boundary)
+{
+	/* Sanity checks */
+	STARPU_ASSERT(block->nx == boundary->nx);
+	STARPU_ASSERT(block->ny == boundary->ny);
+	STARPU_ASSERT(boundary->nz == K);
+
+	/* NB: this is not fully garanteed ... but it's *very* likely and that
+	 * makes our life much simpler */
+	STARPU_ASSERT(block->ldy == boundary->ldy);
+	STARPU_ASSERT(block->ldz == boundary->ldz);
+}
+
+/*
+ * Load a neighbour's boundary into block, CPU version
+ */
+static void load_subblock_from_buffer_cpu(void *_block,
+					void *_boundary,
+					unsigned firstz)
+{
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	TYPE *block_data = (TYPE *)block->ptr;
+	TYPE *boundary_data = (TYPE *)boundary->ptr;
+	memcpy(&block_data[offset], boundary_data, boundary_size);
+}
+
+/*
+ * Load a neighbour's boundary into block, CUDA version
+ */
+#ifdef STARPU_USE_CUDA
+static void load_subblock_from_buffer_cuda(void *_block,
+					void *_boundary,
+					unsigned firstz)
+{
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	TYPE *block_data = (TYPE *)block->ptr;
+	TYPE *boundary_data = (TYPE *)boundary->ptr;
+	cudaMemcpyAsync(&block_data[offset], boundary_data, boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+}
+
+/*
+ * cl_update (CUDA version)
+ */
+static void update_func_cuda(void *descr[], void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+	if (block->bz == 0)
+		FPRINTF(stderr,"!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);
+	else
+		DEBUG( "!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int rank = 0;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	DEBUG( "!!!           RANK %d              !!!\n", rank);
+#endif
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+
+	unsigned block_size_z = get_block_size(block->bz);
+	unsigned i;
+	update_per_worker[workerid]++;
+
+	record_who_runs_what(block);
+
+	/*
+	 *	Load neighbours' boundaries : TOP
+	 */
+
+	/* The offset along the z axis is (block_size_z + K) */
+	load_subblock_from_buffer_cuda(descr[0], descr[2], block_size_z+K);
+	load_subblock_from_buffer_cuda(descr[1], descr[3], block_size_z+K);
+
+	/*
+	 *	Load neighbours' boundaries : BOTTOM
+	 */
+	load_subblock_from_buffer_cuda(descr[0], descr[4], 0);
+	load_subblock_from_buffer_cuda(descr[1], descr[5], 0);
+
+	/*
+	 *	Stencils ... do the actual work here :) TODO
+	 */
+
+	for (i=1; i<=K; i++)
+	{
+		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
+		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;
+
+		/* Shadow data */
+		cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+
+		/* And perform actual computation */
+#ifdef LIFE
+		cuda_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+#else
+		cudaMemcpyAsync(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+#endif /* LIFE */
+	}
+
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
+}
+#endif /* STARPU_USE_CUDA */
+
+/*
+ * Load a neighbour's boundary into block, OpenCL version
+ */
+#ifdef STARPU_USE_OPENCL
+static void load_subblock_from_buffer_opencl(struct starpu_block_interface *block,
+					struct starpu_block_interface *boundary,
+					unsigned firstz)
+{
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	cl_mem block_data = (cl_mem)block->dev_handle;
+	cl_mem boundary_data = (cl_mem)boundary->dev_handle;
+
+        cl_command_queue cq;
+        starpu_opencl_get_current_queue(&cq);
+        cl_int ret = clEnqueueCopyBuffer(cq, boundary_data, block_data, 0, offset, boundary_size, 0, NULL, NULL);
+	if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
+}
+
+/*
+ * cl_update (OpenCL version)
+ */
+static void update_func_opencl(void *descr[], void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+	if (block->bz == 0)
+		FPRINTF(stderr,"!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);
+	else
+		DEBUG( "!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int rank = 0;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	DEBUG( "!!!           RANK %d              !!!\n", rank);
+#endif
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+
+	unsigned block_size_z = get_block_size(block->bz);
+	unsigned i;
+	update_per_worker[workerid]++;
+
+	record_who_runs_what(block);
+
+        cl_command_queue cq;
+        starpu_opencl_get_current_queue(&cq);
+
+	/*
+	 *	Load neighbours' boundaries : TOP
+	 */
+
+	/* The offset along the z axis is (block_size_z + K) */
+	load_subblock_from_buffer_opencl(descr[0], descr[2], block_size_z+K);
+	load_subblock_from_buffer_opencl(descr[1], descr[3], block_size_z+K);
+
+	/*
+	 *	Load neighbours' boundaries : BOTTOM
+	 */
+	load_subblock_from_buffer_opencl(descr[0], descr[4], 0);
+	load_subblock_from_buffer_opencl(descr[1], descr[5], 0);
+
+	/*
+	 *	Stencils ... do the actual work here :) TODO
+	 */
+
+	for (i=1; i<=K; i++)
+	{
+		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
+		TYPE *old = (void*) oldb->dev_handle, *newer = (void*) newb->dev_handle;
+
+		/* Shadow data */
+		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+
+		/* And perform actual computation */
+#ifdef LIFE
+		opencl_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+#else
+		cl_event event;
+                cl_int ret = clEnqueueCopyBuffer(cq, old, newer, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), 0, NULL, &event);
+		if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
+
+#endif /* LIFE */
+	}
+
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
+}
+#endif /* STARPU_USE_OPENCL */
+
+/*
+ * cl_update (CPU version)
+ */
+void update_func_cpu(void *descr[], void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+	if (block->bz == 0)
+		DEBUG("!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);
+	else
+		DEBUG("!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int rank = 0;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	DEBUG( "!!!           RANK %d            !!!\n", rank);
+#endif
+	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+
+	unsigned block_size_z = get_block_size(block->bz);
+	unsigned i;
+	update_per_worker[workerid]++;
+
+	record_who_runs_what(block);
+
+	/*
+	 *	Load neighbours' boundaries : TOP
+	 */
+
+	/* The offset along the z axis is (block_size_z + K) */
+	load_subblock_from_buffer_cpu(descr[0], descr[2], block_size_z+K);
+	load_subblock_from_buffer_cpu(descr[1], descr[3], block_size_z+K);
+
+	/*
+	 *	Load neighbours' boundaries : BOTTOM
+	 */
+	load_subblock_from_buffer_cpu(descr[0], descr[4], 0);
+	load_subblock_from_buffer_cpu(descr[1], descr[5], 0);
+
+	/*
+	 *	Stencils ... do the actual work here :) TODO
+	 */
+
+	for (i=1; i<=K; i++)
+	{
+		struct starpu_block_interface *oldb = (struct starpu_block_interface *) descr[i%2], *newb = (struct starpu_block_interface *) descr[(i+1)%2];
+		TYPE *old = (TYPE*) oldb->ptr, *newer = (TYPE*) newb->ptr;
+
+		/* Shadow data */
+		unsigned ldy = oldb->ldy, ldz = oldb->ldz;
+		unsigned nx = oldb->nx, ny = oldb->ny, nz = oldb->nz;
+		unsigned x, y, z;
+		unsigned stepx = 1;
+		unsigned stepy = 1;
+		unsigned stepz = 1;
+		unsigned idx = 0;
+		unsigned idy = 0;
+		unsigned idz = 0;
+		TYPE *ptr = old;
+
+#		include "shadow.h"
+
+		/* And perform actual computation */
+#ifdef LIFE
+		life_update(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
+#else
+		memcpy(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer));
+#endif /* LIFE */
+	}
+
+	if (block->bz == 0)
+		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
+}
+
+/* Performance model and codelet structure */
+static struct starpu_perfmodel cl_update_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "cl_update" 
+};
+
+struct starpu_codelet cl_update =
+{
+	.cpu_funcs = {update_func_cpu},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {update_func_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {update_func_opencl},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
+#endif
+	.model = &cl_update_model,
+	.nbuffers = 6,
+	.modes = {STARPU_RW, STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
+};
+
+/*
+ * Save the block internal boundaries to give them to our neighbours.
+ */
+
+/* CPU version */
+static void load_subblock_into_buffer_cpu(void *_block,
+					void *_boundary,
+					unsigned firstz)
+{
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	TYPE *block_data = (TYPE *)block->ptr;
+	TYPE *boundary_data = (TYPE *)boundary->ptr;
+	memcpy(boundary_data, &block_data[offset], boundary_size);
+}
+
+/* CUDA version */
+#ifdef STARPU_USE_CUDA
+static void load_subblock_into_buffer_cuda(void *_block,
+					void *_boundary,
+					unsigned firstz)
+{
+	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
+	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	TYPE *block_data = (TYPE *)block->ptr;
+	TYPE *boundary_data = (TYPE *)boundary->ptr;
+	cudaMemcpyAsync(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+}
+#endif /* STARPU_USE_CUDA */
+
+/* OPENCL version */
+#ifdef STARPU_USE_OPENCL
+static void load_subblock_into_buffer_opencl(struct starpu_block_interface *block,
+					struct starpu_block_interface *boundary,
+					unsigned firstz)
+{
+	check_load(block, boundary);
+
+	/* We do a contiguous memory transfer */
+	size_t boundary_size = K*block->ldz*block->elemsize;
+
+	unsigned offset = firstz*block->ldz;
+	cl_mem block_data = (cl_mem)block->dev_handle;
+	cl_mem boundary_data = (cl_mem)boundary->dev_handle;
+
+        cl_command_queue cq;
+        starpu_opencl_get_current_queue(&cq);
+
+        cl_int ret = clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, NULL);
+	if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
+}
+#endif /* STARPU_USE_OPENCL */
+
+/* Record how many top/bottom saves each worker performed */
+unsigned top_per_worker[STARPU_NMAXWORKERS];
+unsigned bottom_per_worker[STARPU_NMAXWORKERS];
+
+/* top save, CPU version */
+void dummy_func_top_cpu(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	top_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Bottom block %d\n", block->bz);
+
+	/* The offset along the z axis is (block_size_z + K)- K */
+	unsigned block_size_z = get_block_size(block->bz);
+
+	load_subblock_into_buffer_cpu(descr[0], descr[2], block_size_z);
+	load_subblock_into_buffer_cpu(descr[1], descr[3], block_size_z);
+}
+
+/* bottom save, CPU version */
+void dummy_func_bottom_cpu(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+    STARPU_ASSERT(block);
+
+	int workerid = starpu_worker_get_id_check();
+	bottom_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Top block %d\n", block->bz);
+
+	load_subblock_into_buffer_cpu(descr[0], descr[2], K);
+	load_subblock_into_buffer_cpu(descr[1], descr[3], K);
+}
+
+/* top save, CUDA version */
+#ifdef STARPU_USE_CUDA
+static void dummy_func_top_cuda(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	top_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Top block %d\n", block->bz);
+
+	/* The offset along the z axis is (block_size_z + K)- K */
+	unsigned block_size_z = get_block_size(block->bz);
+
+	load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
+	load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
+}
+
+/* bottom save, CUDA version */
+static void dummy_func_bottom_cuda(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+	unsigned z;
+	starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	bottom_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Bottom block %d on CUDA\n", block->bz);
+
+	load_subblock_into_buffer_cuda(descr[0], descr[2], K);
+	load_subblock_into_buffer_cuda(descr[1], descr[3], K);
+}
+#endif /* STARPU_USE_CUDA */
+
+/* top save, OpenCL version */
+#ifdef STARPU_USE_OPENCL
+static void dummy_func_top_opencl(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	top_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Top block %d\n", block->bz);
+
+	/* The offset along the z axis is (block_size_z + K)- K */
+	unsigned block_size_z = get_block_size(block->bz);
+
+	load_subblock_into_buffer_opencl(descr[0], descr[2], block_size_z);
+	load_subblock_into_buffer_opencl(descr[1], descr[3], block_size_z);
+}
+
+/* bottom save, OPENCL version */
+static void dummy_func_bottom_opencl(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned z;
+    starpu_codelet_unpack_args(arg, &z);
+	struct block_description *block = get_block_description(z);
+
+	int workerid = starpu_worker_get_id_check();
+	bottom_per_worker[workerid]++;
+
+	DEBUG( "DO SAVE Bottom block %d on OPENCL\n", block->bz);
+
+	load_subblock_into_buffer_opencl(descr[0], descr[2], K);
+	load_subblock_into_buffer_opencl(descr[1], descr[3], K);
+}
+#endif /* STARPU_USE_OPENCL */
+
+/* Performance models and codelet for save */
+static struct starpu_perfmodel save_cl_bottom_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "save_cl_bottom" 
+};
+
+static struct starpu_perfmodel save_cl_top_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "save_cl_top" 
+};
+
+struct starpu_codelet save_cl_bottom =
+{
+	.cpu_funcs = {dummy_func_bottom_cpu},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {dummy_func_bottom_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {dummy_func_bottom_opencl},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
+#endif
+	.model = &save_cl_bottom_model,
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
+};
+
+struct starpu_codelet save_cl_top =
+{
+	.cpu_funcs = {dummy_func_top_cpu},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {dummy_func_top_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {dummy_func_top_opencl},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
+#endif
+	.model = &save_cl_top_model,
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
+};
+
+/* Memset a block's buffers */
+static void memset_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned sizex, sizey, bz;
+    starpu_codelet_unpack_args(arg, &sizex, &sizey, &bz);
+	struct block_description *block = get_block_description(bz);
+    unsigned size_bz = get_block_size(bz);
+
+    unsigned x,y,z;
+    for (x = 0; x < sizex + 2*K; x++)
+    {
+        for (y = 0; y < sizey + 2*K; y++)
+        {
+            /* Main blocks */
+            for (z = 0; z < size_bz + 2*K; z++)
+            {
+                block->layers[0][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+                block->layers[1][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+            }
+            for (z = 0; z < K; z++)
+            {
+                /* Boundary blocks : Top */
+                block->boundaries[T][0][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+                block->boundaries[T][1][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+
+                /* Boundary blocks : Bottom */
+                block->boundaries[B][0][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+                block->boundaries[B][1][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0; 
+            }
+
+        }
+    }
+    //memset(block->layers[0], 0, (sizex + 2*K)*(sizey + 2*K)*(size_bz + 2*K)*sizeof(block->layers[0]));
+    //memset(block->layers[1], 0, (sizex + 2*K)*(sizey + 2*K)*(size_bz + 2*K)*sizeof(block->layers[1]));
+
+    //memset(block->boundaries[T][0], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[T][0]));
+    //memset(block->boundaries[T][1], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[T][1]));
+
+    //memset(block->boundaries[B][0], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[B][0]));
+    //memset(block->boundaries[B][1], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[B][1]));
+}
+
+static double memset_cost_function(struct starpu_task *task, unsigned nimpl)
+{
+	(void) task;
+	(void) nimpl;
+	return 0.000001;
+}
+
+static struct starpu_perfmodel memset_model =
+{
+	.type = STARPU_COMMON,
+	.cost_function = memset_cost_function,
+	.symbol = "memset"
+};
+
+struct starpu_codelet cl_memset =
+{
+	.cpu_funcs = {memset_func},
+	.model = &memset_model,
+	.nbuffers = 6,
+	.modes = {STARPU_W, STARPU_W, STARPU_W, STARPU_W, STARPU_W, STARPU_W}
+};
+
+/* Initialize a block's layer */
+static void initlayer_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+    unsigned sizex, sizey, bz;
+    starpu_codelet_unpack_args(arg, &sizex, &sizey, &bz);
+	struct block_description *block = get_block_description(bz);
+    unsigned size_bz = get_block_size(bz);
+
+    /* Initialize layer with some random data */
+    unsigned x, y, z;
+    unsigned sum = 0;
+    for (x = 0; x < sizex; x++)
+        for (y = 0; y < sizey; y++)
+            for (z = 0; z < size_bz; z++)
+                sum += block->layers[0][(K+x)+(K+y)*(sizex + 2*K)+(K+z)*(sizex+2*K)*(sizey+2*K)] = (int)((x/7.+y/13.+(bz*size_bz + z)/17.) * 10.) % 2;
+}
+
+static double initlayer_cost_function(struct starpu_task *task, unsigned nimpl)
+{
+	(void) task;
+	(void) nimpl;
+	return 0.000001;
+}
+
+static struct starpu_perfmodel initlayer_model =
+{
+	.type = STARPU_COMMON,
+	.cost_function = initlayer_cost_function,
+	.symbol = "initlayer"
+};
+
+struct starpu_codelet cl_initlayer =
+{
+	.cpu_funcs = {initlayer_func},
+	.model = &initlayer_model,
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+

+ 204 - 0
examples/stencil/implicit-stencil-tasks.c

@@ -0,0 +1,204 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013-2015  Université de Bordeaux
+ * Copyright (C) 2012, 2013, 2015, 2016  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "implicit-stencil.h"
+
+#define BIND_LAST 1
+
+/*
+ * Schedule tasks for updates and saves
+ */
+
+/*
+ * NB: iter = 0: initialization phase, TAG_U(z, 0) = TAG_INIT
+ *
+ * dir is -1 or +1.
+ */
+
+#if 0
+# define DEBUG(fmt, ...) fprintf(stderr,fmt,##__VA_ARGS__)
+#else
+# define DEBUG(fmt, ...)
+#endif
+
+#ifdef STARPU_USE_MPI
+#include <starpu_mpi.h>
+#define starpu_insert_task(...) starpu_mpi_insert_task(MPI_COMM_WORLD, __VA_ARGS__)
+#endif
+
+/*
+ * Schedule initialization tasks
+ */
+
+void create_task_memset(unsigned sizex, unsigned sizey, unsigned z)
+{
+	struct block_description *descr = get_block_description(z);
+    struct starpu_codelet *codelet = &cl_memset;
+
+    int ret = starpu_insert_task(
+            codelet,
+            STARPU_VALUE,   &sizex,  sizeof(unsigned),
+            STARPU_VALUE,   &sizey,  sizeof(unsigned),
+            STARPU_VALUE,   &z,  sizeof(unsigned),
+            STARPU_W,   descr->layers_handle[0],
+            STARPU_W,   descr->layers_handle[1],
+            STARPU_W,   descr->boundaries_handle[T][0],
+            STARPU_W,   descr->boundaries_handle[T][1],
+            STARPU_W,   descr->boundaries_handle[B][0],
+            STARPU_W,   descr->boundaries_handle[B][1],
+                0);
+
+    if (ret)
+    {
+        FPRINTF(stderr, "Could not submit task save: %d\n", ret);
+        if (ret == -ENODEV)
+            exit(77);
+        STARPU_ABORT();
+    }
+}
+
+void create_task_initlayer(unsigned sizex, unsigned sizey, unsigned z)
+{
+	struct block_description *descr = get_block_description(z);
+    struct starpu_codelet *codelet = &cl_initlayer;
+
+    int ret = starpu_insert_task(
+            codelet,
+            STARPU_VALUE,   &sizex,  sizeof(unsigned),
+            STARPU_VALUE,   &sizey,  sizeof(unsigned),
+            STARPU_VALUE,   &z,  sizeof(unsigned),
+            STARPU_W,   descr->layers_handle[0],
+                0);
+
+    if (ret)
+    {
+        FPRINTF(stderr, "Could not submit task save: %d\n", ret);
+        if (ret == -ENODEV)
+            exit(77);
+        STARPU_ABORT();
+    }
+}
+
+/*
+ * Schedule saving boundaries of blocks to communication buffers
+ */
+
+static void create_task_save_local(unsigned iter, unsigned z, int dir, int local_rank)
+{
+	struct block_description *descr = get_block_description(z);
+	struct starpu_codelet *codelet;
+	int ret;
+
+	codelet = (dir == -1)?&save_cl_bottom:&save_cl_top;
+	ret = starpu_insert_task(
+				 codelet,
+				 STARPU_VALUE,   &z,  sizeof(unsigned),
+				 STARPU_R,   descr->layers_handle[0],
+				 STARPU_R,   descr->layers_handle[1],
+				 STARPU_W,   descr->boundaries_handle[(1-dir)/2][0],
+				 STARPU_W,   descr->boundaries_handle[(1-dir)/2][1],
+				 STARPU_PRIORITY,    STARPU_MAX_PRIO,
+				 0);
+
+	if (ret)
+	{
+		FPRINTF(stderr, "Could not submit task save: %d\n", ret);
+		if (ret == -ENODEV)
+			exit(77);
+		STARPU_ABORT();
+	}
+}
+
+/*
+ * Schedule update computation in computation buffer
+ */
+
+void create_task_update(unsigned iter, unsigned z, int local_rank)
+{
+	STARPU_ASSERT(iter != 0);
+
+	unsigned old_layer = (K*(iter-1)) % 2;
+	unsigned new_layer = (old_layer + 1) % 2;
+
+	struct block_description *descr = get_block_description(z);
+	struct block_description *bottom_neighbour = descr->boundary_blocks[B];
+	struct block_description *top_neighbour = descr->boundary_blocks[T];
+
+	struct starpu_codelet *codelet = &cl_update;
+
+    // Simple-level prio
+    //int prio = ((bottom_neighbour->mpi_node != local_rank) || (top_neighbour->mpi_node != local_rank )) ? STARPU_MAX_PRIO : STARPU_DEFAULT_PRIO;
+
+    // Two-level prio
+    int prio = ((bottom_neighbour->mpi_node != local_rank) || (top_neighbour->mpi_node != local_rank )) ? STARPU_MAX_PRIO :
+               ((bottom_neighbour->boundary_blocks[B]->mpi_node != local_rank) || (top_neighbour->boundary_blocks[T]->mpi_node != local_rank )) ? STARPU_MAX_PRIO-1 : STARPU_DEFAULT_PRIO;
+
+    int ret = starpu_insert_task(
+            codelet,
+            STARPU_VALUE,   &z,  sizeof(unsigned),
+	        STARPU_RW,      descr->layers_handle[old_layer],
+	        STARPU_RW,      descr->layers_handle[new_layer],
+	        STARPU_R,       bottom_neighbour->boundaries_handle[T][old_layer],
+	        STARPU_R,       bottom_neighbour->boundaries_handle[T][new_layer],
+	        STARPU_R,       top_neighbour->boundaries_handle[B][old_layer],
+	        STARPU_R,       top_neighbour->boundaries_handle[B][new_layer],
+            STARPU_PRIORITY,    prio,
+                0);
+	if (ret)
+	{
+		FPRINTF(stderr, "Could not submit task update block: %d\n", ret);
+		if (ret == -ENODEV)
+			exit(77);
+		STARPU_ABORT();
+	}
+}
+
+/*
+ * Create all the tasks
+ */
+void create_tasks(int rank)
+{
+	int iter;
+	int bz;
+	int niter = get_niter();
+	int nbz = get_nbz();
+
+	for (iter = 0; iter <= niter; iter++)
+	{
+	     for (bz = 0; bz < nbz; bz++)
+	     {
+		    if ((iter > 0) && ((get_block_mpi_node(bz) == rank)|| (get_block_mpi_node(bz+1) == rank)|| (get_block_mpi_node(bz-1) == rank)))
+			    create_task_update(iter, bz, rank);
+	     }
+
+	     for (bz = 0; bz < nbz; bz++)
+	     {
+		     if (iter != niter)
+		     {
+			     int node_z = get_block_mpi_node(bz);
+			     int node_z_and_b = get_block_mpi_node(bz-1);
+			     int node_z_and_t = get_block_mpi_node(bz+1);
+
+			     if ((node_z == rank) || ((node_z != node_z_and_b) && (node_z_and_b == rank)))
+				     create_task_save_local(iter, bz, +1, rank);
+
+			     if ((node_z == rank) || ((node_z != node_z_and_t) && (node_z_and_t == rank)))
+				     create_task_save_local(iter, bz, -1, rank);
+		     }
+	     }
+	}
+}

+ 387 - 0
examples/stencil/implicit-stencil.c

@@ -0,0 +1,387 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "implicit-stencil.h"
+
+/* Main application */
+
+/* default parameter values */
+static unsigned  bind_tasks = 0;
+
+static unsigned ticks = 1000;
+
+#ifdef STARPU_QUICK_CHECK
+static unsigned niter = 4;
+#define SIZE 16
+#else
+static unsigned niter = 32;
+#define SIZE 128
+#endif
+
+/* Problem size */
+static unsigned sizex = SIZE;
+static unsigned sizey = SIZE;
+static unsigned sizez = 64*SIZE;
+
+/* Number of blocks (scattered over the different MPI processes) */
+unsigned nbz = 64;
+
+/* StarPU top variables */
+struct starpu_top_data* starpu_top_init_loop;
+struct starpu_top_data* starpu_top_achieved_loop;
+
+double start;
+double begin, end;
+double timing; 
+
+/*
+ *	Initialization
+ */
+
+unsigned get_bind_tasks(void)
+{
+	return bind_tasks;
+}
+
+unsigned get_nbz(void)
+{
+	return nbz;
+}
+
+unsigned get_niter(void)
+{
+	return niter;
+}
+
+unsigned get_ticks(void)
+{
+	return ticks;
+}
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-b") == 0)
+		{
+			bind_tasks = 1;
+		}
+
+		if (strcmp(argv[i], "-nbz") == 0)
+		{
+			nbz = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-sizex") == 0)
+		{
+			sizex = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-sizey") == 0)
+		{
+			sizey = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-sizez") == 0)
+		{
+			sizez = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-niter") == 0)
+		{
+			niter = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-ticks") == 0)
+		{
+			ticks = atoi(argv[++i]);
+		}
+
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
+		{
+			 fprintf(stderr, "Usage : %s [options...]\n", argv[0]);
+			 fprintf(stderr, "\n");
+			 fprintf(stderr, "Options:\n");
+			 fprintf(stderr, "-b			bind tasks on CPUs/GPUs\n");
+			 fprintf(stderr, "-nbz <n>		Number of blocks on Z axis (%u by default)\n", nbz);
+			 fprintf(stderr, "-size[xyz] <size>	Domain size on x/y/z axis (%ux%ux%u by default)\n", sizex, sizey, sizez);
+			 fprintf(stderr, "-niter <n>		Number of iterations (%u by default)\n", niter);
+			 fprintf(stderr, "-ticks <t>		How often to put ticks in the output (ms, %u by default)\n", ticks);
+			 exit(0);
+		}
+	}
+}
+
+static void init_problem(int argc, char **argv, int rank, int world_size)
+{
+	parse_args(argc, argv);
+
+	if (getenv("STARPU_TOP"))
+	{
+		starpu_top_init_loop = starpu_top_add_data_integer("Task creation iter", 0, niter, 1);
+		starpu_top_achieved_loop = starpu_top_add_data_integer("Task achieved iter", 0, niter, 1);
+		starpu_top_init_and_wait("stencil_top example");
+	}
+	create_blocks_array(sizex, sizey, sizez, nbz);
+
+	/* Select the MPI process which should compute the different blocks */
+	assign_blocks_to_mpi_nodes(world_size);
+
+	assign_blocks_to_workers(rank);
+
+	/* Allocate the different memory blocks, if used by the MPI process */
+	start = starpu_timing_now();
+
+	allocate_memory_on_node(rank);
+
+	end = starpu_timing_now();
+	timing = end - begin;
+
+	display_memory_consumption(rank, timing);
+
+	who_runs_what_len = 2*niter;
+	who_runs_what = (int *) calloc(nbz * who_runs_what_len, sizeof(*who_runs_what));
+	who_runs_what_index = (int *) calloc(nbz, sizeof(*who_runs_what_index));
+	last_tick = (double *) calloc(nbz, sizeof(*last_tick));
+}
+
+static void free_problem(int rank)
+{
+    free_memory_on_node(rank);
+	free_blocks_array();
+	free(who_runs_what);
+	free(who_runs_what_index);
+	free(last_tick);
+}
+
+/*
+ *	Main body
+ */
+
+void f(unsigned task_per_worker[STARPU_NMAXWORKERS])
+{
+	unsigned total = 0;
+	int worker;
+
+	for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
+		total += task_per_worker[worker];
+	for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
+	{
+		if (task_per_worker[worker])
+		{
+			char name[32];
+			starpu_worker_get_name(worker, name, sizeof(name));
+			FPRINTF(stderr,"\t%s -> %u (%2.2f%%)\n", name, task_per_worker[worker], (100.0*task_per_worker[worker])/total);
+		}
+	}
+}
+
+unsigned global_workerid(unsigned local_workerid)
+{
+#ifdef STARPU_USE_MPI
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	unsigned workers_per_node = starpu_worker_get_count();
+
+	return (local_workerid + rank*workers_per_node);
+#else
+	return local_workerid;
+#endif
+}
+
+int main(int argc, char **argv)
+{
+	int rank;
+	int world_size;
+	int ret;
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int thread_support;
+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support))
+	{
+		FPRINTF(stderr, "MPI_Init_thread failed\n");
+	}
+	if (thread_support == MPI_THREAD_FUNNELED)
+		FPRINTF(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
+	if (thread_support < MPI_THREAD_FUNNELED)
+		FPRINTF(stderr,"Warning: MPI does not have thread support!\n");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+#else
+	rank = 0;
+	world_size = 1;
+#endif
+
+	if (rank == 0)
+	{
+		FPRINTF(stderr, "Running on %d nodes\n", world_size);
+		fflush(stderr);
+	}
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+#endif
+
+#ifdef STARPU_USE_OPENCL
+        opencl_life_init();
+        opencl_shadow_init();
+#endif /*STARPU_USE_OPENCL*/
+
+	init_problem(argc, argv, rank, world_size);
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+#endif
+	if (rank == 0)
+		FPRINTF(stderr, "GO !\n");
+
+	start = starpu_timing_now();
+
+	begin = starpu_timing_now();
+
+	create_tasks(rank);
+
+	//starpu_tag_notify_from_apps(TAG_INIT_TASK);
+
+	//wait_end_tasks(rank);
+
+    starpu_task_wait_for_all();
+
+	end = starpu_timing_now();
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+#endif
+
+#if 0
+	check(rank);
+#endif
+
+	/*display_debug(nbz, niter, rank);*/
+
+	/* timing in us */
+	timing = end - begin;
+
+	double min_timing = timing;
+	double max_timing = timing;
+	double sum_timing = timing;
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	int reduce_ret;
+
+	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	reduce_ret = MPI_Reduce(&timing, &sum_timing, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	/* XXX we should do a gather instead, here we assume that non initialized values are still 0 */
+	int *who_runs_what_tmp = malloc(nbz * who_runs_what_len * sizeof(*who_runs_what));
+	reduce_ret = MPI_Reduce(who_runs_what, who_runs_what_tmp, nbz * who_runs_what_len, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	memcpy(who_runs_what, who_runs_what_tmp, nbz * who_runs_what_len * sizeof(*who_runs_what));
+	free(who_runs_what_tmp);
+
+	/* XXX we should do a gather instead, here we assume that non initialized values are still 0 */
+	int *who_runs_what_index_tmp = malloc(nbz * sizeof(*who_runs_what_index));
+	reduce_ret = MPI_Reduce(who_runs_what_index, who_runs_what_index_tmp, nbz, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	memcpy(who_runs_what_index, who_runs_what_index_tmp, nbz * sizeof(*who_runs_what_index));
+	free(who_runs_what_index_tmp);
+#endif
+
+	if (rank == 0)
+	{
+#if 1 
+		FPRINTF(stderr, "update:\n");
+		f(update_per_worker);
+		FPRINTF(stderr, "top:\n");
+		f(top_per_worker);
+		FPRINTF(stderr, "bottom:\n");
+		f(bottom_per_worker);
+#endif
+#if 1
+		unsigned nzblocks_per_process = (nbz + world_size - 1) / world_size;
+
+		int iter;
+		for (iter = 0; iter < who_runs_what_len; iter++)
+		{
+			unsigned last, bz;
+			last = 1;
+			for (bz = 0; bz < nbz; bz++)
+			{
+				if ((bz % nzblocks_per_process) == 0)
+					FPRINTF(stderr, "| ");
+
+				if (who_runs_what_index[bz] <= iter)
+					FPRINTF(stderr,"_ ");
+				else
+				{
+					last = 0;
+					if (who_runs_what[bz + iter * nbz] == -1)
+						FPRINTF(stderr,"* ");
+					else
+						FPRINTF(stderr, "%d ", who_runs_what[bz + iter * nbz]);
+				}
+			}
+			FPRINTF(stderr, "\n");
+
+			if (last)
+				break;
+		}
+#endif
+
+		fflush(stderr);
+
+		FPRINTF(stdout, "Computation took: %f ms on %d MPI processes\n", max_timing/1000, world_size);
+		FPRINTF(stdout, "\tMIN : %f ms\n", min_timing/1000);
+		FPRINTF(stdout, "\tMAX : %f ms\n", max_timing/1000);
+		FPRINTF(stdout, "\tAVG : %f ms\n", sum_timing/(world_size*1000));
+	}
+
+	free_problem(rank);
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	starpu_mpi_shutdown();
+#endif
+
+	starpu_shutdown();
+
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+	MPI_Finalize();
+#endif
+
+#ifdef STARPU_USE_OPENCL
+        opencl_life_free();
+        opencl_shadow_free();
+#endif /*STARPU_USE_OPENCL*/
+
+	return 0;
+}

+ 157 - 0
examples/stencil/implicit-stencil.h

@@ -0,0 +1,157 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010-2011, 2014  Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __IMPLICIT_STENCIL_H__
+#define __IMPLICIT_STENCIL_H__
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <starpu.h>
+
+#ifndef __CUDACC__
+#ifdef STARPU_USE_MPI
+#include <mpi.h>
+#include <starpu_mpi.h>
+#endif
+#endif
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+#define LIFE
+
+#ifdef LIFE
+#define TYPE	unsigned char
+extern void life_update(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter);
+#else
+#define TYPE	float
+#endif
+
+#define K	1
+
+#define NDIRS 2
+extern struct starpu_top_data* starpu_top_init_loop;
+extern struct starpu_top_data* starpu_top_achieved_loop;
+
+
+/* Split only on the z axis to make things simple */
+typedef enum
+{
+	B = 0,
+	T = 1
+} direction;
+
+/* Description of a domain block */
+struct block_description
+{
+	/* Which MPI node should process that block ? */
+	int mpi_node;
+
+	unsigned preferred_worker;
+
+	unsigned bz;
+
+
+	/* For each of the following buffers, there are two (0/1) buffers to
+	 * make new/old switch costless. */
+
+	/* This is the computation buffer for this block, it includes
+	 * neighbours' border to make computation easier */
+	TYPE *layers[2];
+	starpu_data_handle_t layers_handle[2];
+
+	/* This is the "save" buffer, i.e. a copy of our neighbour's border.
+	 * This one is used for CPU/GPU or MPI communication (rather than the
+	 * whole domain block) */
+	TYPE *boundaries[NDIRS][2];
+	starpu_data_handle_t boundaries_handle[NDIRS][2];
+
+	/* Shortcut pointer to the neighbours */
+	struct block_description *boundary_blocks[NDIRS];
+};
+
+#define TAG_INIT_TASK			((starpu_tag_t)1)
+
+starpu_tag_t TAG_FINISH(int z);
+starpu_tag_t TAG_START(int z, int dir);
+int MPI_TAG0(int z, int iter, int dir);
+int MPI_TAG1(int z, int iter, int dir);
+
+#define MIN(a,b)	((a)<(b)?(a):(b))
+
+void create_blocks_array(unsigned sizex, unsigned sizey, unsigned sizez, unsigned nbz);
+void free_blocks_array();
+struct block_description *get_block_description(int z);
+void assign_blocks_to_mpi_nodes(int world_size);
+void allocate_memory_on_node(int rank);
+void assign_blocks_to_workers(int rank);
+void create_tasks(int rank);
+void wait_end_tasks(int rank);
+void check(int rank);
+void free_memory_on_node(int rank);
+
+void display_memory_consumption(int rank, double time);
+
+int get_block_mpi_node(int z);
+unsigned get_block_size(int z);
+unsigned get_bind_tasks(void);
+
+unsigned get_nbz(void);
+unsigned get_niter(void);
+unsigned get_ticks(void);
+
+unsigned global_workerid(unsigned local_workerid);
+
+void create_task_memset(unsigned sizex, unsigned sizey, unsigned z);
+void create_task_initlayer(unsigned sizex, unsigned sizey, unsigned z);
+void create_task_update(unsigned iter, unsigned z, int local_rank);
+void create_task_save(unsigned iter, unsigned z, int dir, int local_rank);
+
+extern int starpu_mpi_initialize(void);
+extern int starpu_mpi_shutdown(void);
+
+/* kernels */
+extern struct starpu_codelet cl_update;
+extern struct starpu_codelet save_cl_bottom;
+extern struct starpu_codelet save_cl_top;
+extern struct starpu_codelet cl_memset;
+extern struct starpu_codelet cl_initlayer;
+
+extern unsigned update_per_worker[STARPU_NMAXWORKERS];
+extern unsigned top_per_worker[STARPU_NMAXWORKERS];
+extern unsigned bottom_per_worker[STARPU_NMAXWORKERS];
+
+extern double start;
+extern int who_runs_what_len;
+extern int *who_runs_what;
+extern int *who_runs_what_index;
+extern double *last_tick;
+
+#ifndef _externC
+#define _externC
+#endif
+
+_externC void cuda_life_update_host(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter);
+_externC void cuda_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i);
+
+_externC void opencl_shadow_init(void);
+_externC void opencl_shadow_free(void);
+_externC void opencl_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i);
+_externC void opencl_life_init(void);
+_externC void opencl_life_free(void);
+_externC void opencl_life_update_host(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter);
+
+#endif /* __IMPLICIT_STENCIL_H__ */

+ 1 - 1
examples/stencil/stencil-blocks.c

@@ -283,7 +283,7 @@ static void free_block_on_node(starpu_data_handle_t handleptr, unsigned nx, unsi
 
 void display_memory_consumption(int rank)
 {
-	FPRINTF(stderr, "%lu B of memory were allocated on node %d\n", allocated, rank);
+	FPRINTF(stderr, "%lu B of memory were allocated on node %d\n", (unsigned long) allocated, rank);
 }
 
 void allocate_memory_on_node(int rank)

+ 5 - 5
examples/worker_collections/worker_tree_example.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2016  Université de Bordeaux
- * Copyright (C) 2010-2015  CNRS
+ * Copyright (C) 2010-2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -54,7 +54,7 @@ int main()
 	co->init_iterator = worker_tree.init_iterator;
 	co->type = STARPU_WORKER_TREE;
 
-	FPRINTF(stderr, "ncpus %d \n", ncpus);
+	FPRINTF(stderr, "ncpus %u \n", ncpus);
 
 	double start_time;
 	double end_time;
@@ -81,7 +81,7 @@ int main()
 	while(co->has_next(co, &it))
 	{
 		pu = co->get_next(co, &it);
-		FPRINTF(stderr, "pu = %d out of %d workers \n", pu, co->nworkers);
+		FPRINTF(stderr, "pu = %d out of %u workers \n", pu, co->nworkers);
 	}
 
 	unsigned six = 6;
@@ -90,13 +90,13 @@ int main()
 	for(i = 0; i < six; i++)
 	{
 		co->remove(co, i);
-		FPRINTF(stderr, "remove %d out of %d workers\n", i, co->nworkers);
+		FPRINTF(stderr, "remove %u out of %u workers\n", i, co->nworkers);
 	}
 
 	while(co->has_next(co, &it))
 	{
 		pu = co->get_next(co, &it);
-		FPRINTF(stderr, "pu = %d out of %d workers \n", pu, co->nworkers);
+		FPRINTF(stderr, "pu = %d out of %u workers \n", pu, co->nworkers);
 	}
 
 	FPRINTF(stderr, "timing init = %lf \n", timing);

+ 1 - 1
include/starpu.h

@@ -83,7 +83,7 @@ struct starpu_conf
 
 	const char *sched_policy_name;
 	struct starpu_sched_policy *sched_policy;
-	void (*sched_policy_init)(void);
+	void (*sched_policy_init)(unsigned);
 
 	int ncpus;
 	int ncuda;

+ 2 - 0
include/starpu_config.h.in

@@ -145,4 +145,6 @@ typedef ssize_t starpu_ssize_t;
 
 #undef STARPU_HAVE_DARWIN
 
+#undef STARPU_HAVE_CXX11
+
 #endif

+ 4 - 0
include/starpu_data.h

@@ -61,6 +61,8 @@ void starpu_data_invalidate_submit(starpu_data_handle_t handle);
 
 void starpu_data_advise_as_important(starpu_data_handle_t handle, unsigned is_important);
 
+#define STARPU_ACQUIRE_NO_NODE -1
+#define STARPU_ACQUIRE_ALL_NODES -2
 int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_data_access_mode mode);
 int starpu_data_acquire_on_node(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode);
 int starpu_data_acquire_cb(starpu_data_handle_t handle, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg);
@@ -143,6 +145,8 @@ unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle_t handle, unsi
 
 void starpu_memchunk_tidy(unsigned memory_node);
 
+void starpu_data_set_user_data(starpu_data_handle_t handle, void* user_data);
+void *starpu_data_get_user_data(starpu_data_handle_t handle);
 
 #ifdef __cplusplus
 }

+ 17 - 1
include/starpu_perfmodel.h

@@ -3,6 +3,7 @@
  * Copyright (C) 2010-2014, 2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2016  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -60,6 +61,10 @@ struct starpu_perfmodel_history_entry
 	uint32_t footprint;
 	size_t size;
 	double flops;
+
+	double duration;
+	starpu_tag_t tag;
+	double *parameters;
 };
 
 struct starpu_perfmodel_history_list
@@ -88,6 +93,10 @@ struct starpu_perfmodel_regression_model
 	unsigned nl_valid;
 
 	unsigned nsample;
+
+	double *coeff;
+	unsigned ncoeff;
+	unsigned multi_valid;
 };
 
 struct starpu_perfmodel_history_table;
@@ -116,7 +125,8 @@ enum starpu_perfmodel_type
 	STARPU_COMMON,
 	STARPU_HISTORY_BASED,
 	STARPU_REGRESSION_BASED,
-	STARPU_NL_REGRESSION_BASED
+	STARPU_NL_REGRESSION_BASED,
+	STARPU_MULTIPLE_REGRESSION_BASED
 };
 
 struct _starpu_perfmodel_state;
@@ -138,6 +148,12 @@ struct starpu_perfmodel
 	unsigned benchmarking;
 	unsigned is_init;
 
+	void (*parameters)(struct starpu_task * task, double *parameters);
+	const char **parameters_names;
+	unsigned nparameters;
+	unsigned **combinations;
+	unsigned ncombinations;
+
 	starpu_perfmodel_state_t state;
 };
 

+ 4 - 0
include/starpu_profiling.h

@@ -108,6 +108,10 @@ int starpu_bus_get_count(void);
 int starpu_bus_get_id(int src, int dst);
 int starpu_bus_get_src(int busid);
 int starpu_bus_get_dst(int busid);
+void starpu_bus_set_direct(int busid, int direct);
+int starpu_bus_get_direct(int busid);
+void starpu_bus_set_ngpus(int busid, int ngpus);
+int starpu_bus_get_ngpus(int busid);
 
 int starpu_bus_get_profiling_info(int busid, struct starpu_profiling_bus_info *bus_info);
 

+ 3 - 3
include/starpu_util.h

@@ -112,9 +112,9 @@ extern "C"
 #endif
 
 #ifdef STARPU_NO_ASSERT
-#define STARPU_ASSERT(x)		do { } while(0)
-#define STARPU_ASSERT_ACCESSIBLE(x)	do { } while(0)
-#define STARPU_ASSERT_MSG(x, msg, ...)	do { } while(0)
+#define STARPU_ASSERT(x)		do { if (0) { (void) (x); } } while(0)
+#define STARPU_ASSERT_ACCESSIBLE(x)	do { if (0) { (void) (x); } } while(0)
+#define STARPU_ASSERT_MSG(x, msg, ...)	do { if (0) { (void) (x); (void) msg; } } while(0)
 #else
 #  if defined(__CUDACC__) || defined(STARPU_HAVE_WINDOWS)
 #    define STARPU_ASSERT(x)		do { if (STARPU_UNLIKELY(!(x))) { STARPU_DUMP_BACKTRACE(); *(int*)NULL = 0; } } while(0)

+ 562 - 0
m4/ax_cxx_compile_stdcxx.m4

@@ -0,0 +1,562 @@
+# ===========================================================================
+#   http://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_CXX_COMPILE_STDCXX(VERSION, [ext|noext], [mandatory|optional])
+#
+# DESCRIPTION
+#
+#   Check for baseline language coverage in the compiler for the specified
+#   version of the C++ standard.  If necessary, add switches to CXX and
+#   CXXCPP to enable support.  VERSION may be '11' (for the C++11 standard)
+#   or '14' (for the C++14 standard).
+#
+#   The second argument, if specified, indicates whether you insist on an
+#   extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g.
+#   -std=c++11).  If neither is specified, you get whatever works, with
+#   preference for an extended mode.
+#
+#   The third argument, if specified 'mandatory' or if left unspecified,
+#   indicates that baseline support for the specified C++ standard is
+#   required and that the macro should error out if no mode with that
+#   support is found.  If specified 'optional', then configuration proceeds
+#   regardless, after defining HAVE_CXX${VERSION} if and only if a
+#   supporting mode is found.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Benjamin Kosnik <bkoz@redhat.com>
+#   Copyright (c) 2012 Zack Weinberg <zackw@panix.com>
+#   Copyright (c) 2013 Roy Stogner <roystgnr@ices.utexas.edu>
+#   Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov <sokolov@google.com>
+#   Copyright (c) 2015 Paul Norman <penorman@mac.com>
+#   Copyright (c) 2015 Moritz Klammler <moritz@klammler.eu>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved.  This file is offered as-is, without any
+#   warranty.
+
+#serial 4
+
+dnl  This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro
+dnl  (serial version number 13).
+
+AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl
+  m4_if([$1], [11], [],
+        [$1], [14], [],
+        [$1], [17], [m4_fatal([support for C++17 not yet implemented in AX_CXX_COMPILE_STDCXX])],
+        [m4_fatal([invalid first argument `$1' to AX_CXX_COMPILE_STDCXX])])dnl
+  m4_if([$2], [], [],
+        [$2], [ext], [],
+        [$2], [noext], [],
+        [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX])])dnl
+  m4_if([$3], [], [ax_cxx_compile_cxx$1_required=true],
+        [$3], [mandatory], [ax_cxx_compile_cxx$1_required=true],
+        [$3], [optional], [ax_cxx_compile_cxx$1_required=false],
+        [m4_fatal([invalid third argument `$3' to AX_CXX_COMPILE_STDCXX])])
+  AC_LANG_PUSH([C++])dnl
+  ac_success=no
+  AC_CACHE_CHECK(whether $CXX supports C++$1 features by default,
+  ax_cv_cxx_compile_cxx$1,
+  [AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
+    [ax_cv_cxx_compile_cxx$1=yes],
+    [ax_cv_cxx_compile_cxx$1=no])])
+  if test x$ax_cv_cxx_compile_cxx$1 = xyes; then
+    ac_success=yes
+  fi
+
+  m4_if([$2], [noext], [], [dnl
+  if test x$ac_success = xno; then
+    for switch in -std=gnu++$1 -std=gnu++0x; do
+      cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch])
+      AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch,
+                     $cachevar,
+        [ac_save_CXX="$CXX"
+         CXX="$CXX $switch"
+         AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
+          [eval $cachevar=yes],
+          [eval $cachevar=no])
+         CXX="$ac_save_CXX"])
+      if eval test x\$$cachevar = xyes; then
+        CXX="$CXX $switch"
+        if test -n "$CXXCPP" ; then
+          CXXCPP="$CXXCPP $switch"
+        fi
+        ac_success=yes
+        break
+      fi
+    done
+  fi])
+
+  m4_if([$2], [ext], [], [dnl
+  if test x$ac_success = xno; then
+    dnl HP's aCC needs +std=c++11 according to:
+    dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf
+    dnl Cray's crayCC needs "-h std=c++11"
+    for switch in -std=c++$1 -std=c++0x +std=c++$1 "-h std=c++$1"; do
+      cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch])
+      AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch,
+                     $cachevar,
+        [ac_save_CXX="$CXX"
+         CXX="$CXX $switch"
+         AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
+          [eval $cachevar=yes],
+          [eval $cachevar=no])
+         CXX="$ac_save_CXX"])
+      if eval test x\$$cachevar = xyes; then
+        CXX="$CXX $switch"
+        if test -n "$CXXCPP" ; then
+          CXXCPP="$CXXCPP $switch"
+        fi
+        ac_success=yes
+        break
+      fi
+    done
+  fi])
+  AC_LANG_POP([C++])
+  if test x$ax_cxx_compile_cxx$1_required = xtrue; then
+    if test x$ac_success = xno; then
+      AC_MSG_ERROR([*** A compiler with support for C++$1 language features is required.])
+    fi
+  fi
+  if test x$ac_success = xno; then
+    HAVE_CXX$1=0
+    AC_MSG_NOTICE([No compiler with C++$1 support was found])
+  else
+    HAVE_CXX$1=1
+    AC_DEFINE(HAVE_CXX$1,1,
+              [define if the compiler supports basic C++$1 syntax])
+  fi
+  AC_SUBST(HAVE_CXX$1)
+])
+
+
+dnl  Test body for checking C++11 support
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_11],
+  _AX_CXX_COMPILE_STDCXX_testbody_new_in_11
+)
+
+
+dnl  Test body for checking C++14 support
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_14],
+  _AX_CXX_COMPILE_STDCXX_testbody_new_in_11
+  _AX_CXX_COMPILE_STDCXX_testbody_new_in_14
+)
+
+
+dnl  Tests for new features in C++11
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[
+
+// If the compiler admits that it is not ready for C++11, why torture it?
+// Hopefully, this will speed up the test.
+
+#ifndef __cplusplus
+
+#error "This is not a C++ compiler"
+
+#elif __cplusplus < 201103L
+
+#error "This is not a C++11 compiler"
+
+#else
+
+namespace cxx11
+{
+
+  namespace test_static_assert
+  {
+
+    template <typename T>
+    struct check
+    {
+      static_assert(sizeof(int) <= sizeof(T), "not big enough");
+    };
+
+  }
+
+  namespace test_final_override
+  {
+
+    struct Base
+    {
+      virtual void f() {}
+    };
+
+    struct Derived : public Base
+    {
+      virtual void f() override {}
+    };
+
+  }
+
+  namespace test_double_right_angle_brackets
+  {
+
+    template < typename T >
+    struct check {};
+
+    typedef check<void> single_type;
+    typedef check<check<void>> double_type;
+    typedef check<check<check<void>>> triple_type;
+    typedef check<check<check<check<void>>>> quadruple_type;
+
+  }
+
+  namespace test_decltype
+  {
+
+    int
+    f()
+    {
+      int a = 1;
+      decltype(a) b = 2;
+      return a + b;
+    }
+
+  }
+
+  namespace test_type_deduction
+  {
+
+    template < typename T1, typename T2 >
+    struct is_same
+    {
+      static const bool value = false;
+    };
+
+    template < typename T >
+    struct is_same<T, T>
+    {
+      static const bool value = true;
+    };
+
+    template < typename T1, typename T2 >
+    auto
+    add(T1 a1, T2 a2) -> decltype(a1 + a2)
+    {
+      return a1 + a2;
+    }
+
+    int
+    test(const int c, volatile int v)
+    {
+      static_assert(is_same<int, decltype(0)>::value == true, "");
+      static_assert(is_same<int, decltype(c)>::value == false, "");
+      static_assert(is_same<int, decltype(v)>::value == false, "");
+      auto ac = c;
+      auto av = v;
+      auto sumi = ac + av + 'x';
+      auto sumf = ac + av + 1.0;
+      static_assert(is_same<int, decltype(ac)>::value == true, "");
+      static_assert(is_same<int, decltype(av)>::value == true, "");
+      static_assert(is_same<int, decltype(sumi)>::value == true, "");
+      static_assert(is_same<int, decltype(sumf)>::value == false, "");
+      static_assert(is_same<int, decltype(add(c, v))>::value == true, "");
+      return (sumf > 0.0) ? sumi : add(c, v);
+    }
+
+  }
+
+  namespace test_noexcept
+  {
+
+    int f() { return 0; }
+    int g() noexcept { return 0; }
+
+    static_assert(noexcept(f()) == false, "");
+    static_assert(noexcept(g()) == true, "");
+
+  }
+
+  namespace test_constexpr
+  {
+
+    template < typename CharT >
+    unsigned long constexpr
+    strlen_c_r(const CharT *const s, const unsigned long acc) noexcept
+    {
+      return *s ? strlen_c_r(s + 1, acc + 1) : acc;
+    }
+
+    template < typename CharT >
+    unsigned long constexpr
+    strlen_c(const CharT *const s) noexcept
+    {
+      return strlen_c_r(s, 0UL);
+    }
+
+    static_assert(strlen_c("") == 0UL, "");
+    static_assert(strlen_c("1") == 1UL, "");
+    static_assert(strlen_c("example") == 7UL, "");
+    static_assert(strlen_c("another\0example") == 7UL, "");
+
+  }
+
+  namespace test_rvalue_references
+  {
+
+    template < int N >
+    struct answer
+    {
+      static constexpr int value = N;
+    };
+
+    answer<1> f(int&)       { return answer<1>(); }
+    answer<2> f(const int&) { return answer<2>(); }
+    answer<3> f(int&&)      { return answer<3>(); }
+
+    void
+    test()
+    {
+      int i = 0;
+      const int c = 0;
+      static_assert(decltype(f(i))::value == 1, "");
+      static_assert(decltype(f(c))::value == 2, "");
+      static_assert(decltype(f(0))::value == 3, "");
+    }
+
+  }
+
+  namespace test_uniform_initialization
+  {
+
+    struct test
+    {
+      static const int zero {};
+      static const int one {1};
+    };
+
+    static_assert(test::zero == 0, "");
+    static_assert(test::one == 1, "");
+
+  }
+
+  namespace test_lambdas
+  {
+
+    void
+    test1()
+    {
+      auto lambda1 = [](){};
+      auto lambda2 = lambda1;
+      lambda1();
+      lambda2();
+    }
+
+    int
+    test2()
+    {
+      auto a = [](int i, int j){ return i + j; }(1, 2);
+      auto b = []() -> int { return '0'; }();
+      auto c = [=](){ return a + b; }();
+      auto d = [&](){ return c; }();
+      auto e = [a, &b](int x) mutable {
+        const auto identity = [](int y){ return y; };
+        for (auto i = 0; i < a; ++i)
+          a += b--;
+        return x + identity(a + b);
+      }(0);
+      return a + b + c + d + e;
+    }
+
+    int
+    test3()
+    {
+      const auto nullary = [](){ return 0; };
+      const auto unary = [](int x){ return x; };
+      using nullary_t = decltype(nullary);
+      using unary_t = decltype(unary);
+      const auto higher1st = [](nullary_t f){ return f(); };
+      const auto higher2nd = [unary](nullary_t f1){
+        return [unary, f1](unary_t f2){ return f2(unary(f1())); };
+      };
+      return higher1st(nullary) + higher2nd(nullary)(unary);
+    }
+
+  }
+
+  namespace test_variadic_templates
+  {
+
+    template <int...>
+    struct sum;
+
+    template <int N0, int... N1toN>
+    struct sum<N0, N1toN...>
+    {
+      static constexpr auto value = N0 + sum<N1toN...>::value;
+    };
+
+    template <>
+    struct sum<>
+    {
+      static constexpr auto value = 0;
+    };
+
+    static_assert(sum<>::value == 0, "");
+    static_assert(sum<1>::value == 1, "");
+    static_assert(sum<23>::value == 23, "");
+    static_assert(sum<1, 2>::value == 3, "");
+    static_assert(sum<5, 5, 11>::value == 21, "");
+    static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, "");
+
+  }
+
+  // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
+  // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function
+  // because of this.
+  namespace test_template_alias_sfinae
+  {
+
+    struct foo {};
+
+    template<typename T>
+    using member = typename T::member_type;
+
+    template<typename T>
+    void func(...) {}
+
+    template<typename T>
+    void func(member<T>*) {}
+
+    void test();
+
+    void test() { func<foo>(0); }
+
+  }
+
+}  // namespace cxx11
+
+#endif  // __cplusplus >= 201103L
+
+]])
+
+
+dnl  Tests for new features in C++14
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_14], [[
+
+// If the compiler admits that it is not ready for C++14, why torture it?
+// Hopefully, this will speed up the test.
+
+#ifndef __cplusplus
+
+#error "This is not a C++ compiler"
+
+#elif __cplusplus < 201402L
+
+#error "This is not a C++14 compiler"
+
+#else
+
+namespace cxx14
+{
+
+  namespace test_polymorphic_lambdas
+  {
+
+    int
+    test()
+    {
+      const auto lambda = [](auto&&... args){
+        const auto istiny = [](auto x){
+          return (sizeof(x) == 1UL) ? 1 : 0;
+        };
+        const int aretiny[] = { istiny(args)... };
+        return aretiny[0];
+      };
+      return lambda(1, 1L, 1.0f, '1');
+    }
+
+  }
+
+  namespace test_binary_literals
+  {
+
+    constexpr auto ivii = 0b0000000000101010;
+    static_assert(ivii == 42, "wrong value");
+
+  }
+
+  namespace test_generalized_constexpr
+  {
+
+    template < typename CharT >
+    constexpr unsigned long
+    strlen_c(const CharT *const s) noexcept
+    {
+      auto length = 0UL;
+      for (auto p = s; *p; ++p)
+        ++length;
+      return length;
+    }
+
+    static_assert(strlen_c("") == 0UL, "");
+    static_assert(strlen_c("x") == 1UL, "");
+    static_assert(strlen_c("test") == 4UL, "");
+    static_assert(strlen_c("another\0test") == 7UL, "");
+
+  }
+
+  namespace test_lambda_init_capture
+  {
+
+    int
+    test()
+    {
+      auto x = 0;
+      const auto lambda1 = [a = x](int b){ return a + b; };
+      const auto lambda2 = [a = lambda1(x)](){ return a; };
+      return lambda2();
+    }
+
+  }
+
+  namespace test_digit_seperators
+  {
+
+    constexpr auto ten_million = 100'000'000;
+    static_assert(ten_million == 100000000, "");
+
+  }
+
+  namespace test_return_type_deduction
+  {
+
+    auto f(int& x) { return x; }
+    decltype(auto) g(int& x) { return x; }
+
+    template < typename T1, typename T2 >
+    struct is_same
+    {
+      static constexpr auto value = false;
+    };
+
+    template < typename T >
+    struct is_same<T, T>
+    {
+      static constexpr auto value = true;
+    };
+
+    int
+    test()
+    {
+      auto x = 0;
+      static_assert(is_same<int, decltype(f(x))>::value, "");
+      static_assert(is_same<int&, decltype(g(x))>::value, "");
+      return x;
+    }
+
+  }
+
+}  // namespace cxx14
+
+#endif  // __cplusplus >= 201402L
+
+]])

+ 25 - 5
mic-configure

@@ -17,6 +17,7 @@ declare -a mic_params
 unset host_params
 unset mic_params
 native_mic=0
+mpi=0
 for arg in "$@"
 do
 	case $arg in
@@ -44,6 +45,9 @@ do
 		--with-host-param=*)
 			host_params+=("${arg#--with-host-param=}")
 			;;
+		--with-mpi*)
+			mpi=1
+			;;
 		--help)
 			cat << EOF
 mic-configure specific options:
@@ -97,8 +101,11 @@ then
 fi
 
 # prepend mic_params with "--with-mpicc=mpicc -mmic", to allow possible override by the user
-mic_params=("--with-mpicc=mpicc -mmic" "${mic_params[@]}")
-mic_params=("--with-mpifort=mpifort -mmic" "${mic_params[@]}")
+if [ $mpi = 1 ]
+then
+	mic_params=("--with-mpicc=mpicc -mmic" "${mic_params[@]}")
+	mic_params=("--with-mpifort=mpifort -mmic" "${mic_params[@]}")
+fi
 
 for arch in $dev_list #host mic
 do
@@ -110,6 +117,9 @@ do
 		*) command="../${ROOT_DIR}/configure";;
 	esac
 
+	declare -a params
+	params=("--prefix=$prefix/$arch" "--disable-fstack-protector-all")
+
 	if [ "$arch" = mic ] ; then
 		if [ $compiler = "icc" ] ; then
 		    export CC="icc -mmic"
@@ -130,9 +140,6 @@ do
 		fi
 	fi
 
-	declare -a params
-	params=("--prefix=$prefix/$arch" "--disable-fstack-protector-all")
-
 	if [ "$native_mic" -eq "0" ]
 	then
 		params+=(--enable-mic "--with-coi-dir=$coi_dir" "--with-scif-dir=$scif_dir")
@@ -164,6 +171,13 @@ do
 		CPLUS_INCLUDE_PATH=$SINK_CPLUS_INCLUDE_PATH \
 		PKG_CONFIG_PATH=$SINK_PKG_CONFIG_PATH \
 		$command "$@" "${params[@]}" "${mic_params[@]}"
+		MIC_BUILD_ENV="\
+LIBRARY_PATH=$SINK_LIBRARY_PATH \\
+	INCLUDE=$SINK_INCLUDE \\
+	C_INCLUDE_PATH=$SINK_C_INCLUDE_PATH \\
+	CPLUS_INCLUDE_PATH=$SINK_CPLUS_INCLUDE_PATH \\
+	PKG_CONFIG_PATH=$SINK_PKG_CONFIG_PATH \\\
+"
 	else
 		$command "$@" "${params[@]}""${host_params[@]}"
 	fi
@@ -177,6 +191,7 @@ if [ "$native_mic" -eq "1" ]
 then
 cat > Makefile << EOF
 all:
+	$MIC_BUILD_ENV
 	\$(MAKE) -C build_mic
 
 clean:
@@ -186,12 +201,14 @@ distclean: clean
 	rm -f Makefile
 
 check:
+	$MIC_BUILD_ENV
 	\$(MAKE) -C build_mic check
 
 showcheck:
 	\$(MAKE) -C build_mic showcheck
 
 install:
+	$MIC_BUILD_ENV
 	\$(MAKE) -C build_mic install
 	ln -sf "${prefix}/mic/lib/pkgconfig/starpu-1.3.pc" "${prefix}/mic/lib/pkgconfig/starpu-1.3-mic.pc"
 EOF
@@ -199,6 +216,7 @@ else
 cat > Makefile << EOF
 all:
 	\$(MAKE) -C build_host
+	$MIC_BUILD_ENV
 	\$(MAKE) -C build_mic
 
 clean:
@@ -210,6 +228,7 @@ distclean: clean
 
 check:
 	\$(MAKE) -C build_host check
+	$MIC_BUILD_ENV
 	\$(MAKE) -C build_mic check ; \
 	RET=\$\$? ; \
 	STARPU_NCPUS=0 \$(MAKE) -C build_mic check && [ \$\$RET == 0 ]
@@ -220,6 +239,7 @@ showcheck:
 
 install:
 	\$(MAKE) -C build_host install
+	$MIC_BUILD_ENV
 	\$(MAKE) -C build_mic install
 	ln -sf "${prefix}/mic/lib/pkgconfig/starpu-1.3.pc" "${prefix}/mic/lib/pkgconfig/starpu-1.3-mic.pc"
 EOF

+ 30 - 0
min-dgels/Makefile.in

@@ -0,0 +1,30 @@
+CLAPACK=base
+ADDITIONAL=additional
+
+all:
+	mkdir -p build
+	cd $(CLAPACK) && $(MAKE) blaslib
+	cd $(CLAPACK) && $(MAKE) f2clib
+	cd $(ADDITIONAL) && gcc -c -fPIC *.c && ar cr ../build/minlibdgels.a *.o && ranlib ../build/minlibdgels.a
+
+install:
+
+clean:
+	-cd $(CLAPACK) && $(MAKE) clean && rm -rf *~
+	-cd $(ADDITIONAL) && rm -rf *.o *~
+	rm -rf build *~
+
+distclean: clean
+
+# This part is needed by StarPU
+
+STARPU_SRCDIR = @srcdir@
+
+distdir:
+	cp -fRp $(STARPU_SRCDIR)/* $(distdir)
+	cd $(distdir) && make -f Makefile.in clean
+
+check:
+	echo "No checks are implemented for min-dgels"
+
+showcheck: check

+ 160 - 0
min-dgels/additional/blaswrap.h

@@ -0,0 +1,160 @@
+/* CLAPACK 3.0 BLAS wrapper macros
+ * Feb 5, 2000
+ */
+
+#ifndef __BLASWRAP_H
+#define __BLASWRAP_H
+
+#ifndef NO_BLAS_WRAP
+ 
+/* BLAS1 routines */
+#define srotg_ f2c_srotg
+#define crotg_ f2c_crotg
+#define drotg_ f2c_drotg
+#define zrotg_ f2c_zrotg
+#define srotmg_ f2c_srotmg
+#define drotmg_ f2c_drotmg
+#define srot_ f2c_srot
+#define drot_ f2c_drot
+#define srotm_ f2c_srotm
+#define drotm_ f2c_drotm
+#define sswap_ f2c_sswap
+#define dswap_ f2c_dswap
+#define cswap_ f2c_cswap
+#define zswap_ f2c_zswap
+#define sscal_ f2c_sscal
+#define dscal_ f2c_dscal
+#define cscal_ f2c_cscal
+#define zscal_ f2c_zscal
+#define csscal_ f2c_csscal
+#define zdscal_ f2c_zdscal
+#define scopy_ f2c_scopy
+#define dcopy_ f2c_dcopy
+#define ccopy_ f2c_ccopy
+#define zcopy_ f2c_zcopy
+#define saxpy_ f2c_saxpy
+#define daxpy_ f2c_daxpy
+#define caxpy_ f2c_caxpy
+#define zaxpy_ f2c_zaxpy
+#define sdot_ f2c_sdot
+#define ddot_ f2c_ddot
+#define cdotu_ f2c_cdotu
+#define zdotu_ f2c_zdotu
+#define cdotc_ f2c_cdotc
+#define zdotc_ f2c_zdotc
+#define snrm2_ f2c_snrm2
+#define dnrm2_ f2c_dnrm2
+#define scnrm2_ f2c_scnrm2
+#define dznrm2_ f2c_dznrm2
+#define sasum_ f2c_sasum
+#define dasum_ f2c_dasum
+#define scasum_ f2c_scasum
+#define dzasum_ f2c_dzasum
+#define isamax_ f2c_isamax
+#define idamax_ f2c_idamax
+#define icamax_ f2c_icamax
+#define izamax_ f2c_izamax
+ 
+/* BLAS2 routines */
+#define sgemv_ f2c_sgemv
+#define dgemv_ f2c_dgemv
+#define cgemv_ f2c_cgemv
+#define zgemv_ f2c_zgemv
+#define sgbmv_ f2c_sgbmv
+#define dgbmv_ f2c_dgbmv
+#define cgbmv_ f2c_cgbmv
+#define zgbmv_ f2c_zgbmv
+#define chemv_ f2c_chemv
+#define zhemv_ f2c_zhemv
+#define chbmv_ f2c_chbmv
+#define zhbmv_ f2c_zhbmv
+#define chpmv_ f2c_chpmv
+#define zhpmv_ f2c_zhpmv
+#define ssymv_ f2c_ssymv
+#define dsymv_ f2c_dsymv
+#define ssbmv_ f2c_ssbmv
+#define dsbmv_ f2c_dsbmv
+#define sspmv_ f2c_sspmv
+#define dspmv_ f2c_dspmv
+#define strmv_ f2c_strmv
+#define dtrmv_ f2c_dtrmv
+#define ctrmv_ f2c_ctrmv
+#define ztrmv_ f2c_ztrmv
+#define stbmv_ f2c_stbmv
+#define dtbmv_ f2c_dtbmv
+#define ctbmv_ f2c_ctbmv
+#define ztbmv_ f2c_ztbmv
+#define stpmv_ f2c_stpmv
+#define dtpmv_ f2c_dtpmv
+#define ctpmv_ f2c_ctpmv
+#define ztpmv_ f2c_ztpmv
+#define strsv_ f2c_strsv
+#define dtrsv_ f2c_dtrsv
+#define ctrsv_ f2c_ctrsv
+#define ztrsv_ f2c_ztrsv
+#define stbsv_ f2c_stbsv
+#define dtbsv_ f2c_dtbsv
+#define ctbsv_ f2c_ctbsv
+#define ztbsv_ f2c_ztbsv
+#define stpsv_ f2c_stpsv
+#define dtpsv_ f2c_dtpsv
+#define ctpsv_ f2c_ctpsv
+#define ztpsv_ f2c_ztpsv
+#define sger_ f2c_sger
+#define dger_ f2c_dger
+#define cgeru_ f2c_cgeru
+#define zgeru_ f2c_zgeru
+#define cgerc_ f2c_cgerc
+#define zgerc_ f2c_zgerc
+#define cher_ f2c_cher
+#define zher_ f2c_zher
+#define chpr_ f2c_chpr
+#define zhpr_ f2c_zhpr
+#define cher2_ f2c_cher2
+#define zher2_ f2c_zher2
+#define chpr2_ f2c_chpr2
+#define zhpr2_ f2c_zhpr2
+#define ssyr_ f2c_ssyr
+#define dsyr_ f2c_dsyr
+#define sspr_ f2c_sspr
+#define dspr_ f2c_dspr
+#define ssyr2_ f2c_ssyr2
+#define dsyr2_ f2c_dsyr2
+#define sspr2_ f2c_sspr2
+#define dspr2_ f2c_dspr2
+ 
+/* BLAS3 routines */
+#define sgemm_ f2c_sgemm
+#define dgemm_ f2c_dgemm
+#define cgemm_ f2c_cgemm
+#define zgemm_ f2c_zgemm
+#define ssymm_ f2c_ssymm
+#define dsymm_ f2c_dsymm
+#define csymm_ f2c_csymm
+#define zsymm_ f2c_zsymm
+#define chemm_ f2c_chemm
+#define zhemm_ f2c_zhemm
+#define ssyrk_ f2c_ssyrk
+#define dsyrk_ f2c_dsyrk
+#define csyrk_ f2c_csyrk
+#define zsyrk_ f2c_zsyrk
+#define cherk_ f2c_cherk
+#define zherk_ f2c_zherk
+#define ssyr2k_ f2c_ssyr2k
+#define dsyr2k_ f2c_dsyr2k
+#define csyr2k_ f2c_csyr2k
+#define zsyr2k_ f2c_zsyr2k
+#define cher2k_ f2c_cher2k
+#define zher2k_ f2c_zher2k
+#define strmm_ f2c_strmm
+#define dtrmm_ f2c_dtrmm
+#define ctrmm_ f2c_ctrmm
+#define ztrmm_ f2c_ztrmm
+#define strsm_ f2c_strsm
+#define dtrsm_ f2c_dtrsm
+#define ctrsm_ f2c_ctrsm
+#define ztrsm_ f2c_ztrsm
+
+#endif /* NO_BLAS_WRAP */
+
+#endif /* __BLASWRAP_H */

Разница между файлами не показана из-за своего большого размера
+ 7262 - 0
min-dgels/additional/clapack.h


+ 21 - 0
min-dgels/additional/d_lg10.c

@@ -0,0 +1,21 @@
+#include "f2c.h"
+
+#define log10e 0.43429448190325182765
+
+#ifdef KR_headers
+double log();
+double d_lg10(x) doublereal *x;
+#else
+#undef abs
+#include "math.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+double d_lg10(doublereal *x)
+#endif
+{
+return( log10e * log(*x) );
+}
+#ifdef __cplusplus
+}
+#endif

+ 18 - 0
min-dgels/additional/d_sign.c

@@ -0,0 +1,18 @@
+#include "f2c.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef KR_headers
+double d_sign(a,b) doublereal *a, *b;
+#else
+double d_sign(doublereal *a, doublereal *b)
+#endif
+{
+double x;
+x = (*a >= 0 ? *a : - *a);
+return( *b >= 0 ? x : -x);
+}
+#ifdef __cplusplus
+}
+#endif

+ 107 - 0
min-dgels/additional/dcopy.c

@@ -0,0 +1,107 @@
+/* dcopy.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dcopy_(integer *n, doublereal *dx, integer *incx, 
+	doublereal *dy, integer *incy)
+{
+    /* System generated locals */
+    integer i__1;
+
+    /* Local variables */
+    integer i__, m, ix, iy, mp1;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*     copies a vector, x, to a vector, y. */
+/*     uses unrolled loops for increments equal to one. */
+/*     jack dongarra, linpack, 3/11/78. */
+/*     modified 12/3/93, array(1) declarations changed to array(*) */
+
+
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+    /* Parameter adjustments */
+    --dy;
+    --dx;
+
+    /* Function Body */
+    if (*n <= 0) {
+	return 0;
+    }
+    if (*incx == 1 && *incy == 1) {
+	goto L20;
+    }
+
+/*        code for unequal increments or equal increments */
+/*          not equal to 1 */
+
+    ix = 1;
+    iy = 1;
+    if (*incx < 0) {
+	ix = (-(*n) + 1) * *incx + 1;
+    }
+    if (*incy < 0) {
+	iy = (-(*n) + 1) * *incy + 1;
+    }
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	dy[iy] = dx[ix];
+	ix += *incx;
+	iy += *incy;
+/* L10: */
+    }
+    return 0;
+
+/*        code for both increments equal to 1 */
+
+
+/*        clean-up loop */
+
+L20:
+    m = *n % 7;
+    if (m == 0) {
+	goto L40;
+    }
+    i__1 = m;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	dy[i__] = dx[i__];
+/* L30: */
+    }
+    if (*n < 7) {
+	return 0;
+    }
+L40:
+    mp1 = m + 1;
+    i__1 = *n;
+    for (i__ = mp1; i__ <= i__1; i__ += 7) {
+	dy[i__] = dx[i__];
+	dy[i__ + 1] = dx[i__ + 1];
+	dy[i__ + 2] = dx[i__ + 2];
+	dy[i__ + 3] = dx[i__ + 3];
+	dy[i__ + 4] = dx[i__ + 4];
+	dy[i__ + 5] = dx[i__ + 5];
+	dy[i__ + 6] = dx[i__ + 6];
+/* L50: */
+    }
+    return 0;
+} /* dcopy_ */

+ 157 - 0
min-dgels/additional/dgelq2.c

@@ -0,0 +1,157 @@
+/* dgelq2.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dgelq2_(integer *m, integer *n, doublereal *a, integer *
+	lda, doublereal *tau, doublereal *work, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3;
+
+    /* Local variables */
+    integer i__, k;
+    doublereal aii;
+    extern /* Subroutine */ int dlarf_(char *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *), dlarfp_(integer *, doublereal *, 
+	    doublereal *, integer *, doublereal *), xerbla_(char *, integer *);
+
+
+/*  -- LAPACK routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DGELQ2 computes an LQ factorization of a real m by n matrix A: */
+/*  A = L * Q. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix A.  M >= 0. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix A.  N >= 0. */
+
+/*  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N) */
+/*          On entry, the m by n matrix A. */
+/*          On exit, the elements on and below the diagonal of the array */
+/*          contain the m by min(m,n) lower trapezoidal matrix L (L is */
+/*          lower triangular if m <= n); the elements above the diagonal, */
+/*          with the array TAU, represent the orthogonal matrix Q as a */
+/*          product of elementary reflectors (see Further Details). */
+
+/*  LDA     (input) INTEGER */
+/*          The leading dimension of the array A.  LDA >= max(1,M). */
+
+/*  TAU     (output) DOUBLE PRECISION array, dimension (min(M,N)) */
+/*          The scalar factors of the elementary reflectors (see Further */
+/*          Details). */
+
+/*  WORK    (workspace) DOUBLE PRECISION array, dimension (M) */
+
+/*  INFO    (output) INTEGER */
+/*          = 0: successful exit */
+/*          < 0: if INFO = -i, the i-th argument had an illegal value */
+
+/*  Further Details */
+/*  =============== */
+
+/*  The matrix Q is represented as a product of elementary reflectors */
+
+/*     Q = H(k) . . . H(2) H(1), where k = min(m,n). */
+
+/*  Each H(i) has the form */
+
+/*     H(i) = I - tau * v * v' */
+
+/*  where tau is a real scalar, and v is a real vector with */
+/*  v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n), */
+/*  and tau in TAU(i). */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Test the input arguments */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --tau;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0) {
+	*info = -2;
+    } else if (*lda < max(1,*m)) {
+	*info = -4;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DGELQ2", &i__1);
+	return 0;
+    }
+
+    k = min(*m,*n);
+
+    i__1 = k;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+
+/*        Generate elementary reflector H(i) to annihilate A(i,i+1:n) */
+
+	i__2 = *n - i__ + 1;
+/* Computing MIN */
+	i__3 = i__ + 1;
+	dlarfp_(&i__2, &a[i__ + i__ * a_dim1], &a[i__ + min(i__3, *n)* a_dim1]
+, lda, &tau[i__]);
+	if (i__ < *m) {
+
+/*           Apply H(i) to A(i+1:m,i:n) from the right */
+
+	    aii = a[i__ + i__ * a_dim1];
+	    a[i__ + i__ * a_dim1] = 1.;
+	    i__2 = *m - i__;
+	    i__3 = *n - i__ + 1;
+	    dlarf_("Right", &i__2, &i__3, &a[i__ + i__ * a_dim1], lda, &tau[
+		    i__], &a[i__ + 1 + i__ * a_dim1], lda, &work[1]);
+	    a[i__ + i__ * a_dim1] = aii;
+	}
+/* L10: */
+    }
+    return 0;
+
+/*     End of DGELQ2 */
+
+} /* dgelq2_ */

+ 251 - 0
min-dgels/additional/dgelqf.c

@@ -0,0 +1,251 @@
+/* dgelqf.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static integer c_n1 = -1;
+static integer c__3 = 3;
+static integer c__2 = 2;
+
+/* Subroutine */ int dgelqf_(integer *m, integer *n, doublereal *a, integer *
+	lda, doublereal *tau, doublereal *work, integer *lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+    /* Local variables */
+    integer i__, k, ib, nb, nx, iws, nbmin, iinfo;
+    extern /* Subroutine */ int dgelq2_(integer *, integer *, doublereal *, 
+	    integer *, doublereal *, doublereal *, integer *), dlarfb_(char *, 
+	     char *, char *, char *, integer *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
+	    integer *, doublereal *, integer *), dlarft_(char *, char *, integer *, integer *, doublereal 
+	    *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
+	    integer *, integer *);
+    integer ldwork, lwkopt;
+    logical lquery;
+
+
+/*  -- LAPACK routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DGELQF computes an LQ factorization of a real M-by-N matrix A: */
+/*  A = L * Q. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix A.  M >= 0. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix A.  N >= 0. */
+
+/*  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N) */
+/*          On entry, the M-by-N matrix A. */
+/*          On exit, the elements on and below the diagonal of the array */
+/*          contain the m-by-min(m,n) lower trapezoidal matrix L (L is */
+/*          lower triangular if m <= n); the elements above the diagonal, */
+/*          with the array TAU, represent the orthogonal matrix Q as a */
+/*          product of elementary reflectors (see Further Details). */
+
+/*  LDA     (input) INTEGER */
+/*          The leading dimension of the array A.  LDA >= max(1,M). */
+
+/*  TAU     (output) DOUBLE PRECISION array, dimension (min(M,N)) */
+/*          The scalar factors of the elementary reflectors (see Further */
+/*          Details). */
+
+/*  WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) */
+/*          On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */
+
+/*  LWORK   (input) INTEGER */
+/*          The dimension of the array WORK.  LWORK >= max(1,M). */
+/*          For optimum performance LWORK >= M*NB, where NB is the */
+/*          optimal blocksize. */
+
+/*          If LWORK = -1, then a workspace query is assumed; the routine */
+/*          only calculates the optimal size of the WORK array, returns */
+/*          this value as the first entry of the WORK array, and no error */
+/*          message related to LWORK is issued by XERBLA. */
+
+/*  INFO    (output) INTEGER */
+/*          = 0:  successful exit */
+/*          < 0:  if INFO = -i, the i-th argument had an illegal value */
+
+/*  Further Details */
+/*  =============== */
+
+/*  The matrix Q is represented as a product of elementary reflectors */
+
+/*     Q = H(k) . . . H(2) H(1), where k = min(m,n). */
+
+/*  Each H(i) has the form */
+
+/*     H(i) = I - tau * v * v' */
+
+/*  where tau is a real scalar, and v is a real vector with */
+/*  v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n), */
+/*  and tau in TAU(i). */
+
+/*  ===================================================================== */
+
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Test the input arguments */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --tau;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    nb = ilaenv_(&c__1, "DGELQF", " ", m, n, &c_n1, &c_n1);
+    lwkopt = *m * nb;
+    work[1] = (doublereal) lwkopt;
+    lquery = *lwork == -1;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0) {
+	*info = -2;
+    } else if (*lda < max(1,*m)) {
+	*info = -4;
+    } else if (*lwork < max(1,*m) && ! lquery) {
+	*info = -7;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DGELQF", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    k = min(*m,*n);
+    if (k == 0) {
+	work[1] = 1.;
+	return 0;
+    }
+
+    nbmin = 2;
+    nx = 0;
+    iws = *m;
+    if (nb > 1 && nb < k) {
+
+/*        Determine when to cross over from blocked to unblocked code. */
+
+/* Computing MAX */
+	i__1 = 0, i__2 = ilaenv_(&c__3, "DGELQF", " ", m, n, &c_n1, &c_n1);
+	nx = max(i__1,i__2);
+	if (nx < k) {
+
+/*           Determine if workspace is large enough for blocked code. */
+
+	    ldwork = *m;
+	    iws = ldwork * nb;
+	    if (*lwork < iws) {
+
+/*              Not enough workspace to use optimal NB:  reduce NB and */
+/*              determine the minimum value of NB. */
+
+		nb = *lwork / ldwork;
+/* Computing MAX */
+		i__1 = 2, i__2 = ilaenv_(&c__2, "DGELQF", " ", m, n, &c_n1, &
+			c_n1);
+		nbmin = max(i__1,i__2);
+	    }
+	}
+    }
+
+    if (nb >= nbmin && nb < k && nx < k) {
+
+/*        Use blocked code initially */
+
+	i__1 = k - nx;
+	i__2 = nb;
+	for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+	    i__3 = k - i__ + 1;
+	    ib = min(i__3,nb);
+
+/*           Compute the LQ factorization of the current block */
+/*           A(i:i+ib-1,i:n) */
+
+	    i__3 = *n - i__ + 1;
+	    dgelq2_(&ib, &i__3, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[
+		    1], &iinfo);
+	    if (i__ + ib <= *m) {
+
+/*              Form the triangular factor of the block reflector */
+/*              H = H(i) H(i+1) . . . H(i+ib-1) */
+
+		i__3 = *n - i__ + 1;
+		dlarft_("Forward", "Rowwise", &i__3, &ib, &a[i__ + i__ * 
+			a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/*              Apply H to A(i+ib:m,i:n) from the right */
+
+		i__3 = *m - i__ - ib + 1;
+		i__4 = *n - i__ + 1;
+		dlarfb_("Right", "No transpose", "Forward", "Rowwise", &i__3, 
+			&i__4, &ib, &a[i__ + i__ * a_dim1], lda, &work[1], &
+			ldwork, &a[i__ + ib + i__ * a_dim1], lda, &work[ib + 
+			1], &ldwork);
+	    }
+/* L10: */
+	}
+    } else {
+	i__ = 1;
+    }
+
+/*     Use unblocked code to factor the last or only block. */
+
+    if (i__ <= k) {
+	i__2 = *m - i__ + 1;
+	i__1 = *n - i__ + 1;
+	dgelq2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[1]
+, &iinfo);
+    }
+
+    work[1] = (doublereal) iws;
+    return 0;
+
+/*     End of DGELQF */
+
+} /* dgelqf_ */

+ 515 - 0
min-dgels/additional/dgels.c

@@ -0,0 +1,515 @@
+/* dgels.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static integer c_n1 = -1;
+static doublereal c_b33 = 0.;
+static integer c__0 = 0;
+
+/* Subroutine */ int dgels_(char *trans, integer *m, integer *n, integer *
+	nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, 
+	doublereal *work, integer *lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2;
+
+    /* Local variables */
+    integer i__, j, nb, mn;
+    doublereal anrm, bnrm;
+    integer brow;
+    logical tpsd;
+    integer iascl, ibscl;
+    extern logical lsame_(char *, char *);
+    integer wsize;
+    doublereal rwork[1];
+    extern /* Subroutine */ int dlabad_(doublereal *, doublereal *);
+    extern doublereal dlamch_(char *), dlange_(char *, integer *, 
+	    integer *, doublereal *, integer *, doublereal *);
+    extern /* Subroutine */ int dgelqf_(integer *, integer *, doublereal *, 
+	    integer *, doublereal *, doublereal *, integer *, integer *), 
+	    dlascl_(char *, integer *, integer *, doublereal *, doublereal *, 
+	    integer *, integer *, doublereal *, integer *, integer *),
+	     dgeqrf_(integer *, integer *, doublereal *, integer *, 
+	    doublereal *, doublereal *, integer *, integer *), dlaset_(char *, 
+	     integer *, integer *, doublereal *, doublereal *, doublereal *, 
+	    integer *), xerbla_(char *, integer *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
+	    integer *, integer *);
+    integer scllen;
+    doublereal bignum;
+    extern /* Subroutine */ int dormlq_(char *, char *, integer *, integer *, 
+	    integer *, doublereal *, integer *, doublereal *, doublereal *, 
+	    integer *, doublereal *, integer *, integer *), 
+	    dormqr_(char *, char *, integer *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, integer *, integer *);
+    doublereal smlnum;
+    logical lquery;
+    extern /* Subroutine */ int dtrtrs_(char *, char *, char *, integer *, 
+	    integer *, doublereal *, integer *, doublereal *, integer *, 
+	    integer *);
+
+
+/*  -- LAPACK driver routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DGELS solves overdetermined or underdetermined real linear systems */
+/*  involving an M-by-N matrix A, or its transpose, using a QR or LQ */
+/*  factorization of A.  It is assumed that A has full rank. */
+
+/*  The following options are provided: */
+
+/*  1. If TRANS = 'N' and m >= n:  find the least squares solution of */
+/*     an overdetermined system, i.e., solve the least squares problem */
+/*                  minimize || B - A*X ||. */
+
+/*  2. If TRANS = 'N' and m < n:  find the minimum norm solution of */
+/*     an underdetermined system A * X = B. */
+
+/*  3. If TRANS = 'T' and m >= n:  find the minimum norm solution of */
+/*     an undetermined system A**T * X = B. */
+
+/*  4. If TRANS = 'T' and m < n:  find the least squares solution of */
+/*     an overdetermined system, i.e., solve the least squares problem */
+/*                  minimize || B - A**T * X ||. */
+
+/*  Several right hand side vectors b and solution vectors x can be */
+/*  handled in a single call; they are stored as the columns of the */
+/*  M-by-NRHS right hand side matrix B and the N-by-NRHS solution */
+/*  matrix X. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  TRANS   (input) CHARACTER*1 */
+/*          = 'N': the linear system involves A; */
+/*          = 'T': the linear system involves A**T. */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix A.  M >= 0. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix A.  N >= 0. */
+
+/*  NRHS    (input) INTEGER */
+/*          The number of right hand sides, i.e., the number of */
+/*          columns of the matrices B and X. NRHS >=0. */
+
+/*  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N) */
+/*          On entry, the M-by-N matrix A. */
+/*          On exit, */
+/*            if M >= N, A is overwritten by details of its QR */
+/*                       factorization as returned by DGEQRF; */
+/*            if M <  N, A is overwritten by details of its LQ */
+/*                       factorization as returned by DGELQF. */
+
+/*  LDA     (input) INTEGER */
+/*          The leading dimension of the array A.  LDA >= max(1,M). */
+
+/*  B       (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS) */
+/*          On entry, the matrix B of right hand side vectors, stored */
+/*          columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */
+/*          if TRANS = 'T'. */
+/*          On exit, if INFO = 0, B is overwritten by the solution */
+/*          vectors, stored columnwise: */
+/*          if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */
+/*          squares solution vectors; the residual sum of squares for the */
+/*          solution in each column is given by the sum of squares of */
+/*          elements N+1 to M in that column; */
+/*          if TRANS = 'N' and m < n, rows 1 to N of B contain the */
+/*          minimum norm solution vectors; */
+/*          if TRANS = 'T' and m >= n, rows 1 to M of B contain the */
+/*          minimum norm solution vectors; */
+/*          if TRANS = 'T' and m < n, rows 1 to M of B contain the */
+/*          least squares solution vectors; the residual sum of squares */
+/*          for the solution in each column is given by the sum of */
+/*          squares of elements M+1 to N in that column. */
+
+/*  LDB     (input) INTEGER */
+/*          The leading dimension of the array B. LDB >= MAX(1,M,N). */
+
+/*  WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) */
+/*          On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */
+
+/*  LWORK   (input) INTEGER */
+/*          The dimension of the array WORK. */
+/*          LWORK >= max( 1, MN + max( MN, NRHS ) ). */
+/*          For optimal performance, */
+/*          LWORK >= max( 1, MN + max( MN, NRHS )*NB ). */
+/*          where MN = min(M,N) and NB is the optimum block size. */
+
+/*          If LWORK = -1, then a workspace query is assumed; the routine */
+/*          only calculates the optimal size of the WORK array, returns */
+/*          this value as the first entry of the WORK array, and no error */
+/*          message related to LWORK is issued by XERBLA. */
+
+/*  INFO    (output) INTEGER */
+/*          = 0:  successful exit */
+/*          < 0:  if INFO = -i, the i-th argument had an illegal value */
+/*          > 0:  if INFO =  i, the i-th diagonal element of the */
+/*                triangular factor of A is zero, so that A does not have */
+/*                full rank; the least squares solution could not be */
+/*                computed. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Local Arrays .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Test the input arguments. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    mn = min(*m,*n);
+    lquery = *lwork == -1;
+    if (! (lsame_(trans, "N") || lsame_(trans, "T"))) {
+	*info = -1;
+    } else if (*m < 0) {
+	*info = -2;
+    } else if (*n < 0) {
+	*info = -3;
+    } else if (*nrhs < 0) {
+	*info = -4;
+    } else if (*lda < max(1,*m)) {
+	*info = -6;
+    } else /* if(complicated condition) */ {
+/* Computing MAX */
+	i__1 = max(1,*m);
+	if (*ldb < max(i__1,*n)) {
+	    *info = -8;
+	} else /* if(complicated condition) */ {
+/* Computing MAX */
+	    i__1 = 1, i__2 = mn + max(mn,*nrhs);
+	    if (*lwork < max(i__1,i__2) && ! lquery) {
+		*info = -10;
+	    }
+	}
+    }
+
+/*     Figure out optimal block size */
+
+    if (*info == 0 || *info == -10) {
+
+	tpsd = TRUE_;
+	if (lsame_(trans, "N")) {
+	    tpsd = FALSE_;
+	}
+
+	if (*m >= *n) {
+	    nb = ilaenv_(&c__1, "DGEQRF", " ", m, n, &c_n1, &c_n1);
+	    if (tpsd) {
+/* Computing MAX */
+		i__1 = nb, i__2 = ilaenv_(&c__1, "DORMQR", "LN", m, nrhs, n, &
+			c_n1);
+		nb = max(i__1,i__2);
+	    } else {
+/* Computing MAX */
+		i__1 = nb, i__2 = ilaenv_(&c__1, "DORMQR", "LT", m, nrhs, n, &
+			c_n1);
+		nb = max(i__1,i__2);
+	    }
+	} else {
+	    nb = ilaenv_(&c__1, "DGELQF", " ", m, n, &c_n1, &c_n1);
+	    if (tpsd) {
+/* Computing MAX */
+		i__1 = nb, i__2 = ilaenv_(&c__1, "DORMLQ", "LT", n, nrhs, m, &
+			c_n1);
+		nb = max(i__1,i__2);
+	    } else {
+/* Computing MAX */
+		i__1 = nb, i__2 = ilaenv_(&c__1, "DORMLQ", "LN", n, nrhs, m, &
+			c_n1);
+		nb = max(i__1,i__2);
+	    }
+	}
+
+/* Computing MAX */
+	i__1 = 1, i__2 = mn + max(mn,*nrhs) * nb;
+	wsize = max(i__1,i__2);
+	work[1] = (doublereal) wsize;
+
+    }
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DGELS ", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+/* Computing MIN */
+    i__1 = min(*m,*n);
+    if (min(i__1,*nrhs) == 0) {
+	i__1 = max(*m,*n);
+	dlaset_("Full", &i__1, nrhs, &c_b33, &c_b33, &b[b_offset], ldb);
+	return 0;
+    }
+
+/*     Get machine parameters */
+
+    smlnum = dlamch_("S") / dlamch_("P");
+    bignum = 1. / smlnum;
+    dlabad_(&smlnum, &bignum);
+
+/*     Scale A, B if max element outside range [SMLNUM,BIGNUM] */
+
+    anrm = dlange_("M", m, n, &a[a_offset], lda, rwork);
+    iascl = 0;
+    if (anrm > 0. && anrm < smlnum) {
+
+/*        Scale matrix norm up to SMLNUM */
+
+	dlascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, 
+		info);
+	iascl = 1;
+    } else if (anrm > bignum) {
+
+/*        Scale matrix norm down to BIGNUM */
+
+	dlascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, 
+		info);
+	iascl = 2;
+    } else if (anrm == 0.) {
+
+/*        Matrix all zero. Return zero solution. */
+
+	i__1 = max(*m,*n);
+	dlaset_("F", &i__1, nrhs, &c_b33, &c_b33, &b[b_offset], ldb);
+	goto L50;
+    }
+
+    brow = *m;
+    if (tpsd) {
+	brow = *n;
+    }
+    bnrm = dlange_("M", &brow, nrhs, &b[b_offset], ldb, rwork);
+    ibscl = 0;
+    if (bnrm > 0. && bnrm < smlnum) {
+
+/*        Scale matrix norm up to SMLNUM */
+
+	dlascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], 
+		ldb, info);
+	ibscl = 1;
+    } else if (bnrm > bignum) {
+
+/*        Scale matrix norm down to BIGNUM */
+
+	dlascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], 
+		ldb, info);
+	ibscl = 2;
+    }
+
+    if (*m >= *n) {
+
+/*        compute QR factorization of A */
+
+	i__1 = *lwork - mn;
+	dgeqrf_(m, n, &a[a_offset], lda, &work[1], &work[mn + 1], &i__1, info)
+		;
+
+/*        workspace at least N, optimally N*NB */
+
+	if (! tpsd) {
+
+/*           Least-Squares Problem min || A * X - B || */
+
+/*           B(1:M,1:NRHS) := Q' * B(1:M,1:NRHS) */
+
+	    i__1 = *lwork - mn;
+	    dormqr_("Left", "Transpose", m, nrhs, n, &a[a_offset], lda, &work[
+		    1], &b[b_offset], ldb, &work[mn + 1], &i__1, info);
+
+/*           workspace at least NRHS, optimally NRHS*NB */
+
+/*           B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */
+
+	    dtrtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset]
+, lda, &b[b_offset], ldb, info);
+
+	    if (*info > 0) {
+		return 0;
+	    }
+
+	    scllen = *n;
+
+	} else {
+
+/*           Overdetermined system of equations A' * X = B */
+
+/*           B(1:N,1:NRHS) := inv(R') * B(1:N,1:NRHS) */
+
+	    dtrtrs_("Upper", "Transpose", "Non-unit", n, nrhs, &a[a_offset], 
+		    lda, &b[b_offset], ldb, info);
+
+	    if (*info > 0) {
+		return 0;
+	    }
+
+/*           B(N+1:M,1:NRHS) = ZERO */
+
+	    i__1 = *nrhs;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = *m;
+		for (i__ = *n + 1; i__ <= i__2; ++i__) {
+		    b[i__ + j * b_dim1] = 0.;
+/* L10: */
+		}
+/* L20: */
+	    }
+
+/*           B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS) */
+
+	    i__1 = *lwork - mn;
+	    dormqr_("Left", "No transpose", m, nrhs, n, &a[a_offset], lda, &
+		    work[1], &b[b_offset], ldb, &work[mn + 1], &i__1, info);
+
+/*           workspace at least NRHS, optimally NRHS*NB */
+
+	    scllen = *m;
+
+	}
+
+    } else {
+
+/*        Compute LQ factorization of A */
+
+	i__1 = *lwork - mn;
+	dgelqf_(m, n, &a[a_offset], lda, &work[1], &work[mn + 1], &i__1, info)
+		;
+
+/*        workspace at least M, optimally M*NB. */
+
+	if (! tpsd) {
+
+/*           underdetermined system of equations A * X = B */
+
+/*           B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */
+
+	    dtrtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset]
+, lda, &b[b_offset], ldb, info);
+
+	    if (*info > 0) {
+		return 0;
+	    }
+
+/*           B(M+1:N,1:NRHS) = 0 */
+
+	    i__1 = *nrhs;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = *n;
+		for (i__ = *m + 1; i__ <= i__2; ++i__) {
+		    b[i__ + j * b_dim1] = 0.;
+/* L30: */
+		}
+/* L40: */
+	    }
+
+/*           B(1:N,1:NRHS) := Q(1:N,:)' * B(1:M,1:NRHS) */
+
+	    i__1 = *lwork - mn;
+	    dormlq_("Left", "Transpose", n, nrhs, m, &a[a_offset], lda, &work[
+		    1], &b[b_offset], ldb, &work[mn + 1], &i__1, info);
+
+/*           workspace at least NRHS, optimally NRHS*NB */
+
+	    scllen = *n;
+
+	} else {
+
+/*           overdetermined system min || A' * X - B || */
+
+/*           B(1:N,1:NRHS) := Q * B(1:N,1:NRHS) */
+
+	    i__1 = *lwork - mn;
+	    dormlq_("Left", "No transpose", n, nrhs, m, &a[a_offset], lda, &
+		    work[1], &b[b_offset], ldb, &work[mn + 1], &i__1, info);
+
+/*           workspace at least NRHS, optimally NRHS*NB */
+
+/*           B(1:M,1:NRHS) := inv(L') * B(1:M,1:NRHS) */
+
+	    dtrtrs_("Lower", "Transpose", "Non-unit", m, nrhs, &a[a_offset], 
+		    lda, &b[b_offset], ldb, info);
+
+	    if (*info > 0) {
+		return 0;
+	    }
+
+	    scllen = *m;
+
+	}
+
+    }
+
+/*     Undo scaling */
+
+    if (iascl == 1) {
+	dlascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset]
+, ldb, info);
+    } else if (iascl == 2) {
+	dlascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset]
+, ldb, info);
+    }
+    if (ibscl == 1) {
+	dlascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset]
+, ldb, info);
+    } else if (ibscl == 2) {
+	dlascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset]
+, ldb, info);
+    }
+
+L50:
+    work[1] = (doublereal) wsize;
+
+    return 0;
+
+/*     End of DGELS */
+
+} /* dgels_ */

+ 389 - 0
min-dgels/additional/dgemm.c

@@ -0,0 +1,389 @@
+/* dgemm.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dgemm_(char *transa, char *transb, integer *m, integer *
+	n, integer *k, doublereal *alpha, doublereal *a, integer *lda, 
+	doublereal *b, integer *ldb, doublereal *beta, doublereal *c__, 
+	integer *ldc)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3;
+
+    /* Local variables */
+    integer i__, j, l, info;
+    logical nota, notb;
+    doublereal temp;
+    integer ncola;
+    extern logical lsame_(char *, char *);
+    integer nrowa, nrowb;
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DGEMM  performs one of the matrix-matrix operations */
+
+/*     C := alpha*op( A )*op( B ) + beta*C, */
+
+/*  where  op( X ) is one of */
+
+/*     op( X ) = X   or   op( X ) = X', */
+
+/*  alpha and beta are scalars, and A, B and C are matrices, with op( A ) */
+/*  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  TRANSA - CHARACTER*1. */
+/*           On entry, TRANSA specifies the form of op( A ) to be used in */
+/*           the matrix multiplication as follows: */
+
+/*              TRANSA = 'N' or 'n',  op( A ) = A. */
+
+/*              TRANSA = 'T' or 't',  op( A ) = A'. */
+
+/*              TRANSA = 'C' or 'c',  op( A ) = A'. */
+
+/*           Unchanged on exit. */
+
+/*  TRANSB - CHARACTER*1. */
+/*           On entry, TRANSB specifies the form of op( B ) to be used in */
+/*           the matrix multiplication as follows: */
+
+/*              TRANSB = 'N' or 'n',  op( B ) = B. */
+
+/*              TRANSB = 'T' or 't',  op( B ) = B'. */
+
+/*              TRANSB = 'C' or 'c',  op( B ) = B'. */
+
+/*           Unchanged on exit. */
+
+/*  M      - INTEGER. */
+/*           On entry,  M  specifies  the number  of rows  of the  matrix */
+/*           op( A )  and of the  matrix  C.  M  must  be at least  zero. */
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry,  N  specifies the number  of columns of the matrix */
+/*           op( B ) and the number of columns of the matrix C. N must be */
+/*           at least zero. */
+/*           Unchanged on exit. */
+
+/*  K      - INTEGER. */
+/*           On entry,  K  specifies  the number of columns of the matrix */
+/*           op( A ) and the number of rows of the matrix op( B ). K must */
+/*           be at least  zero. */
+/*           Unchanged on exit. */
+
+/*  ALPHA  - DOUBLE PRECISION. */
+/*           On entry, ALPHA specifies the scalar alpha. */
+/*           Unchanged on exit. */
+
+/*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is */
+/*           k  when  TRANSA = 'N' or 'n',  and is  m  otherwise. */
+/*           Before entry with  TRANSA = 'N' or 'n',  the leading  m by k */
+/*           part of the array  A  must contain the matrix  A,  otherwise */
+/*           the leading  k by m  part of the array  A  must contain  the */
+/*           matrix A. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. When  TRANSA = 'N' or 'n' then */
+/*           LDA must be at least  max( 1, m ), otherwise  LDA must be at */
+/*           least  max( 1, k ). */
+/*           Unchanged on exit. */
+
+/*  B      - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is */
+/*           n  when  TRANSB = 'N' or 'n',  and is  k  otherwise. */
+/*           Before entry with  TRANSB = 'N' or 'n',  the leading  k by n */
+/*           part of the array  B  must contain the matrix  B,  otherwise */
+/*           the leading  n by k  part of the array  B  must contain  the */
+/*           matrix B. */
+/*           Unchanged on exit. */
+
+/*  LDB    - INTEGER. */
+/*           On entry, LDB specifies the first dimension of B as declared */
+/*           in the calling (sub) program. When  TRANSB = 'N' or 'n' then */
+/*           LDB must be at least  max( 1, k ), otherwise  LDB must be at */
+/*           least  max( 1, n ). */
+/*           Unchanged on exit. */
+
+/*  BETA   - DOUBLE PRECISION. */
+/*           On entry,  BETA  specifies the scalar  beta.  When  BETA  is */
+/*           supplied as zero then C need not be set on input. */
+/*           Unchanged on exit. */
+
+/*  C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ). */
+/*           Before entry, the leading  m by n  part of the array  C must */
+/*           contain the matrix  C,  except when  beta  is zero, in which */
+/*           case C need not be set on entry. */
+/*           On exit, the array  C  is overwritten by the  m by n  matrix */
+/*           ( alpha*op( A )*op( B ) + beta*C ). */
+
+/*  LDC    - INTEGER. */
+/*           On entry, LDC specifies the first dimension of C as declared */
+/*           in  the  calling  (sub)  program.   LDC  must  be  at  least */
+/*           max( 1, m ). */
+/*           Unchanged on exit. */
+
+
+/*  Level 3 Blas routine. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Parameters .. */
+/*     .. */
+
+/*     Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not */
+/*     transposed and set  NROWA, NCOLA and  NROWB  as the number of rows */
+/*     and  columns of  A  and the  number of  rows  of  B  respectively. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+    c_dim1 = *ldc;
+    c_offset = 1 + c_dim1;
+    c__ -= c_offset;
+
+    /* Function Body */
+    nota = lsame_(transa, "N");
+    notb = lsame_(transb, "N");
+    if (nota) {
+	nrowa = *m;
+	ncola = *k;
+    } else {
+	nrowa = *k;
+	ncola = *m;
+    }
+    if (notb) {
+	nrowb = *k;
+    } else {
+	nrowb = *n;
+    }
+
+/*     Test the input parameters. */
+
+    info = 0;
+    if (! nota && ! lsame_(transa, "C") && ! lsame_(
+	    transa, "T")) {
+	info = 1;
+    } else if (! notb && ! lsame_(transb, "C") && ! 
+	    lsame_(transb, "T")) {
+	info = 2;
+    } else if (*m < 0) {
+	info = 3;
+    } else if (*n < 0) {
+	info = 4;
+    } else if (*k < 0) {
+	info = 5;
+    } else if (*lda < max(1,nrowa)) {
+	info = 8;
+    } else if (*ldb < max(1,nrowb)) {
+	info = 10;
+    } else if (*ldc < max(1,*m)) {
+	info = 13;
+    }
+    if (info != 0) {
+	xerbla_("DGEMM ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*m == 0 || *n == 0 || (*alpha == 0. || *k == 0) && *beta == 1.) {
+	return 0;
+    }
+
+/*     And if  alpha.eq.zero. */
+
+    if (*alpha == 0.) {
+	if (*beta == 0.) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = *m;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    c__[i__ + j * c_dim1] = 0.;
+/* L10: */
+		}
+/* L20: */
+	    }
+	} else {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = *m;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L30: */
+		}
+/* L40: */
+	    }
+	}
+	return 0;
+    }
+
+/*     Start the operations. */
+
+    if (notb) {
+	if (nota) {
+
+/*           Form  C := alpha*A*B + beta*C. */
+
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		if (*beta == 0.) {
+		    i__2 = *m;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			c__[i__ + j * c_dim1] = 0.;
+/* L50: */
+		    }
+		} else if (*beta != 1.) {
+		    i__2 = *m;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L60: */
+		    }
+		}
+		i__2 = *k;
+		for (l = 1; l <= i__2; ++l) {
+		    if (b[l + j * b_dim1] != 0.) {
+			temp = *alpha * b[l + j * b_dim1];
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    c__[i__ + j * c_dim1] += temp * a[i__ + l * 
+				    a_dim1];
+/* L70: */
+			}
+		    }
+/* L80: */
+		}
+/* L90: */
+	    }
+	} else {
+
+/*           Form  C := alpha*A'*B + beta*C */
+
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = *m;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    temp = 0.;
+		    i__3 = *k;
+		    for (l = 1; l <= i__3; ++l) {
+			temp += a[l + i__ * a_dim1] * b[l + j * b_dim1];
+/* L100: */
+		    }
+		    if (*beta == 0.) {
+			c__[i__ + j * c_dim1] = *alpha * temp;
+		    } else {
+			c__[i__ + j * c_dim1] = *alpha * temp + *beta * c__[
+				i__ + j * c_dim1];
+		    }
+/* L110: */
+		}
+/* L120: */
+	    }
+	}
+    } else {
+	if (nota) {
+
+/*           Form  C := alpha*A*B' + beta*C */
+
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		if (*beta == 0.) {
+		    i__2 = *m;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			c__[i__ + j * c_dim1] = 0.;
+/* L130: */
+		    }
+		} else if (*beta != 1.) {
+		    i__2 = *m;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			c__[i__ + j * c_dim1] = *beta * c__[i__ + j * c_dim1];
+/* L140: */
+		    }
+		}
+		i__2 = *k;
+		for (l = 1; l <= i__2; ++l) {
+		    if (b[j + l * b_dim1] != 0.) {
+			temp = *alpha * b[j + l * b_dim1];
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    c__[i__ + j * c_dim1] += temp * a[i__ + l * 
+				    a_dim1];
+/* L150: */
+			}
+		    }
+/* L160: */
+		}
+/* L170: */
+	    }
+	} else {
+
+/*           Form  C := alpha*A'*B' + beta*C */
+
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = *m;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    temp = 0.;
+		    i__3 = *k;
+		    for (l = 1; l <= i__3; ++l) {
+			temp += a[l + i__ * a_dim1] * b[j + l * b_dim1];
+/* L180: */
+		    }
+		    if (*beta == 0.) {
+			c__[i__ + j * c_dim1] = *alpha * temp;
+		    } else {
+			c__[i__ + j * c_dim1] = *alpha * temp + *beta * c__[
+				i__ + j * c_dim1];
+		    }
+/* L190: */
+		}
+/* L200: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of DGEMM . */
+
+} /* dgemm_ */

+ 312 - 0
min-dgels/additional/dgemv.c

@@ -0,0 +1,312 @@
+/* dgemv.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dgemv_(char *trans, integer *m, integer *n, doublereal *
+	alpha, doublereal *a, integer *lda, doublereal *x, integer *incx, 
+	doublereal *beta, doublereal *y, integer *incy)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2;
+
+    /* Local variables */
+    integer i__, j, ix, iy, jx, jy, kx, ky, info;
+    doublereal temp;
+    integer lenx, leny;
+    extern logical lsame_(char *, char *);
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DGEMV  performs one of the matrix-vector operations */
+
+/*     y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y, */
+
+/*  where alpha and beta are scalars, x and y are vectors and A is an */
+/*  m by n matrix. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  TRANS  - CHARACTER*1. */
+/*           On entry, TRANS specifies the operation to be performed as */
+/*           follows: */
+
+/*              TRANS = 'N' or 'n'   y := alpha*A*x + beta*y. */
+
+/*              TRANS = 'T' or 't'   y := alpha*A'*x + beta*y. */
+
+/*              TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y. */
+
+/*           Unchanged on exit. */
+
+/*  M      - INTEGER. */
+/*           On entry, M specifies the number of rows of the matrix A. */
+/*           M must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the number of columns of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  ALPHA  - DOUBLE PRECISION. */
+/*           On entry, ALPHA specifies the scalar alpha. */
+/*           Unchanged on exit. */
+
+/*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
+/*           Before entry, the leading m by n part of the array A must */
+/*           contain the matrix of coefficients. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           max( 1, m ). */
+/*           Unchanged on exit. */
+
+/*  X      - DOUBLE PRECISION array of DIMENSION at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' */
+/*           and at least */
+/*           ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. */
+/*           Before entry, the incremented array X must contain the */
+/*           vector x. */
+/*           Unchanged on exit. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  BETA   - DOUBLE PRECISION. */
+/*           On entry, BETA specifies the scalar beta. When BETA is */
+/*           supplied as zero then Y need not be set on input. */
+/*           Unchanged on exit. */
+
+/*  Y      - DOUBLE PRECISION array of DIMENSION at least */
+/*           ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' */
+/*           and at least */
+/*           ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. */
+/*           Before entry with BETA non-zero, the incremented array Y */
+/*           must contain the vector y. On exit, Y is overwritten by the */
+/*           updated vector y. */
+
+/*  INCY   - INTEGER. */
+/*           On entry, INCY specifies the increment for the elements of */
+/*           Y. INCY must not be zero. */
+/*           Unchanged on exit. */
+
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --x;
+    --y;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(trans, "N") && ! lsame_(trans, "T") && ! lsame_(trans, "C")
+	    ) {
+	info = 1;
+    } else if (*m < 0) {
+	info = 2;
+    } else if (*n < 0) {
+	info = 3;
+    } else if (*lda < max(1,*m)) {
+	info = 6;
+    } else if (*incx == 0) {
+	info = 8;
+    } else if (*incy == 0) {
+	info = 11;
+    }
+    if (info != 0) {
+	xerbla_("DGEMV ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*m == 0 || *n == 0 || *alpha == 0. && *beta == 1.) {
+	return 0;
+    }
+
+/*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set */
+/*     up the start points in  X  and  Y. */
+
+    if (lsame_(trans, "N")) {
+	lenx = *n;
+	leny = *m;
+    } else {
+	lenx = *m;
+	leny = *n;
+    }
+    if (*incx > 0) {
+	kx = 1;
+    } else {
+	kx = 1 - (lenx - 1) * *incx;
+    }
+    if (*incy > 0) {
+	ky = 1;
+    } else {
+	ky = 1 - (leny - 1) * *incy;
+    }
+
+/*     Start the operations. In this version the elements of A are */
+/*     accessed sequentially with one pass through A. */
+
+/*     First form  y := beta*y. */
+
+    if (*beta != 1.) {
+	if (*incy == 1) {
+	    if (*beta == 0.) {
+		i__1 = leny;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[i__] = 0.;
+/* L10: */
+		}
+	    } else {
+		i__1 = leny;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[i__] = *beta * y[i__];
+/* L20: */
+		}
+	    }
+	} else {
+	    iy = ky;
+	    if (*beta == 0.) {
+		i__1 = leny;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[iy] = 0.;
+		    iy += *incy;
+/* L30: */
+		}
+	    } else {
+		i__1 = leny;
+		for (i__ = 1; i__ <= i__1; ++i__) {
+		    y[iy] = *beta * y[iy];
+		    iy += *incy;
+/* L40: */
+		}
+	    }
+	}
+    }
+    if (*alpha == 0.) {
+	return 0;
+    }
+    if (lsame_(trans, "N")) {
+
+/*        Form  y := alpha*A*x + y. */
+
+	jx = kx;
+	if (*incy == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		if (x[jx] != 0.) {
+		    temp = *alpha * x[jx];
+		    i__2 = *m;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			y[i__] += temp * a[i__ + j * a_dim1];
+/* L50: */
+		    }
+		}
+		jx += *incx;
+/* L60: */
+	    }
+	} else {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		if (x[jx] != 0.) {
+		    temp = *alpha * x[jx];
+		    iy = ky;
+		    i__2 = *m;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			y[iy] += temp * a[i__ + j * a_dim1];
+			iy += *incy;
+/* L70: */
+		    }
+		}
+		jx += *incx;
+/* L80: */
+	    }
+	}
+    } else {
+
+/*        Form  y := alpha*A'*x + y. */
+
+	jy = ky;
+	if (*incx == 1) {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp = 0.;
+		i__2 = *m;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    temp += a[i__ + j * a_dim1] * x[i__];
+/* L90: */
+		}
+		y[jy] += *alpha * temp;
+		jy += *incy;
+/* L100: */
+	    }
+	} else {
+	    i__1 = *n;
+	    for (j = 1; j <= i__1; ++j) {
+		temp = 0.;
+		ix = kx;
+		i__2 = *m;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    temp += a[i__ + j * a_dim1] * x[ix];
+		    ix += *incx;
+/* L110: */
+		}
+		y[jy] += *alpha * temp;
+		jy += *incy;
+/* L120: */
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of DGEMV . */
+
+} /* dgemv_ */

+ 161 - 0
min-dgels/additional/dgeqr2.c

@@ -0,0 +1,161 @@
+/* dgeqr2.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+
+/* Subroutine */ int dgeqr2_(integer *m, integer *n, doublereal *a, integer *
+	lda, doublereal *tau, doublereal *work, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3;
+
+    /* Local variables */
+    integer i__, k;
+    doublereal aii;
+    extern /* Subroutine */ int dlarf_(char *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *), dlarfp_(integer *, doublereal *, 
+	    doublereal *, integer *, doublereal *), xerbla_(char *, integer *);
+
+
+/*  -- LAPACK routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DGEQR2 computes a QR factorization of a real m by n matrix A: */
+/*  A = Q * R. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix A.  M >= 0. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix A.  N >= 0. */
+
+/*  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N) */
+/*          On entry, the m by n matrix A. */
+/*          On exit, the elements on and above the diagonal of the array */
+/*          contain the min(m,n) by n upper trapezoidal matrix R (R is */
+/*          upper triangular if m >= n); the elements below the diagonal, */
+/*          with the array TAU, represent the orthogonal matrix Q as a */
+/*          product of elementary reflectors (see Further Details). */
+
+/*  LDA     (input) INTEGER */
+/*          The leading dimension of the array A.  LDA >= max(1,M). */
+
+/*  TAU     (output) DOUBLE PRECISION array, dimension (min(M,N)) */
+/*          The scalar factors of the elementary reflectors (see Further */
+/*          Details). */
+
+/*  WORK    (workspace) DOUBLE PRECISION array, dimension (N) */
+
+/*  INFO    (output) INTEGER */
+/*          = 0: successful exit */
+/*          < 0: if INFO = -i, the i-th argument had an illegal value */
+
+/*  Further Details */
+/*  =============== */
+
+/*  The matrix Q is represented as a product of elementary reflectors */
+
+/*     Q = H(1) H(2) . . . H(k), where k = min(m,n). */
+
+/*  Each H(i) has the form */
+
+/*     H(i) = I - tau * v * v' */
+
+/*  where tau is a real scalar, and v is a real vector with */
+/*  v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), */
+/*  and tau in TAU(i). */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Test the input arguments */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --tau;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0) {
+	*info = -2;
+    } else if (*lda < max(1,*m)) {
+	*info = -4;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DGEQR2", &i__1);
+	return 0;
+    }
+
+    k = min(*m,*n);
+
+    i__1 = k;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+
+/*        Generate elementary reflector H(i) to annihilate A(i+1:m,i) */
+
+	i__2 = *m - i__ + 1;
+/* Computing MIN */
+	i__3 = i__ + 1;
+	dlarfp_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3, *m)+ i__ * a_dim1]
+, &c__1, &tau[i__]);
+	if (i__ < *n) {
+
+/*           Apply H(i) to A(i:m,i+1:n) from the left */
+
+	    aii = a[i__ + i__ * a_dim1];
+	    a[i__ + i__ * a_dim1] = 1.;
+	    i__2 = *m - i__ + 1;
+	    i__3 = *n - i__;
+	    dlarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &tau[
+		    i__], &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]);
+	    a[i__ + i__ * a_dim1] = aii;
+	}
+/* L10: */
+    }
+    return 0;
+
+/*     End of DGEQR2 */
+
+} /* dgeqr2_ */

+ 252 - 0
min-dgels/additional/dgeqrf.c

@@ -0,0 +1,252 @@
+/* dgeqrf.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static integer c_n1 = -1;
+static integer c__3 = 3;
+static integer c__2 = 2;
+
+/* Subroutine */ int dgeqrf_(integer *m, integer *n, doublereal *a, integer *
+	lda, doublereal *tau, doublereal *work, integer *lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+    /* Local variables */
+    integer i__, k, ib, nb, nx, iws, nbmin, iinfo;
+    extern /* Subroutine */ int dgeqr2_(integer *, integer *, doublereal *, 
+	    integer *, doublereal *, doublereal *, integer *), dlarfb_(char *, 
+	     char *, char *, char *, integer *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
+	    integer *, doublereal *, integer *), dlarft_(char *, char *, integer *, integer *, doublereal 
+	    *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
+	    integer *, integer *);
+    integer ldwork, lwkopt;
+    logical lquery;
+
+
+/*  -- LAPACK routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DGEQRF computes a QR factorization of a real M-by-N matrix A: */
+/*  A = Q * R. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix A.  M >= 0. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix A.  N >= 0. */
+
+/*  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N) */
+/*          On entry, the M-by-N matrix A. */
+/*          On exit, the elements on and above the diagonal of the array */
+/*          contain the min(M,N)-by-N upper trapezoidal matrix R (R is */
+/*          upper triangular if m >= n); the elements below the diagonal, */
+/*          with the array TAU, represent the orthogonal matrix Q as a */
+/*          product of min(m,n) elementary reflectors (see Further */
+/*          Details). */
+
+/*  LDA     (input) INTEGER */
+/*          The leading dimension of the array A.  LDA >= max(1,M). */
+
+/*  TAU     (output) DOUBLE PRECISION array, dimension (min(M,N)) */
+/*          The scalar factors of the elementary reflectors (see Further */
+/*          Details). */
+
+/*  WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) */
+/*          On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */
+
+/*  LWORK   (input) INTEGER */
+/*          The dimension of the array WORK.  LWORK >= max(1,N). */
+/*          For optimum performance LWORK >= N*NB, where NB is */
+/*          the optimal blocksize. */
+
+/*          If LWORK = -1, then a workspace query is assumed; the routine */
+/*          only calculates the optimal size of the WORK array, returns */
+/*          this value as the first entry of the WORK array, and no error */
+/*          message related to LWORK is issued by XERBLA. */
+
+/*  INFO    (output) INTEGER */
+/*          = 0:  successful exit */
+/*          < 0:  if INFO = -i, the i-th argument had an illegal value */
+
+/*  Further Details */
+/*  =============== */
+
+/*  The matrix Q is represented as a product of elementary reflectors */
+
+/*     Q = H(1) H(2) . . . H(k), where k = min(m,n). */
+
+/*  Each H(i) has the form */
+
+/*     H(i) = I - tau * v * v' */
+
+/*  where tau is a real scalar, and v is a real vector with */
+/*  v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), */
+/*  and tau in TAU(i). */
+
+/*  ===================================================================== */
+
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Test the input arguments */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --tau;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    nb = ilaenv_(&c__1, "DGEQRF", " ", m, n, &c_n1, &c_n1);
+    lwkopt = *n * nb;
+    work[1] = (doublereal) lwkopt;
+    lquery = *lwork == -1;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0) {
+	*info = -2;
+    } else if (*lda < max(1,*m)) {
+	*info = -4;
+    } else if (*lwork < max(1,*n) && ! lquery) {
+	*info = -7;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DGEQRF", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    k = min(*m,*n);
+    if (k == 0) {
+	work[1] = 1.;
+	return 0;
+    }
+
+    nbmin = 2;
+    nx = 0;
+    iws = *n;
+    if (nb > 1 && nb < k) {
+
+/*        Determine when to cross over from blocked to unblocked code. */
+
+/* Computing MAX */
+	i__1 = 0, i__2 = ilaenv_(&c__3, "DGEQRF", " ", m, n, &c_n1, &c_n1);
+	nx = max(i__1,i__2);
+	if (nx < k) {
+
+/*           Determine if workspace is large enough for blocked code. */
+
+	    ldwork = *n;
+	    iws = ldwork * nb;
+	    if (*lwork < iws) {
+
+/*              Not enough workspace to use optimal NB:  reduce NB and */
+/*              determine the minimum value of NB. */
+
+		nb = *lwork / ldwork;
+/* Computing MAX */
+		i__1 = 2, i__2 = ilaenv_(&c__2, "DGEQRF", " ", m, n, &c_n1, &
+			c_n1);
+		nbmin = max(i__1,i__2);
+	    }
+	}
+    }
+
+    if (nb >= nbmin && nb < k && nx < k) {
+
+/*        Use blocked code initially */
+
+	i__1 = k - nx;
+	i__2 = nb;
+	for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+	    i__3 = k - i__ + 1;
+	    ib = min(i__3,nb);
+
+/*           Compute the QR factorization of the current block */
+/*           A(i:m,i:i+ib-1) */
+
+	    i__3 = *m - i__ + 1;
+	    dgeqr2_(&i__3, &ib, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[
+		    1], &iinfo);
+	    if (i__ + ib <= *n) {
+
+/*              Form the triangular factor of the block reflector */
+/*              H = H(i) H(i+1) . . . H(i+ib-1) */
+
+		i__3 = *m - i__ + 1;
+		dlarft_("Forward", "Columnwise", &i__3, &ib, &a[i__ + i__ * 
+			a_dim1], lda, &tau[i__], &work[1], &ldwork);
+
+/*              Apply H' to A(i:m,i+ib:n) from the left */
+
+		i__3 = *m - i__ + 1;
+		i__4 = *n - i__ - ib + 1;
+		dlarfb_("Left", "Transpose", "Forward", "Columnwise", &i__3, &
+			i__4, &ib, &a[i__ + i__ * a_dim1], lda, &work[1], &
+			ldwork, &a[i__ + (i__ + ib) * a_dim1], lda, &work[ib 
+			+ 1], &ldwork);
+	    }
+/* L10: */
+	}
+    } else {
+	i__ = 1;
+    }
+
+/*     Use unblocked code to factor the last or only block. */
+
+    if (i__ <= k) {
+	i__2 = *m - i__ + 1;
+	i__1 = *n - i__ + 1;
+	dgeqr2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &tau[i__], &work[1]
+, &iinfo);
+    }
+
+    work[1] = (doublereal) iws;
+    return 0;
+
+/*     End of DGEQRF */
+
+} /* dgeqrf_ */

+ 194 - 0
min-dgels/additional/dger.c

@@ -0,0 +1,194 @@
+/* dger.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dger_(integer *m, integer *n, doublereal *alpha, 
+	doublereal *x, integer *incx, doublereal *y, integer *incy, 
+	doublereal *a, integer *lda)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2;
+
+    /* Local variables */
+    integer i__, j, ix, jy, kx, info;
+    doublereal temp;
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DGER   performs the rank 1 operation */
+
+/*     A := alpha*x*y' + A, */
+
+/*  where alpha is a scalar, x is an m element vector, y is an n element */
+/*  vector and A is an m by n matrix. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  M      - INTEGER. */
+/*           On entry, M specifies the number of rows of the matrix A. */
+/*           M must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the number of columns of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  ALPHA  - DOUBLE PRECISION. */
+/*           On entry, ALPHA specifies the scalar alpha. */
+/*           Unchanged on exit. */
+
+/*  X      - DOUBLE PRECISION array of dimension at least */
+/*           ( 1 + ( m - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the m */
+/*           element vector x. */
+/*           Unchanged on exit. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+/*  Y      - DOUBLE PRECISION array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCY ) ). */
+/*           Before entry, the incremented array Y must contain the n */
+/*           element vector y. */
+/*           Unchanged on exit. */
+
+/*  INCY   - INTEGER. */
+/*           On entry, INCY specifies the increment for the elements of */
+/*           Y. INCY must not be zero. */
+/*           Unchanged on exit. */
+
+/*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
+/*           Before entry, the leading m by n part of the array A must */
+/*           contain the matrix of coefficients. On exit, A is */
+/*           overwritten by the updated matrix. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           max( 1, m ). */
+/*           Unchanged on exit. */
+
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    --x;
+    --y;
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+
+    /* Function Body */
+    info = 0;
+    if (*m < 0) {
+	info = 1;
+    } else if (*n < 0) {
+	info = 2;
+    } else if (*incx == 0) {
+	info = 5;
+    } else if (*incy == 0) {
+	info = 7;
+    } else if (*lda < max(1,*m)) {
+	info = 9;
+    }
+    if (info != 0) {
+	xerbla_("DGER  ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*m == 0 || *n == 0 || *alpha == 0.) {
+	return 0;
+    }
+
+/*     Start the operations. In this version the elements of A are */
+/*     accessed sequentially with one pass through A. */
+
+    if (*incy > 0) {
+	jy = 1;
+    } else {
+	jy = 1 - (*n - 1) * *incy;
+    }
+    if (*incx == 1) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    if (y[jy] != 0.) {
+		temp = *alpha * y[jy];
+		i__2 = *m;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    a[i__ + j * a_dim1] += x[i__] * temp;
+/* L10: */
+		}
+	    }
+	    jy += *incy;
+/* L20: */
+	}
+    } else {
+	if (*incx > 0) {
+	    kx = 1;
+	} else {
+	    kx = 1 - (*m - 1) * *incx;
+	}
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    if (y[jy] != 0.) {
+		temp = *alpha * y[jy];
+		ix = kx;
+		i__2 = *m;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    a[i__ + j * a_dim1] += x[ix] * temp;
+		    ix += *incx;
+/* L30: */
+		}
+	    }
+	    jy += *incy;
+/* L40: */
+	}
+    }
+
+    return 0;
+
+/*     End of DGER  . */
+
+} /* dger_ */

+ 52 - 0
min-dgels/additional/disnan.c

@@ -0,0 +1,52 @@
+/* disnan.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+logical disnan_(doublereal *din)
+{
+    /* System generated locals */
+    logical ret_val;
+
+    /* Local variables */
+    extern logical dlaisnan_(doublereal *, doublereal *);
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DISNAN returns .TRUE. if its argument is NaN, and .FALSE. */
+/*  otherwise.  To be replaced by the Fortran 2003 intrinsic in the */
+/*  future. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  DIN      (input) DOUBLE PRECISION */
+/*          Input to test for NaN. */
+
+/*  ===================================================================== */
+
+/*  .. External Functions .. */
+/*  .. */
+/*  .. Executable Statements .. */
+    ret_val = dlaisnan_(din, din);
+    return ret_val;
+} /* disnan_ */

+ 72 - 0
min-dgels/additional/dlabad.c

@@ -0,0 +1,72 @@
+/* dlabad.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dlabad_(doublereal *small, doublereal *large)
+{
+    /* Builtin functions */
+    double d_lg10(doublereal *), sqrt(doublereal);
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DLABAD takes as input the values computed by DLAMCH for underflow and */
+/*  overflow, and returns the square root of each of these values if the */
+/*  log of LARGE is sufficiently large.  This subroutine is intended to */
+/*  identify machines with a large exponent range, such as the Crays, and */
+/*  redefine the underflow and overflow limits to be the square roots of */
+/*  the values computed by DLAMCH.  This subroutine is needed because */
+/*  DLAMCH does not compensate for poor arithmetic in the upper half of */
+/*  the exponent range, as is found on a Cray. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  SMALL   (input/output) DOUBLE PRECISION */
+/*          On entry, the underflow threshold as computed by DLAMCH. */
+/*          On exit, if LOG10(LARGE) is sufficiently large, the square */
+/*          root of SMALL, otherwise unchanged. */
+
+/*  LARGE   (input/output) DOUBLE PRECISION */
+/*          On entry, the overflow threshold as computed by DLAMCH. */
+/*          On exit, if LOG10(LARGE) is sufficiently large, the square */
+/*          root of LARGE, otherwise unchanged. */
+
+/*  ===================================================================== */
+
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     If it looks like we're on a Cray, take the square root of */
+/*     SMALL and LARGE to avoid overflow and underflow problems. */
+
+    if (d_lg10(large) > 2e3) {
+	*small = sqrt(*small);
+	*large = sqrt(*large);
+    }
+
+    return 0;
+
+/*     End of DLABAD */
+
+} /* dlabad_ */

+ 58 - 0
min-dgels/additional/dlaisnan.c

@@ -0,0 +1,58 @@
+/* dlaisnan.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+logical dlaisnan_(doublereal *din1, doublereal *din2)
+{
+    /* System generated locals */
+    logical ret_val;
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  This routine is not for general use.  It exists solely to avoid */
+/*  over-optimization in DISNAN. */
+
+/*  DLAISNAN checks for NaNs by comparing its two arguments for */
+/*  inequality.  NaN is the only floating-point value where NaN != NaN */
+/*  returns .TRUE.  To check for NaNs, pass the same variable as both */
+/*  arguments. */
+
+/*  A compiler must assume that the two arguments are */
+/*  not the same variable, and the test will not be optimized away. */
+/*  Interprocedural or whole-program optimization may delete this */
+/*  test.  The ISNAN functions will be replaced by the correct */
+/*  Fortran 03 intrinsic once the intrinsic is widely available. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  DIN1     (input) DOUBLE PRECISION */
+/*  DIN2     (input) DOUBLE PRECISION */
+/*          Two numbers to compare for inequality. */
+
+/*  ===================================================================== */
+
+/*  .. Executable Statements .. */
+    ret_val = *din1 != *din2;
+    return ret_val;
+} /* dlaisnan_ */

Разница между файлами не показана из-за своего большого размера
+ 1001 - 0
min-dgels/additional/dlamch.c


+ 199 - 0
min-dgels/additional/dlange.c

@@ -0,0 +1,199 @@
+/* dlange.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+
+doublereal dlange_(char *norm, integer *m, integer *n, doublereal *a, integer 
+	*lda, doublereal *work)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2;
+    doublereal ret_val, d__1, d__2, d__3;
+
+    /* Builtin functions */
+    double sqrt(doublereal);
+
+    /* Local variables */
+    integer i__, j;
+    doublereal sum, scale;
+    extern logical lsame_(char *, char *);
+    doublereal value;
+    extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, 
+	    doublereal *, doublereal *);
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DLANGE  returns the value of the one norm,  or the Frobenius norm, or */
+/*  the  infinity norm,  or the  element of  largest absolute value  of a */
+/*  real matrix A. */
+
+/*  Description */
+/*  =========== */
+
+/*  DLANGE returns the value */
+
+/*     DLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm' */
+/*              ( */
+/*              ( norm1(A),         NORM = '1', 'O' or 'o' */
+/*              ( */
+/*              ( normI(A),         NORM = 'I' or 'i' */
+/*              ( */
+/*              ( normF(A),         NORM = 'F', 'f', 'E' or 'e' */
+
+/*  where  norm1  denotes the  one norm of a matrix (maximum column sum), */
+/*  normI  denotes the  infinity norm  of a matrix  (maximum row sum) and */
+/*  normF  denotes the  Frobenius norm of a matrix (square root of sum of */
+/*  squares).  Note that  max(abs(A(i,j)))  is not a consistent matrix norm. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  NORM    (input) CHARACTER*1 */
+/*          Specifies the value to be returned in DLANGE as described */
+/*          above. */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix A.  M >= 0.  When M = 0, */
+/*          DLANGE is set to zero. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix A.  N >= 0.  When N = 0, */
+/*          DLANGE is set to zero. */
+
+/*  A       (input) DOUBLE PRECISION array, dimension (LDA,N) */
+/*          The m by n matrix A. */
+
+/*  LDA     (input) INTEGER */
+/*          The leading dimension of the array A.  LDA >= max(M,1). */
+
+/*  WORK    (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)), */
+/*          where LWORK >= M when NORM = 'I'; otherwise, WORK is not */
+/*          referenced. */
+
+/* ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --work;
+
+    /* Function Body */
+    if (min(*m,*n) == 0) {
+	value = 0.;
+    } else if (lsame_(norm, "M")) {
+
+/*        Find max(abs(A(i,j))). */
+
+	value = 0.;
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *m;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+/* Computing MAX */
+		d__2 = value, d__3 = (d__1 = a[i__ + j * a_dim1], abs(d__1));
+		value = max(d__2,d__3);
+/* L10: */
+	    }
+/* L20: */
+	}
+    } else if (lsame_(norm, "O") || *(unsigned char *)
+	    norm == '1') {
+
+/*        Find norm1(A). */
+
+	value = 0.;
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    sum = 0.;
+	    i__2 = *m;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		sum += (d__1 = a[i__ + j * a_dim1], abs(d__1));
+/* L30: */
+	    }
+	    value = max(value,sum);
+/* L40: */
+	}
+    } else if (lsame_(norm, "I")) {
+
+/*        Find normI(A). */
+
+	i__1 = *m;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    work[i__] = 0.;
+/* L50: */
+	}
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *m;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		work[i__] += (d__1 = a[i__ + j * a_dim1], abs(d__1));
+/* L60: */
+	    }
+/* L70: */
+	}
+	value = 0.;
+	i__1 = *m;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* Computing MAX */
+	    d__1 = value, d__2 = work[i__];
+	    value = max(d__1,d__2);
+/* L80: */
+	}
+    } else if (lsame_(norm, "F") || lsame_(norm, "E")) {
+
+/*        Find normF(A). */
+
+	scale = 0.;
+	sum = 1.;
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    dlassq_(m, &a[j * a_dim1 + 1], &c__1, &scale, &sum);
+/* L90: */
+	}
+	value = scale * sqrt(sum);
+    }
+
+    ret_val = value;
+    return ret_val;
+
+/*     End of DLANGE */
+
+} /* dlange_ */

+ 73 - 0
min-dgels/additional/dlapy2.c

@@ -0,0 +1,73 @@
+/* dlapy2.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+doublereal dlapy2_(doublereal *x, doublereal *y)
+{
+    /* System generated locals */
+    doublereal ret_val, d__1;
+
+    /* Builtin functions */
+    double sqrt(doublereal);
+
+    /* Local variables */
+    doublereal w, z__, xabs, yabs;
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DLAPY2 returns sqrt(x**2+y**2), taking care not to cause unnecessary */
+/*  overflow. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  X       (input) DOUBLE PRECISION */
+/*  Y       (input) DOUBLE PRECISION */
+/*          X and Y specify the values x and y. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+    xabs = abs(*x);
+    yabs = abs(*y);
+    w = max(xabs,yabs);
+    z__ = min(xabs,yabs);
+    if (z__ == 0.) {
+	ret_val = w;
+    } else {
+/* Computing 2nd power */
+	d__1 = z__ / w;
+	ret_val = w * sqrt(d__1 * d__1 + 1.);
+    }
+    return ret_val;
+
+/*     End of DLAPY2 */
+
+} /* dlapy2_ */

+ 193 - 0
min-dgels/additional/dlarf.c

@@ -0,0 +1,193 @@
+/* dlarf.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Table of constant values */
+
+static doublereal c_b4 = 1.;
+static doublereal c_b5 = 0.;
+static integer c__1 = 1;
+
+/* Subroutine */ int dlarf_(char *side, integer *m, integer *n, doublereal *v, 
+	 integer *incv, doublereal *tau, doublereal *c__, integer *ldc, 
+	doublereal *work)
+{
+    /* System generated locals */
+    integer c_dim1, c_offset;
+    doublereal d__1;
+
+    /* Local variables */
+    integer i__;
+    logical applyleft;
+    extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, 
+	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
+	    integer *);
+    extern logical lsame_(char *, char *);
+    extern /* Subroutine */ int dgemv_(char *, integer *, integer *, 
+	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
+	    doublereal *, doublereal *, integer *);
+    integer lastc, lastv;
+    extern integer iladlc_(integer *, integer *, doublereal *, integer *), 
+	    iladlr_(integer *, integer *, doublereal *, integer *);
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DLARF applies a real elementary reflector H to a real m by n matrix */
+/*  C, from either the left or the right. H is represented in the form */
+
+/*        H = I - tau * v * v' */
+
+/*  where tau is a real scalar and v is a real vector. */
+
+/*  If tau = 0, then H is taken to be the unit matrix. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  SIDE    (input) CHARACTER*1 */
+/*          = 'L': form  H * C */
+/*          = 'R': form  C * H */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix C. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix C. */
+
+/*  V       (input) DOUBLE PRECISION array, dimension */
+/*                     (1 + (M-1)*abs(INCV)) if SIDE = 'L' */
+/*                  or (1 + (N-1)*abs(INCV)) if SIDE = 'R' */
+/*          The vector v in the representation of H. V is not used if */
+/*          TAU = 0. */
+
+/*  INCV    (input) INTEGER */
+/*          The increment between elements of v. INCV <> 0. */
+
+/*  TAU     (input) DOUBLE PRECISION */
+/*          The value tau in the representation of H. */
+
+/*  C       (input/output) DOUBLE PRECISION array, dimension (LDC,N) */
+/*          On entry, the m by n matrix C. */
+/*          On exit, C is overwritten by the matrix H * C if SIDE = 'L', */
+/*          or C * H if SIDE = 'R'. */
+
+/*  LDC     (input) INTEGER */
+/*          The leading dimension of the array C. LDC >= max(1,M). */
+
+/*  WORK    (workspace) DOUBLE PRECISION array, dimension */
+/*                         (N) if SIDE = 'L' */
+/*                      or (M) if SIDE = 'R' */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+    /* Parameter adjustments */
+    --v;
+    c_dim1 = *ldc;
+    c_offset = 1 + c_dim1;
+    c__ -= c_offset;
+    --work;
+
+    /* Function Body */
+    applyleft = lsame_(side, "L");
+    lastv = 0;
+    lastc = 0;
+    if (*tau != 0.) {
+/*     Set up variables for scanning V.  LASTV begins pointing to the end */
+/*     of V. */
+	if (applyleft) {
+	    lastv = *m;
+	} else {
+	    lastv = *n;
+	}
+	if (*incv > 0) {
+	    i__ = (lastv - 1) * *incv + 1;
+	} else {
+	    i__ = 1;
+	}
+/*     Look for the last non-zero row in V. */
+	while(lastv > 0 && v[i__] == 0.) {
+	    --lastv;
+	    i__ -= *incv;
+	}
+	if (applyleft) {
+/*     Scan for the last non-zero column in C(1:lastv,:). */
+	    lastc = iladlc_(&lastv, n, &c__[c_offset], ldc);
+	} else {
+/*     Scan for the last non-zero row in C(:,1:lastv). */
+	    lastc = iladlr_(m, &lastv, &c__[c_offset], ldc);
+	}
+    }
+/*     Note that lastc.eq.0 renders the BLAS operations null; no special */
+/*     case is needed at this level. */
+    if (applyleft) {
+
+/*        Form  H * C */
+
+	if (lastv > 0) {
+
+/*           w(1:lastc,1) := C(1:lastv,1:lastc)' * v(1:lastv,1) */
+
+	    dgemv_("Transpose", &lastv, &lastc, &c_b4, &c__[c_offset], ldc, &
+		    v[1], incv, &c_b5, &work[1], &c__1);
+
+/*           C(1:lastv,1:lastc) := C(...) - v(1:lastv,1) * w(1:lastc,1)' */
+
+	    d__1 = -(*tau);
+	    dger_(&lastv, &lastc, &d__1, &v[1], incv, &work[1], &c__1, &c__[
+		    c_offset], ldc);
+	}
+    } else {
+
+/*        Form  C * H */
+
+	if (lastv > 0) {
+
+/*           w(1:lastc,1) := C(1:lastc,1:lastv) * v(1:lastv,1) */
+
+	    dgemv_("No transpose", &lastc, &lastv, &c_b4, &c__[c_offset], ldc, 
+		     &v[1], incv, &c_b5, &work[1], &c__1);
+
+/*           C(1:lastc,1:lastv) := C(...) - w(1:lastc,1) * v(1:lastv,1)' */
+
+	    d__1 = -(*tau);
+	    dger_(&lastc, &lastv, &d__1, &work[1], &c__1, &v[1], incv, &c__[
+		    c_offset], ldc);
+	}
+    }
+    return 0;
+
+/*     End of DLARF */
+
+} /* dlarf_ */

+ 774 - 0
min-dgels/additional/dlarfb.c

@@ -0,0 +1,774 @@
+/* dlarfb.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static doublereal c_b14 = 1.;
+static doublereal c_b25 = -1.;
+
+/* Subroutine */ int dlarfb_(char *side, char *trans, char *direct, char *
+	storev, integer *m, integer *n, integer *k, doublereal *v, integer *
+	ldv, doublereal *t, integer *ldt, doublereal *c__, integer *ldc, 
+	doublereal *work, integer *ldwork)
+{
+    /* System generated locals */
+    integer c_dim1, c_offset, t_dim1, t_offset, v_dim1, v_offset, work_dim1, 
+	    work_offset, i__1, i__2;
+
+    /* Local variables */
+    integer i__, j;
+    extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, 
+	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
+	    integer *, doublereal *, doublereal *, integer *);
+    extern logical lsame_(char *, char *);
+    integer lastc;
+    extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, 
+	    doublereal *, integer *), dtrmm_(char *, char *, char *, char *, 
+	    integer *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, integer *);
+    integer lastv;
+    extern integer iladlc_(integer *, integer *, doublereal *, integer *), 
+	    iladlr_(integer *, integer *, doublereal *, integer *);
+    char transt[1];
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DLARFB applies a real block reflector H or its transpose H' to a */
+/*  real m by n matrix C, from either the left or the right. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  SIDE    (input) CHARACTER*1 */
+/*          = 'L': apply H or H' from the Left */
+/*          = 'R': apply H or H' from the Right */
+
+/*  TRANS   (input) CHARACTER*1 */
+/*          = 'N': apply H (No transpose) */
+/*          = 'T': apply H' (Transpose) */
+
+/*  DIRECT  (input) CHARACTER*1 */
+/*          Indicates how H is formed from a product of elementary */
+/*          reflectors */
+/*          = 'F': H = H(1) H(2) . . . H(k) (Forward) */
+/*          = 'B': H = H(k) . . . H(2) H(1) (Backward) */
+
+/*  STOREV  (input) CHARACTER*1 */
+/*          Indicates how the vectors which define the elementary */
+/*          reflectors are stored: */
+/*          = 'C': Columnwise */
+/*          = 'R': Rowwise */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix C. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix C. */
+
+/*  K       (input) INTEGER */
+/*          The order of the matrix T (= the number of elementary */
+/*          reflectors whose product defines the block reflector). */
+
+/*  V       (input) DOUBLE PRECISION array, dimension */
+/*                                (LDV,K) if STOREV = 'C' */
+/*                                (LDV,M) if STOREV = 'R' and SIDE = 'L' */
+/*                                (LDV,N) if STOREV = 'R' and SIDE = 'R' */
+/*          The matrix V. See further details. */
+
+/*  LDV     (input) INTEGER */
+/*          The leading dimension of the array V. */
+/*          If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M); */
+/*          if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N); */
+/*          if STOREV = 'R', LDV >= K. */
+
+/*  T       (input) DOUBLE PRECISION array, dimension (LDT,K) */
+/*          The triangular k by k matrix T in the representation of the */
+/*          block reflector. */
+
+/*  LDT     (input) INTEGER */
+/*          The leading dimension of the array T. LDT >= K. */
+
+/*  C       (input/output) DOUBLE PRECISION array, dimension (LDC,N) */
+/*          On entry, the m by n matrix C. */
+/*          On exit, C is overwritten by H*C or H'*C or C*H or C*H'. */
+
+/*  LDC     (input) INTEGER */
+/*          The leading dimension of the array C. LDA >= max(1,M). */
+
+/*  WORK    (workspace) DOUBLE PRECISION array, dimension (LDWORK,K) */
+
+/*  LDWORK  (input) INTEGER */
+/*          The leading dimension of the array WORK. */
+/*          If SIDE = 'L', LDWORK >= max(1,N); */
+/*          if SIDE = 'R', LDWORK >= max(1,M). */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Quick return if possible */
+
+    /* Parameter adjustments */
+    v_dim1 = *ldv;
+    v_offset = 1 + v_dim1;
+    v -= v_offset;
+    t_dim1 = *ldt;
+    t_offset = 1 + t_dim1;
+    t -= t_offset;
+    c_dim1 = *ldc;
+    c_offset = 1 + c_dim1;
+    c__ -= c_offset;
+    work_dim1 = *ldwork;
+    work_offset = 1 + work_dim1;
+    work -= work_offset;
+
+    /* Function Body */
+    if (*m <= 0 || *n <= 0) {
+	return 0;
+    }
+
+    if (lsame_(trans, "N")) {
+	*(unsigned char *)transt = 'T';
+    } else {
+	*(unsigned char *)transt = 'N';
+    }
+
+    if (lsame_(storev, "C")) {
+
+	if (lsame_(direct, "F")) {
+
+/*           Let  V =  ( V1 )    (first K rows) */
+/*                     ( V2 ) */
+/*           where  V1  is unit lower triangular. */
+
+	    if (lsame_(side, "L")) {
+
+/*              Form  H * C  or  H' * C  where  C = ( C1 ) */
+/*                                                  ( C2 ) */
+
+/* Computing MAX */
+		i__1 = *k, i__2 = iladlr_(m, k, &v[v_offset], ldv);
+		lastv = max(i__1,i__2);
+		lastc = iladlc_(&lastv, n, &c__[c_offset], ldc);
+
+/*              W := C' * V  =  (C1'*V1 + C2'*V2)  (stored in WORK) */
+
+/*              W := C1' */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    dcopy_(&lastc, &c__[j + c_dim1], ldc, &work[j * work_dim1 
+			    + 1], &c__1);
+/* L10: */
+		}
+
+/*              W := W * V1 */
+
+		dtrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+			c_b14, &v[v_offset], ldv, &work[work_offset], ldwork);
+		if (lastv > *k) {
+
+/*                 W := W + C2'*V2 */
+
+		    i__1 = lastv - *k;
+		    dgemm_("Transpose", "No transpose", &lastc, k, &i__1, &
+			    c_b14, &c__[*k + 1 + c_dim1], ldc, &v[*k + 1 + 
+			    v_dim1], ldv, &c_b14, &work[work_offset], ldwork);
+		}
+
+/*              W := W * T'  or  W * T */
+
+		dtrmm_("Right", "Upper", transt, "Non-unit", &lastc, k, &
+			c_b14, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/*              C := C - V * W' */
+
+		if (lastv > *k) {
+
+/*                 C2 := C2 - V2 * W' */
+
+		    i__1 = lastv - *k;
+		    dgemm_("No transpose", "Transpose", &i__1, &lastc, k, &
+			    c_b25, &v[*k + 1 + v_dim1], ldv, &work[
+			    work_offset], ldwork, &c_b14, &c__[*k + 1 + 
+			    c_dim1], ldc);
+		}
+
+/*              W := W * V1' */
+
+		dtrmm_("Right", "Lower", "Transpose", "Unit", &lastc, k, &
+			c_b14, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/*              C1 := C1 - W' */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    i__2 = lastc;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			c__[j + i__ * c_dim1] -= work[i__ + j * work_dim1];
+/* L20: */
+		    }
+/* L30: */
+		}
+
+	    } else if (lsame_(side, "R")) {
+
+/*              Form  C * H  or  C * H'  where  C = ( C1  C2 ) */
+
+/* Computing MAX */
+		i__1 = *k, i__2 = iladlr_(n, k, &v[v_offset], ldv);
+		lastv = max(i__1,i__2);
+		lastc = iladlr_(m, &lastv, &c__[c_offset], ldc);
+
+/*              W := C * V  =  (C1*V1 + C2*V2)  (stored in WORK) */
+
+/*              W := C1 */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    dcopy_(&lastc, &c__[j * c_dim1 + 1], &c__1, &work[j * 
+			    work_dim1 + 1], &c__1);
+/* L40: */
+		}
+
+/*              W := W * V1 */
+
+		dtrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+			c_b14, &v[v_offset], ldv, &work[work_offset], ldwork);
+		if (lastv > *k) {
+
+/*                 W := W + C2 * V2 */
+
+		    i__1 = lastv - *k;
+		    dgemm_("No transpose", "No transpose", &lastc, k, &i__1, &
+			    c_b14, &c__[(*k + 1) * c_dim1 + 1], ldc, &v[*k + 
+			    1 + v_dim1], ldv, &c_b14, &work[work_offset], 
+			    ldwork);
+		}
+
+/*              W := W * T  or  W * T' */
+
+		dtrmm_("Right", "Upper", trans, "Non-unit", &lastc, k, &c_b14, 
+			 &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/*              C := C - W * V' */
+
+		if (lastv > *k) {
+
+/*                 C2 := C2 - W * V2' */
+
+		    i__1 = lastv - *k;
+		    dgemm_("No transpose", "Transpose", &lastc, &i__1, k, &
+			    c_b25, &work[work_offset], ldwork, &v[*k + 1 + 
+			    v_dim1], ldv, &c_b14, &c__[(*k + 1) * c_dim1 + 1], 
+			     ldc);
+		}
+
+/*              W := W * V1' */
+
+		dtrmm_("Right", "Lower", "Transpose", "Unit", &lastc, k, &
+			c_b14, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/*              C1 := C1 - W */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    i__2 = lastc;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			c__[i__ + j * c_dim1] -= work[i__ + j * work_dim1];
+/* L50: */
+		    }
+/* L60: */
+		}
+	    }
+
+	} else {
+
+/*           Let  V =  ( V1 ) */
+/*                     ( V2 )    (last K rows) */
+/*           where  V2  is unit upper triangular. */
+
+	    if (lsame_(side, "L")) {
+
+/*              Form  H * C  or  H' * C  where  C = ( C1 ) */
+/*                                                  ( C2 ) */
+
+/* Computing MAX */
+		i__1 = *k, i__2 = iladlr_(m, k, &v[v_offset], ldv);
+		lastv = max(i__1,i__2);
+		lastc = iladlc_(&lastv, n, &c__[c_offset], ldc);
+
+/*              W := C' * V  =  (C1'*V1 + C2'*V2)  (stored in WORK) */
+
+/*              W := C2' */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    dcopy_(&lastc, &c__[lastv - *k + j + c_dim1], ldc, &work[
+			    j * work_dim1 + 1], &c__1);
+/* L70: */
+		}
+
+/*              W := W * V2 */
+
+		dtrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+			c_b14, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+			work_offset], ldwork);
+		if (lastv > *k) {
+
+/*                 W := W + C1'*V1 */
+
+		    i__1 = lastv - *k;
+		    dgemm_("Transpose", "No transpose", &lastc, k, &i__1, &
+			    c_b14, &c__[c_offset], ldc, &v[v_offset], ldv, &
+			    c_b14, &work[work_offset], ldwork);
+		}
+
+/*              W := W * T'  or  W * T */
+
+		dtrmm_("Right", "Lower", transt, "Non-unit", &lastc, k, &
+			c_b14, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/*              C := C - V * W' */
+
+		if (lastv > *k) {
+
+/*                 C1 := C1 - V1 * W' */
+
+		    i__1 = lastv - *k;
+		    dgemm_("No transpose", "Transpose", &i__1, &lastc, k, &
+			    c_b25, &v[v_offset], ldv, &work[work_offset], 
+			    ldwork, &c_b14, &c__[c_offset], ldc);
+		}
+
+/*              W := W * V2' */
+
+		dtrmm_("Right", "Upper", "Transpose", "Unit", &lastc, k, &
+			c_b14, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+			work_offset], ldwork);
+
+/*              C2 := C2 - W' */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    i__2 = lastc;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			c__[lastv - *k + j + i__ * c_dim1] -= work[i__ + j * 
+				work_dim1];
+/* L80: */
+		    }
+/* L90: */
+		}
+
+	    } else if (lsame_(side, "R")) {
+
+/*              Form  C * H  or  C * H'  where  C = ( C1  C2 ) */
+
+/* Computing MAX */
+		i__1 = *k, i__2 = iladlr_(n, k, &v[v_offset], ldv);
+		lastv = max(i__1,i__2);
+		lastc = iladlr_(m, &lastv, &c__[c_offset], ldc);
+
+/*              W := C * V  =  (C1*V1 + C2*V2)  (stored in WORK) */
+
+/*              W := C2 */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    dcopy_(&lastc, &c__[(*n - *k + j) * c_dim1 + 1], &c__1, &
+			    work[j * work_dim1 + 1], &c__1);
+/* L100: */
+		}
+
+/*              W := W * V2 */
+
+		dtrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+			c_b14, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+			work_offset], ldwork);
+		if (lastv > *k) {
+
+/*                 W := W + C1 * V1 */
+
+		    i__1 = lastv - *k;
+		    dgemm_("No transpose", "No transpose", &lastc, k, &i__1, &
+			    c_b14, &c__[c_offset], ldc, &v[v_offset], ldv, &
+			    c_b14, &work[work_offset], ldwork);
+		}
+
+/*              W := W * T  or  W * T' */
+
+		dtrmm_("Right", "Lower", trans, "Non-unit", &lastc, k, &c_b14, 
+			 &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/*              C := C - W * V' */
+
+		if (lastv > *k) {
+
+/*                 C1 := C1 - W * V1' */
+
+		    i__1 = lastv - *k;
+		    dgemm_("No transpose", "Transpose", &lastc, &i__1, k, &
+			    c_b25, &work[work_offset], ldwork, &v[v_offset], 
+			    ldv, &c_b14, &c__[c_offset], ldc);
+		}
+
+/*              W := W * V2' */
+
+		dtrmm_("Right", "Upper", "Transpose", "Unit", &lastc, k, &
+			c_b14, &v[lastv - *k + 1 + v_dim1], ldv, &work[
+			work_offset], ldwork);
+
+/*              C2 := C2 - W */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    i__2 = lastc;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			c__[i__ + (lastv - *k + j) * c_dim1] -= work[i__ + j *
+				 work_dim1];
+/* L110: */
+		    }
+/* L120: */
+		}
+	    }
+	}
+
+    } else if (lsame_(storev, "R")) {
+
+	if (lsame_(direct, "F")) {
+
+/*           Let  V =  ( V1  V2 )    (V1: first K columns) */
+/*           where  V1  is unit upper triangular. */
+
+	    if (lsame_(side, "L")) {
+
+/*              Form  H * C  or  H' * C  where  C = ( C1 ) */
+/*                                                  ( C2 ) */
+
+/* Computing MAX */
+		i__1 = *k, i__2 = iladlc_(k, m, &v[v_offset], ldv);
+		lastv = max(i__1,i__2);
+		lastc = iladlc_(&lastv, n, &c__[c_offset], ldc);
+
+/*              W := C' * V'  =  (C1'*V1' + C2'*V2') (stored in WORK) */
+
+/*              W := C1' */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    dcopy_(&lastc, &c__[j + c_dim1], ldc, &work[j * work_dim1 
+			    + 1], &c__1);
+/* L130: */
+		}
+
+/*              W := W * V1' */
+
+		dtrmm_("Right", "Upper", "Transpose", "Unit", &lastc, k, &
+			c_b14, &v[v_offset], ldv, &work[work_offset], ldwork);
+		if (lastv > *k) {
+
+/*                 W := W + C2'*V2' */
+
+		    i__1 = lastv - *k;
+		    dgemm_("Transpose", "Transpose", &lastc, k, &i__1, &c_b14, 
+			     &c__[*k + 1 + c_dim1], ldc, &v[(*k + 1) * v_dim1 
+			    + 1], ldv, &c_b14, &work[work_offset], ldwork);
+		}
+
+/*              W := W * T'  or  W * T */
+
+		dtrmm_("Right", "Upper", transt, "Non-unit", &lastc, k, &
+			c_b14, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/*              C := C - V' * W' */
+
+		if (lastv > *k) {
+
+/*                 C2 := C2 - V2' * W' */
+
+		    i__1 = lastv - *k;
+		    dgemm_("Transpose", "Transpose", &i__1, &lastc, k, &c_b25, 
+			     &v[(*k + 1) * v_dim1 + 1], ldv, &work[
+			    work_offset], ldwork, &c_b14, &c__[*k + 1 + 
+			    c_dim1], ldc);
+		}
+
+/*              W := W * V1 */
+
+		dtrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+			c_b14, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/*              C1 := C1 - W' */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    i__2 = lastc;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			c__[j + i__ * c_dim1] -= work[i__ + j * work_dim1];
+/* L140: */
+		    }
+/* L150: */
+		}
+
+	    } else if (lsame_(side, "R")) {
+
+/*              Form  C * H  or  C * H'  where  C = ( C1  C2 ) */
+
+/* Computing MAX */
+		i__1 = *k, i__2 = iladlc_(k, n, &v[v_offset], ldv);
+		lastv = max(i__1,i__2);
+		lastc = iladlr_(m, &lastv, &c__[c_offset], ldc);
+
+/*              W := C * V'  =  (C1*V1' + C2*V2')  (stored in WORK) */
+
+/*              W := C1 */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    dcopy_(&lastc, &c__[j * c_dim1 + 1], &c__1, &work[j * 
+			    work_dim1 + 1], &c__1);
+/* L160: */
+		}
+
+/*              W := W * V1' */
+
+		dtrmm_("Right", "Upper", "Transpose", "Unit", &lastc, k, &
+			c_b14, &v[v_offset], ldv, &work[work_offset], ldwork);
+		if (lastv > *k) {
+
+/*                 W := W + C2 * V2' */
+
+		    i__1 = lastv - *k;
+		    dgemm_("No transpose", "Transpose", &lastc, k, &i__1, &
+			    c_b14, &c__[(*k + 1) * c_dim1 + 1], ldc, &v[(*k + 
+			    1) * v_dim1 + 1], ldv, &c_b14, &work[work_offset], 
+			     ldwork);
+		}
+
+/*              W := W * T  or  W * T' */
+
+		dtrmm_("Right", "Upper", trans, "Non-unit", &lastc, k, &c_b14, 
+			 &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/*              C := C - W * V */
+
+		if (lastv > *k) {
+
+/*                 C2 := C2 - W * V2 */
+
+		    i__1 = lastv - *k;
+		    dgemm_("No transpose", "No transpose", &lastc, &i__1, k, &
+			    c_b25, &work[work_offset], ldwork, &v[(*k + 1) * 
+			    v_dim1 + 1], ldv, &c_b14, &c__[(*k + 1) * c_dim1 
+			    + 1], ldc);
+		}
+
+/*              W := W * V1 */
+
+		dtrmm_("Right", "Upper", "No transpose", "Unit", &lastc, k, &
+			c_b14, &v[v_offset], ldv, &work[work_offset], ldwork);
+
+/*              C1 := C1 - W */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    i__2 = lastc;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			c__[i__ + j * c_dim1] -= work[i__ + j * work_dim1];
+/* L170: */
+		    }
+/* L180: */
+		}
+
+	    }
+
+	} else {
+
+/*           Let  V =  ( V1  V2 )    (V2: last K columns) */
+/*           where  V2  is unit lower triangular. */
+
+	    if (lsame_(side, "L")) {
+
+/*              Form  H * C  or  H' * C  where  C = ( C1 ) */
+/*                                                  ( C2 ) */
+
+/* Computing MAX */
+		i__1 = *k, i__2 = iladlc_(k, m, &v[v_offset], ldv);
+		lastv = max(i__1,i__2);
+		lastc = iladlc_(&lastv, n, &c__[c_offset], ldc);
+
+/*              W := C' * V'  =  (C1'*V1' + C2'*V2') (stored in WORK) */
+
+/*              W := C2' */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    dcopy_(&lastc, &c__[lastv - *k + j + c_dim1], ldc, &work[
+			    j * work_dim1 + 1], &c__1);
+/* L190: */
+		}
+
+/*              W := W * V2' */
+
+		dtrmm_("Right", "Lower", "Transpose", "Unit", &lastc, k, &
+			c_b14, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+			work_offset], ldwork);
+		if (lastv > *k) {
+
+/*                 W := W + C1'*V1' */
+
+		    i__1 = lastv - *k;
+		    dgemm_("Transpose", "Transpose", &lastc, k, &i__1, &c_b14, 
+			     &c__[c_offset], ldc, &v[v_offset], ldv, &c_b14, &
+			    work[work_offset], ldwork);
+		}
+
+/*              W := W * T'  or  W * T */
+
+		dtrmm_("Right", "Lower", transt, "Non-unit", &lastc, k, &
+			c_b14, &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/*              C := C - V' * W' */
+
+		if (lastv > *k) {
+
+/*                 C1 := C1 - V1' * W' */
+
+		    i__1 = lastv - *k;
+		    dgemm_("Transpose", "Transpose", &i__1, &lastc, k, &c_b25, 
+			     &v[v_offset], ldv, &work[work_offset], ldwork, &
+			    c_b14, &c__[c_offset], ldc);
+		}
+
+/*              W := W * V2 */
+
+		dtrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+			c_b14, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+			work_offset], ldwork);
+
+/*              C2 := C2 - W' */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    i__2 = lastc;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			c__[lastv - *k + j + i__ * c_dim1] -= work[i__ + j * 
+				work_dim1];
+/* L200: */
+		    }
+/* L210: */
+		}
+
+	    } else if (lsame_(side, "R")) {
+
+/*              Form  C * H  or  C * H'  where  C = ( C1  C2 ) */
+
+/* Computing MAX */
+		i__1 = *k, i__2 = iladlc_(k, n, &v[v_offset], ldv);
+		lastv = max(i__1,i__2);
+		lastc = iladlr_(m, &lastv, &c__[c_offset], ldc);
+
+/*              W := C * V'  =  (C1*V1' + C2*V2')  (stored in WORK) */
+
+/*              W := C2 */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    dcopy_(&lastc, &c__[(lastv - *k + j) * c_dim1 + 1], &c__1, 
+			     &work[j * work_dim1 + 1], &c__1);
+/* L220: */
+		}
+
+/*              W := W * V2' */
+
+		dtrmm_("Right", "Lower", "Transpose", "Unit", &lastc, k, &
+			c_b14, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+			work_offset], ldwork);
+		if (lastv > *k) {
+
+/*                 W := W + C1 * V1' */
+
+		    i__1 = lastv - *k;
+		    dgemm_("No transpose", "Transpose", &lastc, k, &i__1, &
+			    c_b14, &c__[c_offset], ldc, &v[v_offset], ldv, &
+			    c_b14, &work[work_offset], ldwork);
+		}
+
+/*              W := W * T  or  W * T' */
+
+		dtrmm_("Right", "Lower", trans, "Non-unit", &lastc, k, &c_b14, 
+			 &t[t_offset], ldt, &work[work_offset], ldwork);
+
+/*              C := C - W * V */
+
+		if (lastv > *k) {
+
+/*                 C1 := C1 - W * V1 */
+
+		    i__1 = lastv - *k;
+		    dgemm_("No transpose", "No transpose", &lastc, &i__1, k, &
+			    c_b25, &work[work_offset], ldwork, &v[v_offset], 
+			    ldv, &c_b14, &c__[c_offset], ldc);
+		}
+
+/*              W := W * V2 */
+
+		dtrmm_("Right", "Lower", "No transpose", "Unit", &lastc, k, &
+			c_b14, &v[(lastv - *k + 1) * v_dim1 + 1], ldv, &work[
+			work_offset], ldwork);
+
+/*              C1 := C1 - W */
+
+		i__1 = *k;
+		for (j = 1; j <= i__1; ++j) {
+		    i__2 = lastc;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			c__[i__ + (lastv - *k + j) * c_dim1] -= work[i__ + j *
+				 work_dim1];
+/* L230: */
+		    }
+/* L240: */
+		}
+
+	    }
+
+	}
+    }
+
+    return 0;
+
+/*     End of DLARFB */
+
+} /* dlarfb_ */

+ 170 - 0
min-dgels/additional/dlarfg.c

@@ -0,0 +1,170 @@
+/* dlarfg.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dlarfg_(integer *n, doublereal *alpha, doublereal *x, 
+	integer *incx, doublereal *tau)
+{
+    /* System generated locals */
+    integer i__1;
+    doublereal d__1;
+
+    /* Builtin functions */
+    double d_sign(doublereal *, doublereal *);
+
+    /* Local variables */
+    integer j, knt;
+    doublereal beta;
+    extern doublereal dnrm2_(integer *, doublereal *, integer *);
+    extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, 
+	    integer *);
+    doublereal xnorm;
+    extern doublereal dlapy2_(doublereal *, doublereal *), dlamch_(char *);
+    doublereal safmin, rsafmn;
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DLARFG generates a real elementary reflector H of order n, such */
+/*  that */
+
+/*        H * ( alpha ) = ( beta ),   H' * H = I. */
+/*            (   x   )   (   0  ) */
+
+/*  where alpha and beta are scalars, and x is an (n-1)-element real */
+/*  vector. H is represented in the form */
+
+/*        H = I - tau * ( 1 ) * ( 1 v' ) , */
+/*                      ( v ) */
+
+/*  where tau is a real scalar and v is a real (n-1)-element */
+/*  vector. */
+
+/*  If the elements of x are all zero, then tau = 0 and H is taken to be */
+/*  the unit matrix. */
+
+/*  Otherwise  1 <= tau <= 2. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  N       (input) INTEGER */
+/*          The order of the elementary reflector. */
+
+/*  ALPHA   (input/output) DOUBLE PRECISION */
+/*          On entry, the value alpha. */
+/*          On exit, it is overwritten with the value beta. */
+
+/*  X       (input/output) DOUBLE PRECISION array, dimension */
+/*                         (1+(N-2)*abs(INCX)) */
+/*          On entry, the vector x. */
+/*          On exit, it is overwritten with the vector v. */
+
+/*  INCX    (input) INTEGER */
+/*          The increment between elements of X. INCX > 0. */
+
+/*  TAU     (output) DOUBLE PRECISION */
+/*          The value tau. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+    /* Parameter adjustments */
+    --x;
+
+    /* Function Body */
+    if (*n <= 1) {
+	*tau = 0.;
+	return 0;
+    }
+
+    i__1 = *n - 1;
+    xnorm = dnrm2_(&i__1, &x[1], incx);
+
+    if (xnorm == 0.) {
+
+/*        H  =  I */
+
+	*tau = 0.;
+    } else {
+
+/*        general case */
+
+	d__1 = dlapy2_(alpha, &xnorm);
+	beta = -d_sign(&d__1, alpha);
+	safmin = dlamch_("S") / dlamch_("E");
+	knt = 0;
+	if (abs(beta) < safmin) {
+
+/*           XNORM, BETA may be inaccurate; scale X and recompute them */
+
+	    rsafmn = 1. / safmin;
+L10:
+	    ++knt;
+	    i__1 = *n - 1;
+	    dscal_(&i__1, &rsafmn, &x[1], incx);
+	    beta *= rsafmn;
+	    *alpha *= rsafmn;
+	    if (abs(beta) < safmin) {
+		goto L10;
+	    }
+
+/*           New BETA is at most 1, at least SAFMIN */
+
+	    i__1 = *n - 1;
+	    xnorm = dnrm2_(&i__1, &x[1], incx);
+	    d__1 = dlapy2_(alpha, &xnorm);
+	    beta = -d_sign(&d__1, alpha);
+	}
+	*tau = (beta - *alpha) / beta;
+	i__1 = *n - 1;
+	d__1 = 1. / (*alpha - beta);
+	dscal_(&i__1, &d__1, &x[1], incx);
+
+/*        If ALPHA is subnormal, it may lose relative accuracy */
+
+	i__1 = knt;
+	for (j = 1; j <= i__1; ++j) {
+	    beta *= safmin;
+/* L20: */
+	}
+	*alpha = beta;
+    }
+
+    return 0;
+
+/*     End of DLARFG */
+
+} /* dlarfg_ */

+ 192 - 0
min-dgels/additional/dlarfp.c

@@ -0,0 +1,192 @@
+/* dlarfp.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dlarfp_(integer *n, doublereal *alpha, doublereal *x, 
+	integer *incx, doublereal *tau)
+{
+    /* System generated locals */
+    integer i__1;
+    doublereal d__1;
+
+    /* Builtin functions */
+    double d_sign(doublereal *, doublereal *);
+
+    /* Local variables */
+    integer j, knt;
+    doublereal beta;
+    extern doublereal dnrm2_(integer *, doublereal *, integer *);
+    extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, 
+	    integer *);
+    doublereal xnorm;
+    extern doublereal dlapy2_(doublereal *, doublereal *), dlamch_(char *);
+    doublereal safmin, rsafmn;
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DLARFP generates a real elementary reflector H of order n, such */
+/*  that */
+
+/*        H * ( alpha ) = ( beta ),   H' * H = I. */
+/*            (   x   )   (   0  ) */
+
+/*  where alpha and beta are scalars, beta is non-negative, and x is */
+/*  an (n-1)-element real vector.  H is represented in the form */
+
+/*        H = I - tau * ( 1 ) * ( 1 v' ) , */
+/*                      ( v ) */
+
+/*  where tau is a real scalar and v is a real (n-1)-element */
+/*  vector. */
+
+/*  If the elements of x are all zero, then tau = 0 and H is taken to be */
+/*  the unit matrix. */
+
+/*  Otherwise  1 <= tau <= 2. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  N       (input) INTEGER */
+/*          The order of the elementary reflector. */
+
+/*  ALPHA   (input/output) DOUBLE PRECISION */
+/*          On entry, the value alpha. */
+/*          On exit, it is overwritten with the value beta. */
+
+/*  X       (input/output) DOUBLE PRECISION array, dimension */
+/*                         (1+(N-2)*abs(INCX)) */
+/*          On entry, the vector x. */
+/*          On exit, it is overwritten with the vector v. */
+
+/*  INCX    (input) INTEGER */
+/*          The increment between elements of X. INCX > 0. */
+
+/*  TAU     (output) DOUBLE PRECISION */
+/*          The value tau. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+    /* Parameter adjustments */
+    --x;
+
+    /* Function Body */
+    if (*n <= 0) {
+	*tau = 0.;
+	return 0;
+    }
+
+    i__1 = *n - 1;
+    xnorm = dnrm2_(&i__1, &x[1], incx);
+
+    if (xnorm == 0.) {
+
+/*        H  =  [+/-1, 0; I], sign chosen so ALPHA >= 0 */
+
+	if (*alpha >= 0.) {
+/*           When TAU.eq.ZERO, the vector is special-cased to be */
+/*           all zeros in the application routines.  We do not need */
+/*           to clear it. */
+	    *tau = 0.;
+	} else {
+/*           However, the application routines rely on explicit */
+/*           zero checks when TAU.ne.ZERO, and we must clear X. */
+	    *tau = 2.;
+	    i__1 = *n - 1;
+	    for (j = 1; j <= i__1; ++j) {
+		x[(j - 1) * *incx + 1] = 0.;
+	    }
+	    *alpha = -(*alpha);
+	}
+    } else {
+
+/*        general case */
+
+	d__1 = dlapy2_(alpha, &xnorm);
+	beta = d_sign(&d__1, alpha);
+	safmin = dlamch_("S") / dlamch_("E");
+	knt = 0;
+	if (abs(beta) < safmin) {
+
+/*           XNORM, BETA may be inaccurate; scale X and recompute them */
+
+	    rsafmn = 1. / safmin;
+L10:
+	    ++knt;
+	    i__1 = *n - 1;
+	    dscal_(&i__1, &rsafmn, &x[1], incx);
+	    beta *= rsafmn;
+	    *alpha *= rsafmn;
+	    if (abs(beta) < safmin) {
+		goto L10;
+	    }
+
+/*           New BETA is at most 1, at least SAFMIN */
+
+	    i__1 = *n - 1;
+	    xnorm = dnrm2_(&i__1, &x[1], incx);
+	    d__1 = dlapy2_(alpha, &xnorm);
+	    beta = d_sign(&d__1, alpha);
+	}
+	*alpha += beta;
+	if (beta < 0.) {
+	    beta = -beta;
+	    *tau = -(*alpha) / beta;
+	} else {
+	    *alpha = xnorm * (xnorm / *alpha);
+	    *tau = *alpha / beta;
+	    *alpha = -(*alpha);
+	}
+	i__1 = *n - 1;
+	d__1 = 1. / *alpha;
+	dscal_(&i__1, &d__1, &x[1], incx);
+
+/*        If BETA is subnormal, it may lose relative accuracy */
+
+	i__1 = knt;
+	for (j = 1; j <= i__1; ++j) {
+	    beta *= safmin;
+/* L20: */
+	}
+	*alpha = beta;
+    }
+
+    return 0;
+
+/*     End of DLARFP */
+
+} /* dlarfp_ */

+ 325 - 0
min-dgels/additional/dlarft.c

@@ -0,0 +1,325 @@
+/* dlarft.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static doublereal c_b8 = 0.;
+
+/* Subroutine */ int dlarft_(char *direct, char *storev, integer *n, integer *
+	k, doublereal *v, integer *ldv, doublereal *tau, doublereal *t, 
+	integer *ldt)
+{
+    /* System generated locals */
+    integer t_dim1, t_offset, v_dim1, v_offset, i__1, i__2, i__3;
+    doublereal d__1;
+
+    /* Local variables */
+    integer i__, j, prevlastv;
+    doublereal vii;
+    extern logical lsame_(char *, char *);
+    extern /* Subroutine */ int dgemv_(char *, integer *, integer *, 
+	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
+	    doublereal *, doublereal *, integer *);
+    integer lastv;
+    extern /* Subroutine */ int dtrmv_(char *, char *, char *, integer *, 
+	    doublereal *, integer *, doublereal *, integer *);
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DLARFT forms the triangular factor T of a real block reflector H */
+/*  of order n, which is defined as a product of k elementary reflectors. */
+
+/*  If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular; */
+
+/*  If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular. */
+
+/*  If STOREV = 'C', the vector which defines the elementary reflector */
+/*  H(i) is stored in the i-th column of the array V, and */
+
+/*     H  =  I - V * T * V' */
+
+/*  If STOREV = 'R', the vector which defines the elementary reflector */
+/*  H(i) is stored in the i-th row of the array V, and */
+
+/*     H  =  I - V' * T * V */
+
+/*  Arguments */
+/*  ========= */
+
+/*  DIRECT  (input) CHARACTER*1 */
+/*          Specifies the order in which the elementary reflectors are */
+/*          multiplied to form the block reflector: */
+/*          = 'F': H = H(1) H(2) . . . H(k) (Forward) */
+/*          = 'B': H = H(k) . . . H(2) H(1) (Backward) */
+
+/*  STOREV  (input) CHARACTER*1 */
+/*          Specifies how the vectors which define the elementary */
+/*          reflectors are stored (see also Further Details): */
+/*          = 'C': columnwise */
+/*          = 'R': rowwise */
+
+/*  N       (input) INTEGER */
+/*          The order of the block reflector H. N >= 0. */
+
+/*  K       (input) INTEGER */
+/*          The order of the triangular factor T (= the number of */
+/*          elementary reflectors). K >= 1. */
+
+/*  V       (input/output) DOUBLE PRECISION array, dimension */
+/*                               (LDV,K) if STOREV = 'C' */
+/*                               (LDV,N) if STOREV = 'R' */
+/*          The matrix V. See further details. */
+
+/*  LDV     (input) INTEGER */
+/*          The leading dimension of the array V. */
+/*          If STOREV = 'C', LDV >= max(1,N); if STOREV = 'R', LDV >= K. */
+
+/*  TAU     (input) DOUBLE PRECISION array, dimension (K) */
+/*          TAU(i) must contain the scalar factor of the elementary */
+/*          reflector H(i). */
+
+/*  T       (output) DOUBLE PRECISION array, dimension (LDT,K) */
+/*          The k by k triangular factor T of the block reflector. */
+/*          If DIRECT = 'F', T is upper triangular; if DIRECT = 'B', T is */
+/*          lower triangular. The rest of the array is not used. */
+
+/*  LDT     (input) INTEGER */
+/*          The leading dimension of the array T. LDT >= K. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  The shape of the matrix V and the storage of the vectors which define */
+/*  the H(i) is best illustrated by the following example with n = 5 and */
+/*  k = 3. The elements equal to 1 are not stored; the corresponding */
+/*  array elements are modified but restored on exit. The rest of the */
+/*  array is not used. */
+
+/*  DIRECT = 'F' and STOREV = 'C':         DIRECT = 'F' and STOREV = 'R': */
+
+/*               V = (  1       )                 V = (  1 v1 v1 v1 v1 ) */
+/*                   ( v1  1    )                     (     1 v2 v2 v2 ) */
+/*                   ( v1 v2  1 )                     (        1 v3 v3 ) */
+/*                   ( v1 v2 v3 ) */
+/*                   ( v1 v2 v3 ) */
+
+/*  DIRECT = 'B' and STOREV = 'C':         DIRECT = 'B' and STOREV = 'R': */
+
+/*               V = ( v1 v2 v3 )                 V = ( v1 v1  1       ) */
+/*                   ( v1 v2 v3 )                     ( v2 v2 v2  1    ) */
+/*                   (  1 v2 v3 )                     ( v3 v3 v3 v3  1 ) */
+/*                   (     1 v3 ) */
+/*                   (        1 ) */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Quick return if possible */
+
+    /* Parameter adjustments */
+    v_dim1 = *ldv;
+    v_offset = 1 + v_dim1;
+    v -= v_offset;
+    --tau;
+    t_dim1 = *ldt;
+    t_offset = 1 + t_dim1;
+    t -= t_offset;
+
+    /* Function Body */
+    if (*n == 0) {
+	return 0;
+    }
+
+    if (lsame_(direct, "F")) {
+	prevlastv = *n;
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    prevlastv = max(i__,prevlastv);
+	    if (tau[i__] == 0.) {
+
+/*              H(i)  =  I */
+
+		i__2 = i__;
+		for (j = 1; j <= i__2; ++j) {
+		    t[j + i__ * t_dim1] = 0.;
+/* L10: */
+		}
+	    } else {
+
+/*              general case */
+
+		vii = v[i__ + i__ * v_dim1];
+		v[i__ + i__ * v_dim1] = 1.;
+		if (lsame_(storev, "C")) {
+/*                 Skip any trailing zeros. */
+		    i__2 = i__ + 1;
+		    for (lastv = *n; lastv >= i__2; --lastv) {
+			if (v[lastv + i__ * v_dim1] != 0.) {
+			    break;
+			}
+		    }
+		    j = min(lastv,prevlastv);
+
+/*                 T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)' * V(i:j,i) */
+
+		    i__2 = j - i__ + 1;
+		    i__3 = i__ - 1;
+		    d__1 = -tau[i__];
+		    dgemv_("Transpose", &i__2, &i__3, &d__1, &v[i__ + v_dim1], 
+			     ldv, &v[i__ + i__ * v_dim1], &c__1, &c_b8, &t[
+			    i__ * t_dim1 + 1], &c__1);
+		} else {
+/*                 Skip any trailing zeros. */
+		    i__2 = i__ + 1;
+		    for (lastv = *n; lastv >= i__2; --lastv) {
+			if (v[i__ + lastv * v_dim1] != 0.) {
+			    break;
+			}
+		    }
+		    j = min(lastv,prevlastv);
+
+/*                 T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)' */
+
+		    i__2 = i__ - 1;
+		    i__3 = j - i__ + 1;
+		    d__1 = -tau[i__];
+		    dgemv_("No transpose", &i__2, &i__3, &d__1, &v[i__ * 
+			    v_dim1 + 1], ldv, &v[i__ + i__ * v_dim1], ldv, &
+			    c_b8, &t[i__ * t_dim1 + 1], &c__1);
+		}
+		v[i__ + i__ * v_dim1] = vii;
+
+/*              T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i) */
+
+		i__2 = i__ - 1;
+		dtrmv_("Upper", "No transpose", "Non-unit", &i__2, &t[
+			t_offset], ldt, &t[i__ * t_dim1 + 1], &c__1);
+		t[i__ + i__ * t_dim1] = tau[i__];
+		if (i__ > 1) {
+		    prevlastv = max(prevlastv,lastv);
+		} else {
+		    prevlastv = lastv;
+		}
+	    }
+/* L20: */
+	}
+    } else {
+	prevlastv = 1;
+	for (i__ = *k; i__ >= 1; --i__) {
+	    if (tau[i__] == 0.) {
+
+/*              H(i)  =  I */
+
+		i__1 = *k;
+		for (j = i__; j <= i__1; ++j) {
+		    t[j + i__ * t_dim1] = 0.;
+/* L30: */
+		}
+	    } else {
+
+/*              general case */
+
+		if (i__ < *k) {
+		    if (lsame_(storev, "C")) {
+			vii = v[*n - *k + i__ + i__ * v_dim1];
+			v[*n - *k + i__ + i__ * v_dim1] = 1.;
+/*                    Skip any leading zeros. */
+			i__1 = i__ - 1;
+			for (lastv = 1; lastv <= i__1; ++lastv) {
+			    if (v[lastv + i__ * v_dim1] != 0.) {
+				break;
+			    }
+			}
+			j = max(lastv,prevlastv);
+
+/*                    T(i+1:k,i) := */
+/*                            - tau(i) * V(j:n-k+i,i+1:k)' * V(j:n-k+i,i) */
+
+			i__1 = *n - *k + i__ - j + 1;
+			i__2 = *k - i__;
+			d__1 = -tau[i__];
+			dgemv_("Transpose", &i__1, &i__2, &d__1, &v[j + (i__ 
+				+ 1) * v_dim1], ldv, &v[j + i__ * v_dim1], &
+				c__1, &c_b8, &t[i__ + 1 + i__ * t_dim1], &
+				c__1);
+			v[*n - *k + i__ + i__ * v_dim1] = vii;
+		    } else {
+			vii = v[i__ + (*n - *k + i__) * v_dim1];
+			v[i__ + (*n - *k + i__) * v_dim1] = 1.;
+/*                    Skip any leading zeros. */
+			i__1 = i__ - 1;
+			for (lastv = 1; lastv <= i__1; ++lastv) {
+			    if (v[i__ + lastv * v_dim1] != 0.) {
+				break;
+			    }
+			}
+			j = max(lastv,prevlastv);
+
+/*                    T(i+1:k,i) := */
+/*                            - tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)' */
+
+			i__1 = *k - i__;
+			i__2 = *n - *k + i__ - j + 1;
+			d__1 = -tau[i__];
+			dgemv_("No transpose", &i__1, &i__2, &d__1, &v[i__ + 
+				1 + j * v_dim1], ldv, &v[i__ + j * v_dim1], 
+				ldv, &c_b8, &t[i__ + 1 + i__ * t_dim1], &c__1);
+			v[i__ + (*n - *k + i__) * v_dim1] = vii;
+		    }
+
+/*                 T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i) */
+
+		    i__1 = *k - i__;
+		    dtrmv_("Lower", "No transpose", "Non-unit", &i__1, &t[i__ 
+			    + 1 + (i__ + 1) * t_dim1], ldt, &t[i__ + 1 + i__ *
+			     t_dim1], &c__1)
+			    ;
+		    if (i__ > 1) {
+			prevlastv = min(prevlastv,lastv);
+		    } else {
+			prevlastv = lastv;
+		    }
+		}
+		t[i__ + i__ * t_dim1] = tau[i__];
+	    }
+/* L40: */
+	}
+    }
+    return 0;
+
+/*     End of DLARFT */
+
+} /* dlarft_ */

+ 354 - 0
min-dgels/additional/dlascl.c

@@ -0,0 +1,354 @@
+/* dlascl.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dlascl_(char *type__, integer *kl, integer *ku, 
+	doublereal *cfrom, doublereal *cto, integer *m, integer *n, 
+	doublereal *a, integer *lda, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
+
+    /* Local variables */
+    integer i__, j, k1, k2, k3, k4;
+    doublereal mul, cto1;
+    logical done;
+    doublereal ctoc;
+    extern logical lsame_(char *, char *);
+    integer itype;
+    doublereal cfrom1;
+    extern doublereal dlamch_(char *);
+    doublereal cfromc;
+    extern logical disnan_(doublereal *);
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    doublereal bignum, smlnum;
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DLASCL multiplies the M by N real matrix A by the real scalar */
+/*  CTO/CFROM.  This is done without over/underflow as long as the final */
+/*  result CTO*A(I,J)/CFROM does not over/underflow. TYPE specifies that */
+/*  A may be full, upper triangular, lower triangular, upper Hessenberg, */
+/*  or banded. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  TYPE    (input) CHARACTER*1 */
+/*          TYPE indices the storage type of the input matrix. */
+/*          = 'G':  A is a full matrix. */
+/*          = 'L':  A is a lower triangular matrix. */
+/*          = 'U':  A is an upper triangular matrix. */
+/*          = 'H':  A is an upper Hessenberg matrix. */
+/*          = 'B':  A is a symmetric band matrix with lower bandwidth KL */
+/*                  and upper bandwidth KU and with the only the lower */
+/*                  half stored. */
+/*          = 'Q':  A is a symmetric band matrix with lower bandwidth KL */
+/*                  and upper bandwidth KU and with the only the upper */
+/*                  half stored. */
+/*          = 'Z':  A is a band matrix with lower bandwidth KL and upper */
+/*                  bandwidth KU. */
+
+/*  KL      (input) INTEGER */
+/*          The lower bandwidth of A.  Referenced only if TYPE = 'B', */
+/*          'Q' or 'Z'. */
+
+/*  KU      (input) INTEGER */
+/*          The upper bandwidth of A.  Referenced only if TYPE = 'B', */
+/*          'Q' or 'Z'. */
+
+/*  CFROM   (input) DOUBLE PRECISION */
+/*  CTO     (input) DOUBLE PRECISION */
+/*          The matrix A is multiplied by CTO/CFROM. A(I,J) is computed */
+/*          without over/underflow if the final result CTO*A(I,J)/CFROM */
+/*          can be represented without over/underflow.  CFROM must be */
+/*          nonzero. */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix A.  M >= 0. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix A.  N >= 0. */
+
+/*  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N) */
+/*          The matrix to be multiplied by CTO/CFROM.  See TYPE for the */
+/*          storage type. */
+
+/*  LDA     (input) INTEGER */
+/*          The leading dimension of the array A.  LDA >= max(1,M). */
+
+/*  INFO    (output) INTEGER */
+/*          0  - successful exit */
+/*          <0 - if INFO = -i, the i-th argument had an illegal value. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Test the input arguments */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+
+    /* Function Body */
+    *info = 0;
+
+    if (lsame_(type__, "G")) {
+	itype = 0;
+    } else if (lsame_(type__, "L")) {
+	itype = 1;
+    } else if (lsame_(type__, "U")) {
+	itype = 2;
+    } else if (lsame_(type__, "H")) {
+	itype = 3;
+    } else if (lsame_(type__, "B")) {
+	itype = 4;
+    } else if (lsame_(type__, "Q")) {
+	itype = 5;
+    } else if (lsame_(type__, "Z")) {
+	itype = 6;
+    } else {
+	itype = -1;
+    }
+
+    if (itype == -1) {
+	*info = -1;
+    } else if (*cfrom == 0. || disnan_(cfrom)) {
+	*info = -4;
+    } else if (disnan_(cto)) {
+	*info = -5;
+    } else if (*m < 0) {
+	*info = -6;
+    } else if (*n < 0 || itype == 4 && *n != *m || itype == 5 && *n != *m) {
+	*info = -7;
+    } else if (itype <= 3 && *lda < max(1,*m)) {
+	*info = -9;
+    } else if (itype >= 4) {
+/* Computing MAX */
+	i__1 = *m - 1;
+	if (*kl < 0 || *kl > max(i__1,0)) {
+	    *info = -2;
+	} else /* if(complicated condition) */ {
+/* Computing MAX */
+	    i__1 = *n - 1;
+	    if (*ku < 0 || *ku > max(i__1,0) || (itype == 4 || itype == 5) && 
+		    *kl != *ku) {
+		*info = -3;
+	    } else if (itype == 4 && *lda < *kl + 1 || itype == 5 && *lda < *
+		    ku + 1 || itype == 6 && *lda < (*kl << 1) + *ku + 1) {
+		*info = -9;
+	    }
+	}
+    }
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DLASCL", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*n == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     Get machine parameters */
+
+    smlnum = dlamch_("S");
+    bignum = 1. / smlnum;
+
+    cfromc = *cfrom;
+    ctoc = *cto;
+
+L10:
+    cfrom1 = cfromc * smlnum;
+    if (cfrom1 == cfromc) {
+/*        CFROMC is an inf.  Multiply by a correctly signed zero for */
+/*        finite CTOC, or a NaN if CTOC is infinite. */
+	mul = ctoc / cfromc;
+	done = TRUE_;
+	cto1 = ctoc;
+    } else {
+	cto1 = ctoc / bignum;
+	if (cto1 == ctoc) {
+/*           CTOC is either 0 or an inf.  In both cases, CTOC itself */
+/*           serves as the correct multiplication factor. */
+	    mul = ctoc;
+	    done = TRUE_;
+	    cfromc = 1.;
+	} else if (abs(cfrom1) > abs(ctoc) && ctoc != 0.) {
+	    mul = smlnum;
+	    done = FALSE_;
+	    cfromc = cfrom1;
+	} else if (abs(cto1) > abs(cfromc)) {
+	    mul = bignum;
+	    done = FALSE_;
+	    ctoc = cto1;
+	} else {
+	    mul = ctoc / cfromc;
+	    done = TRUE_;
+	}
+    }
+
+    if (itype == 0) {
+
+/*        Full matrix */
+
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *m;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		a[i__ + j * a_dim1] *= mul;
+/* L20: */
+	    }
+/* L30: */
+	}
+
+    } else if (itype == 1) {
+
+/*        Lower triangular matrix */
+
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *m;
+	    for (i__ = j; i__ <= i__2; ++i__) {
+		a[i__ + j * a_dim1] *= mul;
+/* L40: */
+	    }
+/* L50: */
+	}
+
+    } else if (itype == 2) {
+
+/*        Upper triangular matrix */
+
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = min(j,*m);
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		a[i__ + j * a_dim1] *= mul;
+/* L60: */
+	    }
+/* L70: */
+	}
+
+    } else if (itype == 3) {
+
+/*        Upper Hessenberg matrix */
+
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+/* Computing MIN */
+	    i__3 = j + 1;
+	    i__2 = min(i__3,*m);
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		a[i__ + j * a_dim1] *= mul;
+/* L80: */
+	    }
+/* L90: */
+	}
+
+    } else if (itype == 4) {
+
+/*        Lower half of a symmetric band matrix */
+
+	k3 = *kl + 1;
+	k4 = *n + 1;
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+/* Computing MIN */
+	    i__3 = k3, i__4 = k4 - j;
+	    i__2 = min(i__3,i__4);
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		a[i__ + j * a_dim1] *= mul;
+/* L100: */
+	    }
+/* L110: */
+	}
+
+    } else if (itype == 5) {
+
+/*        Upper half of a symmetric band matrix */
+
+	k1 = *ku + 2;
+	k3 = *ku + 1;
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+/* Computing MAX */
+	    i__2 = k1 - j;
+	    i__3 = k3;
+	    for (i__ = max(i__2,1); i__ <= i__3; ++i__) {
+		a[i__ + j * a_dim1] *= mul;
+/* L120: */
+	    }
+/* L130: */
+	}
+
+    } else if (itype == 6) {
+
+/*        Band matrix */
+
+	k1 = *kl + *ku + 2;
+	k2 = *kl + 1;
+	k3 = (*kl << 1) + *ku + 1;
+	k4 = *kl + *ku + 1 + *m;
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+/* Computing MAX */
+	    i__3 = k1 - j;
+/* Computing MIN */
+	    i__4 = k3, i__5 = k4 - j;
+	    i__2 = min(i__4,i__5);
+	    for (i__ = max(i__3,k2); i__ <= i__2; ++i__) {
+		a[i__ + j * a_dim1] *= mul;
+/* L140: */
+	    }
+/* L150: */
+	}
+
+    }
+
+    if (! done) {
+	goto L10;
+    }
+
+    return 0;
+
+/*     End of DLASCL */
+
+} /* dlascl_ */

+ 152 - 0
min-dgels/additional/dlaset.c

@@ -0,0 +1,152 @@
+/* dlaset.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dlaset_(char *uplo, integer *m, integer *n, doublereal *
+	alpha, doublereal *beta, doublereal *a, integer *lda)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3;
+
+    /* Local variables */
+    integer i__, j;
+    extern logical lsame_(char *, char *);
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DLASET initializes an m-by-n matrix A to BETA on the diagonal and */
+/*  ALPHA on the offdiagonals. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  UPLO    (input) CHARACTER*1 */
+/*          Specifies the part of the matrix A to be set. */
+/*          = 'U':      Upper triangular part is set; the strictly lower */
+/*                      triangular part of A is not changed. */
+/*          = 'L':      Lower triangular part is set; the strictly upper */
+/*                      triangular part of A is not changed. */
+/*          Otherwise:  All of the matrix A is set. */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix A.  M >= 0. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix A.  N >= 0. */
+
+/*  ALPHA   (input) DOUBLE PRECISION */
+/*          The constant to which the offdiagonal elements are to be set. */
+
+/*  BETA    (input) DOUBLE PRECISION */
+/*          The constant to which the diagonal elements are to be set. */
+
+/*  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N) */
+/*          On exit, the leading m-by-n submatrix of A is set as follows: */
+
+/*          if UPLO = 'U', A(i,j) = ALPHA, 1<=i<=j-1, 1<=j<=n, */
+/*          if UPLO = 'L', A(i,j) = ALPHA, j+1<=i<=m, 1<=j<=n, */
+/*          otherwise,     A(i,j) = ALPHA, 1<=i<=m, 1<=j<=n, i.ne.j, */
+
+/*          and, for all UPLO, A(i,i) = BETA, 1<=i<=min(m,n). */
+
+/*  LDA     (input) INTEGER */
+/*          The leading dimension of the array A.  LDA >= max(1,M). */
+
+/* ===================================================================== */
+
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+
+    /* Function Body */
+    if (lsame_(uplo, "U")) {
+
+/*        Set the strictly upper triangular or trapezoidal part of the */
+/*        array to ALPHA. */
+
+	i__1 = *n;
+	for (j = 2; j <= i__1; ++j) {
+/* Computing MIN */
+	    i__3 = j - 1;
+	    i__2 = min(i__3,*m);
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		a[i__ + j * a_dim1] = *alpha;
+/* L10: */
+	    }
+/* L20: */
+	}
+
+    } else if (lsame_(uplo, "L")) {
+
+/*        Set the strictly lower triangular or trapezoidal part of the */
+/*        array to ALPHA. */
+
+	i__1 = min(*m,*n);
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *m;
+	    for (i__ = j + 1; i__ <= i__2; ++i__) {
+		a[i__ + j * a_dim1] = *alpha;
+/* L30: */
+	    }
+/* L40: */
+	}
+
+    } else {
+
+/*        Set the leading m-by-n submatrix to ALPHA. */
+
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *m;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		a[i__ + j * a_dim1] = *alpha;
+/* L50: */
+	    }
+/* L60: */
+	}
+    }
+
+/*     Set the first min(M,N) diagonal elements to BETA. */
+
+    i__1 = min(*m,*n);
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	a[i__ + i__ * a_dim1] = *beta;
+/* L70: */
+    }
+
+    return 0;
+
+/*     End of DLASET */
+
+} /* dlaset_ */

+ 116 - 0
min-dgels/additional/dlassq.c

@@ -0,0 +1,116 @@
+/* dlassq.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dlassq_(integer *n, doublereal *x, integer *incx, 
+	doublereal *scale, doublereal *sumsq)
+{
+    /* System generated locals */
+    integer i__1, i__2;
+    doublereal d__1;
+
+    /* Local variables */
+    integer ix;
+    doublereal absxi;
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DLASSQ  returns the values  scl  and  smsq  such that */
+
+/*     ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq, */
+
+/*  where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is */
+/*  assumed to be non-negative and  scl  returns the value */
+
+/*     scl = max( scale, abs( x( i ) ) ). */
+
+/*  scale and sumsq must be supplied in SCALE and SUMSQ and */
+/*  scl and smsq are overwritten on SCALE and SUMSQ respectively. */
+
+/*  The routine makes only one pass through the vector x. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  N       (input) INTEGER */
+/*          The number of elements to be used from the vector X. */
+
+/*  X       (input) DOUBLE PRECISION array, dimension (N) */
+/*          The vector for which a scaled sum of squares is computed. */
+/*             x( i )  = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. */
+
+/*  INCX    (input) INTEGER */
+/*          The increment between successive values of the vector X. */
+/*          INCX > 0. */
+
+/*  SCALE   (input/output) DOUBLE PRECISION */
+/*          On entry, the value  scale  in the equation above. */
+/*          On exit, SCALE is overwritten with  scl , the scaling factor */
+/*          for the sum of squares. */
+
+/*  SUMSQ   (input/output) DOUBLE PRECISION */
+/*          On entry, the value  sumsq  in the equation above. */
+/*          On exit, SUMSQ is overwritten with  smsq , the basic sum of */
+/*          squares from which  scl  has been factored out. */
+
+/* ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+    /* Parameter adjustments */
+    --x;
+
+    /* Function Body */
+    if (*n > 0) {
+	i__1 = (*n - 1) * *incx + 1;
+	i__2 = *incx;
+	for (ix = 1; i__2 < 0 ? ix >= i__1 : ix <= i__1; ix += i__2) {
+	    if (x[ix] != 0.) {
+		absxi = (d__1 = x[ix], abs(d__1));
+		if (*scale < absxi) {
+/* Computing 2nd power */
+		    d__1 = *scale / absxi;
+		    *sumsq = *sumsq * (d__1 * d__1) + 1;
+		    *scale = absxi;
+		} else {
+/* Computing 2nd power */
+		    d__1 = absxi / *scale;
+		    *sumsq += d__1 * d__1;
+		}
+	    }
+/* L10: */
+	}
+    }
+    return 0;
+
+/*     End of DLASSQ */
+
+} /* dlassq_ */

+ 95 - 0
min-dgels/additional/dnrm2.c

@@ -0,0 +1,95 @@
+/* dnrm2.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+doublereal dnrm2_(integer *n, doublereal *x, integer *incx)
+{
+    /* System generated locals */
+    integer i__1, i__2;
+    doublereal ret_val, d__1;
+
+    /* Builtin functions */
+    double sqrt(doublereal);
+
+    /* Local variables */
+    integer ix;
+    doublereal ssq, norm, scale, absxi;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DNRM2 returns the euclidean norm of a vector via the function */
+/*  name, so that */
+
+/*     DNRM2 := sqrt( x'*x ) */
+
+
+/*  -- This version written on 25-October-1982. */
+/*     Modified on 14-October-1993 to inline the call to DLASSQ. */
+/*     Sven Hammarling, Nag Ltd. */
+
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+    /* Parameter adjustments */
+    --x;
+
+    /* Function Body */
+    if (*n < 1 || *incx < 1) {
+	norm = 0.;
+    } else if (*n == 1) {
+	norm = abs(x[1]);
+    } else {
+	scale = 0.;
+	ssq = 1.;
+/*        The following loop is equivalent to this call to the LAPACK */
+/*        auxiliary routine: */
+/*        CALL DLASSQ( N, X, INCX, SCALE, SSQ ) */
+
+	i__1 = (*n - 1) * *incx + 1;
+	i__2 = *incx;
+	for (ix = 1; i__2 < 0 ? ix >= i__1 : ix <= i__1; ix += i__2) {
+	    if (x[ix] != 0.) {
+		absxi = (d__1 = x[ix], abs(d__1));
+		if (scale < absxi) {
+/* Computing 2nd power */
+		    d__1 = scale / absxi;
+		    ssq = ssq * (d__1 * d__1) + 1.;
+		    scale = absxi;
+		} else {
+/* Computing 2nd power */
+		    d__1 = absxi / scale;
+		    ssq += d__1 * d__1;
+		}
+	    }
+/* L10: */
+	}
+	norm = scale * sqrt(ssq);
+    }
+
+    ret_val = norm;
+    return ret_val;
+
+/*     End of DNRM2. */
+
+} /* dnrm2_ */

+ 235 - 0
min-dgels/additional/dorm2r.c

@@ -0,0 +1,235 @@
+/* dorm2r.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+
+/* Subroutine */ int dorm2r_(char *side, char *trans, integer *m, integer *n, 
+	integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *
+	c__, integer *ldc, doublereal *work, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2;
+
+    /* Local variables */
+    integer i__, i1, i2, i3, ic, jc, mi, ni, nq;
+    doublereal aii;
+    logical left;
+    extern /* Subroutine */ int dlarf_(char *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *);
+    extern logical lsame_(char *, char *);
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    logical notran;
+
+
+/*  -- LAPACK routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DORM2R overwrites the general real m by n matrix C with */
+
+/*        Q * C  if SIDE = 'L' and TRANS = 'N', or */
+
+/*        Q'* C  if SIDE = 'L' and TRANS = 'T', or */
+
+/*        C * Q  if SIDE = 'R' and TRANS = 'N', or */
+
+/*        C * Q' if SIDE = 'R' and TRANS = 'T', */
+
+/*  where Q is a real orthogonal matrix defined as the product of k */
+/*  elementary reflectors */
+
+/*        Q = H(1) H(2) . . . H(k) */
+
+/*  as returned by DGEQRF. Q is of order m if SIDE = 'L' and of order n */
+/*  if SIDE = 'R'. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  SIDE    (input) CHARACTER*1 */
+/*          = 'L': apply Q or Q' from the Left */
+/*          = 'R': apply Q or Q' from the Right */
+
+/*  TRANS   (input) CHARACTER*1 */
+/*          = 'N': apply Q  (No transpose) */
+/*          = 'T': apply Q' (Transpose) */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix C. M >= 0. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix C. N >= 0. */
+
+/*  K       (input) INTEGER */
+/*          The number of elementary reflectors whose product defines */
+/*          the matrix Q. */
+/*          If SIDE = 'L', M >= K >= 0; */
+/*          if SIDE = 'R', N >= K >= 0. */
+
+/*  A       (input) DOUBLE PRECISION array, dimension (LDA,K) */
+/*          The i-th column must contain the vector which defines the */
+/*          elementary reflector H(i), for i = 1,2,...,k, as returned by */
+/*          DGEQRF in the first k columns of its array argument A. */
+/*          A is modified by the routine but restored on exit. */
+
+/*  LDA     (input) INTEGER */
+/*          The leading dimension of the array A. */
+/*          If SIDE = 'L', LDA >= max(1,M); */
+/*          if SIDE = 'R', LDA >= max(1,N). */
+
+/*  TAU     (input) DOUBLE PRECISION array, dimension (K) */
+/*          TAU(i) must contain the scalar factor of the elementary */
+/*          reflector H(i), as returned by DGEQRF. */
+
+/*  C       (input/output) DOUBLE PRECISION array, dimension (LDC,N) */
+/*          On entry, the m by n matrix C. */
+/*          On exit, C is overwritten by Q*C or Q'*C or C*Q' or C*Q. */
+
+/*  LDC     (input) INTEGER */
+/*          The leading dimension of the array C. LDC >= max(1,M). */
+
+/*  WORK    (workspace) DOUBLE PRECISION array, dimension */
+/*                                   (N) if SIDE = 'L', */
+/*                                   (M) if SIDE = 'R' */
+
+/*  INFO    (output) INTEGER */
+/*          = 0: successful exit */
+/*          < 0: if INFO = -i, the i-th argument had an illegal value */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Test the input arguments */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --tau;
+    c_dim1 = *ldc;
+    c_offset = 1 + c_dim1;
+    c__ -= c_offset;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    left = lsame_(side, "L");
+    notran = lsame_(trans, "N");
+
+/*     NQ is the order of Q */
+
+    if (left) {
+	nq = *m;
+    } else {
+	nq = *n;
+    }
+    if (! left && ! lsame_(side, "R")) {
+	*info = -1;
+    } else if (! notran && ! lsame_(trans, "T")) {
+	*info = -2;
+    } else if (*m < 0) {
+	*info = -3;
+    } else if (*n < 0) {
+	*info = -4;
+    } else if (*k < 0 || *k > nq) {
+	*info = -5;
+    } else if (*lda < max(1,nq)) {
+	*info = -7;
+    } else if (*ldc < max(1,*m)) {
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DORM2R", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*m == 0 || *n == 0 || *k == 0) {
+	return 0;
+    }
+
+    if (left && ! notran || ! left && notran) {
+	i1 = 1;
+	i2 = *k;
+	i3 = 1;
+    } else {
+	i1 = *k;
+	i2 = 1;
+	i3 = -1;
+    }
+
+    if (left) {
+	ni = *n;
+	jc = 1;
+    } else {
+	mi = *m;
+	ic = 1;
+    }
+
+    i__1 = i2;
+    i__2 = i3;
+    for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+	if (left) {
+
+/*           H(i) is applied to C(i:m,1:n) */
+
+	    mi = *m - i__ + 1;
+	    ic = i__;
+	} else {
+
+/*           H(i) is applied to C(1:m,i:n) */
+
+	    ni = *n - i__ + 1;
+	    jc = i__;
+	}
+
+/*        Apply H(i) */
+
+	aii = a[i__ + i__ * a_dim1];
+	a[i__ + i__ * a_dim1] = 1.;
+	dlarf_(side, &mi, &ni, &a[i__ + i__ * a_dim1], &c__1, &tau[i__], &c__[
+		ic + jc * c_dim1], ldc, &work[1]);
+	a[i__ + i__ * a_dim1] = aii;
+/* L10: */
+    }
+    return 0;
+
+/*     End of DORM2R */
+
+} /* dorm2r_ */

+ 231 - 0
min-dgels/additional/dorml2.c

@@ -0,0 +1,231 @@
+/* dorml2.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dorml2_(char *side, char *trans, integer *m, integer *n, 
+	integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *
+	c__, integer *ldc, doublereal *work, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2;
+
+    /* Local variables */
+    integer i__, i1, i2, i3, ic, jc, mi, ni, nq;
+    doublereal aii;
+    logical left;
+    extern /* Subroutine */ int dlarf_(char *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *);
+    extern logical lsame_(char *, char *);
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    logical notran;
+
+
+/*  -- LAPACK routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DORML2 overwrites the general real m by n matrix C with */
+
+/*        Q * C  if SIDE = 'L' and TRANS = 'N', or */
+
+/*        Q'* C  if SIDE = 'L' and TRANS = 'T', or */
+
+/*        C * Q  if SIDE = 'R' and TRANS = 'N', or */
+
+/*        C * Q' if SIDE = 'R' and TRANS = 'T', */
+
+/*  where Q is a real orthogonal matrix defined as the product of k */
+/*  elementary reflectors */
+
+/*        Q = H(k) . . . H(2) H(1) */
+
+/*  as returned by DGELQF. Q is of order m if SIDE = 'L' and of order n */
+/*  if SIDE = 'R'. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  SIDE    (input) CHARACTER*1 */
+/*          = 'L': apply Q or Q' from the Left */
+/*          = 'R': apply Q or Q' from the Right */
+
+/*  TRANS   (input) CHARACTER*1 */
+/*          = 'N': apply Q  (No transpose) */
+/*          = 'T': apply Q' (Transpose) */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix C. M >= 0. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix C. N >= 0. */
+
+/*  K       (input) INTEGER */
+/*          The number of elementary reflectors whose product defines */
+/*          the matrix Q. */
+/*          If SIDE = 'L', M >= K >= 0; */
+/*          if SIDE = 'R', N >= K >= 0. */
+
+/*  A       (input) DOUBLE PRECISION array, dimension */
+/*                               (LDA,M) if SIDE = 'L', */
+/*                               (LDA,N) if SIDE = 'R' */
+/*          The i-th row must contain the vector which defines the */
+/*          elementary reflector H(i), for i = 1,2,...,k, as returned by */
+/*          DGELQF in the first k rows of its array argument A. */
+/*          A is modified by the routine but restored on exit. */
+
+/*  LDA     (input) INTEGER */
+/*          The leading dimension of the array A. LDA >= max(1,K). */
+
+/*  TAU     (input) DOUBLE PRECISION array, dimension (K) */
+/*          TAU(i) must contain the scalar factor of the elementary */
+/*          reflector H(i), as returned by DGELQF. */
+
+/*  C       (input/output) DOUBLE PRECISION array, dimension (LDC,N) */
+/*          On entry, the m by n matrix C. */
+/*          On exit, C is overwritten by Q*C or Q'*C or C*Q' or C*Q. */
+
+/*  LDC     (input) INTEGER */
+/*          The leading dimension of the array C. LDC >= max(1,M). */
+
+/*  WORK    (workspace) DOUBLE PRECISION array, dimension */
+/*                                   (N) if SIDE = 'L', */
+/*                                   (M) if SIDE = 'R' */
+
+/*  INFO    (output) INTEGER */
+/*          = 0: successful exit */
+/*          < 0: if INFO = -i, the i-th argument had an illegal value */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Test the input arguments */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --tau;
+    c_dim1 = *ldc;
+    c_offset = 1 + c_dim1;
+    c__ -= c_offset;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    left = lsame_(side, "L");
+    notran = lsame_(trans, "N");
+
+/*     NQ is the order of Q */
+
+    if (left) {
+	nq = *m;
+    } else {
+	nq = *n;
+    }
+    if (! left && ! lsame_(side, "R")) {
+	*info = -1;
+    } else if (! notran && ! lsame_(trans, "T")) {
+	*info = -2;
+    } else if (*m < 0) {
+	*info = -3;
+    } else if (*n < 0) {
+	*info = -4;
+    } else if (*k < 0 || *k > nq) {
+	*info = -5;
+    } else if (*lda < max(1,*k)) {
+	*info = -7;
+    } else if (*ldc < max(1,*m)) {
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DORML2", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*m == 0 || *n == 0 || *k == 0) {
+	return 0;
+    }
+
+    if (left && notran || ! left && ! notran) {
+	i1 = 1;
+	i2 = *k;
+	i3 = 1;
+    } else {
+	i1 = *k;
+	i2 = 1;
+	i3 = -1;
+    }
+
+    if (left) {
+	ni = *n;
+	jc = 1;
+    } else {
+	mi = *m;
+	ic = 1;
+    }
+
+    i__1 = i2;
+    i__2 = i3;
+    for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+	if (left) {
+
+/*           H(i) is applied to C(i:m,1:n) */
+
+	    mi = *m - i__ + 1;
+	    ic = i__;
+	} else {
+
+/*           H(i) is applied to C(1:m,i:n) */
+
+	    ni = *n - i__ + 1;
+	    jc = i__;
+	}
+
+/*        Apply H(i) */
+
+	aii = a[i__ + i__ * a_dim1];
+	a[i__ + i__ * a_dim1] = 1.;
+	dlarf_(side, &mi, &ni, &a[i__ + i__ * a_dim1], lda, &tau[i__], &c__[
+		ic + jc * c_dim1], ldc, &work[1]);
+	a[i__ + i__ * a_dim1] = aii;
+/* L10: */
+    }
+    return 0;
+
+/*     End of DORML2 */
+
+} /* dorml2_ */

+ 334 - 0
min-dgels/additional/dormlq.c

@@ -0,0 +1,334 @@
+/* dormlq.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static integer c_n1 = -1;
+static integer c__2 = 2;
+static integer c__65 = 65;
+
+/* Subroutine */ int dormlq_(char *side, char *trans, integer *m, integer *n, 
+	integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *
+	c__, integer *ldc, doublereal *work, integer *lwork, integer *info)
+{
+    /* System generated locals */
+    address a__1[2];
+    integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2], i__4, 
+	    i__5;
+    char ch__1[2];
+
+    /* Builtin functions */
+    /* Subroutine */ int s_cat(char *, char **, integer *, integer *, ftnlen);
+
+    /* Local variables */
+    integer i__;
+    doublereal t[4160]	/* was [65][64] */;
+    integer i1, i2, i3, ib, ic, jc, nb, mi, ni, nq, nw, iws;
+    logical left;
+    extern logical lsame_(char *, char *);
+    integer nbmin, iinfo;
+    extern /* Subroutine */ int dorml2_(char *, char *, integer *, integer *, 
+	    integer *, doublereal *, integer *, doublereal *, doublereal *, 
+	    integer *, doublereal *, integer *), dlarfb_(char 
+	    *, char *, char *, char *, integer *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
+	    integer *, doublereal *, integer *), dlarft_(char *, char *, integer *, integer *, doublereal 
+	    *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
+	    integer *, integer *);
+    logical notran;
+    integer ldwork;
+    char transt[1];
+    integer lwkopt;
+    logical lquery;
+
+
+/*  -- LAPACK routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DORMLQ overwrites the general real M-by-N matrix C with */
+
+/*                  SIDE = 'L'     SIDE = 'R' */
+/*  TRANS = 'N':      Q * C          C * Q */
+/*  TRANS = 'T':      Q**T * C       C * Q**T */
+
+/*  where Q is a real orthogonal matrix defined as the product of k */
+/*  elementary reflectors */
+
+/*        Q = H(k) . . . H(2) H(1) */
+
+/*  as returned by DGELQF. Q is of order M if SIDE = 'L' and of order N */
+/*  if SIDE = 'R'. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  SIDE    (input) CHARACTER*1 */
+/*          = 'L': apply Q or Q**T from the Left; */
+/*          = 'R': apply Q or Q**T from the Right. */
+
+/*  TRANS   (input) CHARACTER*1 */
+/*          = 'N':  No transpose, apply Q; */
+/*          = 'T':  Transpose, apply Q**T. */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix C. M >= 0. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix C. N >= 0. */
+
+/*  K       (input) INTEGER */
+/*          The number of elementary reflectors whose product defines */
+/*          the matrix Q. */
+/*          If SIDE = 'L', M >= K >= 0; */
+/*          if SIDE = 'R', N >= K >= 0. */
+
+/*  A       (input) DOUBLE PRECISION array, dimension */
+/*                               (LDA,M) if SIDE = 'L', */
+/*                               (LDA,N) if SIDE = 'R' */
+/*          The i-th row must contain the vector which defines the */
+/*          elementary reflector H(i), for i = 1,2,...,k, as returned by */
+/*          DGELQF in the first k rows of its array argument A. */
+/*          A is modified by the routine but restored on exit. */
+
+/*  LDA     (input) INTEGER */
+/*          The leading dimension of the array A. LDA >= max(1,K). */
+
+/*  TAU     (input) DOUBLE PRECISION array, dimension (K) */
+/*          TAU(i) must contain the scalar factor of the elementary */
+/*          reflector H(i), as returned by DGELQF. */
+
+/*  C       (input/output) DOUBLE PRECISION array, dimension (LDC,N) */
+/*          On entry, the M-by-N matrix C. */
+/*          On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q. */
+
+/*  LDC     (input) INTEGER */
+/*          The leading dimension of the array C. LDC >= max(1,M). */
+
+/*  WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) */
+/*          On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */
+
+/*  LWORK   (input) INTEGER */
+/*          The dimension of the array WORK. */
+/*          If SIDE = 'L', LWORK >= max(1,N); */
+/*          if SIDE = 'R', LWORK >= max(1,M). */
+/*          For optimum performance LWORK >= N*NB if SIDE = 'L', and */
+/*          LWORK >= M*NB if SIDE = 'R', where NB is the optimal */
+/*          blocksize. */
+
+/*          If LWORK = -1, then a workspace query is assumed; the routine */
+/*          only calculates the optimal size of the WORK array, returns */
+/*          this value as the first entry of the WORK array, and no error */
+/*          message related to LWORK is issued by XERBLA. */
+
+/*  INFO    (output) INTEGER */
+/*          = 0:  successful exit */
+/*          < 0:  if INFO = -i, the i-th argument had an illegal value */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Local Arrays .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Test the input arguments */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --tau;
+    c_dim1 = *ldc;
+    c_offset = 1 + c_dim1;
+    c__ -= c_offset;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    left = lsame_(side, "L");
+    notran = lsame_(trans, "N");
+    lquery = *lwork == -1;
+
+/*     NQ is the order of Q and NW is the minimum dimension of WORK */
+
+    if (left) {
+	nq = *m;
+	nw = *n;
+    } else {
+	nq = *n;
+	nw = *m;
+    }
+    if (! left && ! lsame_(side, "R")) {
+	*info = -1;
+    } else if (! notran && ! lsame_(trans, "T")) {
+	*info = -2;
+    } else if (*m < 0) {
+	*info = -3;
+    } else if (*n < 0) {
+	*info = -4;
+    } else if (*k < 0 || *k > nq) {
+	*info = -5;
+    } else if (*lda < max(1,*k)) {
+	*info = -7;
+    } else if (*ldc < max(1,*m)) {
+	*info = -10;
+    } else if (*lwork < max(1,nw) && ! lquery) {
+	*info = -12;
+    }
+
+    if (*info == 0) {
+
+/*        Determine the block size.  NB may be at most NBMAX, where NBMAX */
+/*        is used to define the local array T. */
+
+/* Computing MIN */
+/* Writing concatenation */
+	i__3[0] = 1, a__1[0] = side;
+	i__3[1] = 1, a__1[1] = trans;
+	s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+	i__1 = 64, i__2 = ilaenv_(&c__1, "DORMLQ", ch__1, m, n, k, &c_n1);
+	nb = min(i__1,i__2);
+	lwkopt = max(1,nw) * nb;
+	work[1] = (doublereal) lwkopt;
+    }
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DORMLQ", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*m == 0 || *n == 0 || *k == 0) {
+	work[1] = 1.;
+	return 0;
+    }
+
+    nbmin = 2;
+    ldwork = nw;
+    if (nb > 1 && nb < *k) {
+	iws = nw * nb;
+	if (*lwork < iws) {
+	    nb = *lwork / ldwork;
+/* Computing MAX */
+/* Writing concatenation */
+	    i__3[0] = 1, a__1[0] = side;
+	    i__3[1] = 1, a__1[1] = trans;
+	    s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+	    i__1 = 2, i__2 = ilaenv_(&c__2, "DORMLQ", ch__1, m, n, k, &c_n1);
+	    nbmin = max(i__1,i__2);
+	}
+    } else {
+	iws = nw;
+    }
+
+    if (nb < nbmin || nb >= *k) {
+
+/*        Use unblocked code */
+
+	dorml2_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+		c_offset], ldc, &work[1], &iinfo);
+    } else {
+
+/*        Use blocked code */
+
+	if (left && notran || ! left && ! notran) {
+	    i1 = 1;
+	    i2 = *k;
+	    i3 = nb;
+	} else {
+	    i1 = (*k - 1) / nb * nb + 1;
+	    i2 = 1;
+	    i3 = -nb;
+	}
+
+	if (left) {
+	    ni = *n;
+	    jc = 1;
+	} else {
+	    mi = *m;
+	    ic = 1;
+	}
+
+	if (notran) {
+	    *(unsigned char *)transt = 'T';
+	} else {
+	    *(unsigned char *)transt = 'N';
+	}
+
+	i__1 = i2;
+	i__2 = i3;
+	for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+	    i__4 = nb, i__5 = *k - i__ + 1;
+	    ib = min(i__4,i__5);
+
+/*           Form the triangular factor of the block reflector */
+/*           H = H(i) H(i+1) . . . H(i+ib-1) */
+
+	    i__4 = nq - i__ + 1;
+	    dlarft_("Forward", "Rowwise", &i__4, &ib, &a[i__ + i__ * a_dim1], 
+		    lda, &tau[i__], t, &c__65);
+	    if (left) {
+
+/*              H or H' is applied to C(i:m,1:n) */
+
+		mi = *m - i__ + 1;
+		ic = i__;
+	    } else {
+
+/*              H or H' is applied to C(1:m,i:n) */
+
+		ni = *n - i__ + 1;
+		jc = i__;
+	    }
+
+/*           Apply H or H' */
+
+	    dlarfb_(side, transt, "Forward", "Rowwise", &mi, &ni, &ib, &a[i__ 
+		    + i__ * a_dim1], lda, t, &c__65, &c__[ic + jc * c_dim1], 
+		    ldc, &work[1], &ldwork);
+/* L10: */
+	}
+    }
+    work[1] = (doublereal) lwkopt;
+    return 0;
+
+/*     End of DORMLQ */
+
+} /* dormlq_ */

+ 327 - 0
min-dgels/additional/dormqr.c

@@ -0,0 +1,327 @@
+/* dormqr.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static integer c_n1 = -1;
+static integer c__2 = 2;
+static integer c__65 = 65;
+
+/* Subroutine */ int dormqr_(char *side, char *trans, integer *m, integer *n, 
+	integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *
+	c__, integer *ldc, doublereal *work, integer *lwork, integer *info)
+{
+    /* System generated locals */
+    address a__1[2];
+    integer a_dim1, a_offset, c_dim1, c_offset, i__1, i__2, i__3[2], i__4, 
+	    i__5;
+    char ch__1[2];
+
+    /* Builtin functions */
+    /* Subroutine */ int s_cat(char *, char **, integer *, integer *, ftnlen);
+
+    /* Local variables */
+    integer i__;
+    doublereal t[4160]	/* was [65][64] */;
+    integer i1, i2, i3, ib, ic, jc, nb, mi, ni, nq, nw, iws;
+    logical left;
+    extern logical lsame_(char *, char *);
+    integer nbmin, iinfo;
+    extern /* Subroutine */ int dorm2r_(char *, char *, integer *, integer *, 
+	    integer *, doublereal *, integer *, doublereal *, doublereal *, 
+	    integer *, doublereal *, integer *), dlarfb_(char 
+	    *, char *, char *, char *, integer *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, integer *, doublereal *, 
+	    integer *, doublereal *, integer *), dlarft_(char *, char *, integer *, integer *, doublereal 
+	    *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
+	    integer *, integer *);
+    logical notran;
+    integer ldwork, lwkopt;
+    logical lquery;
+
+
+/*  -- LAPACK routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DORMQR overwrites the general real M-by-N matrix C with */
+
+/*                  SIDE = 'L'     SIDE = 'R' */
+/*  TRANS = 'N':      Q * C          C * Q */
+/*  TRANS = 'T':      Q**T * C       C * Q**T */
+
+/*  where Q is a real orthogonal matrix defined as the product of k */
+/*  elementary reflectors */
+
+/*        Q = H(1) H(2) . . . H(k) */
+
+/*  as returned by DGEQRF. Q is of order M if SIDE = 'L' and of order N */
+/*  if SIDE = 'R'. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  SIDE    (input) CHARACTER*1 */
+/*          = 'L': apply Q or Q**T from the Left; */
+/*          = 'R': apply Q or Q**T from the Right. */
+
+/*  TRANS   (input) CHARACTER*1 */
+/*          = 'N':  No transpose, apply Q; */
+/*          = 'T':  Transpose, apply Q**T. */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix C. M >= 0. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix C. N >= 0. */
+
+/*  K       (input) INTEGER */
+/*          The number of elementary reflectors whose product defines */
+/*          the matrix Q. */
+/*          If SIDE = 'L', M >= K >= 0; */
+/*          if SIDE = 'R', N >= K >= 0. */
+
+/*  A       (input) DOUBLE PRECISION array, dimension (LDA,K) */
+/*          The i-th column must contain the vector which defines the */
+/*          elementary reflector H(i), for i = 1,2,...,k, as returned by */
+/*          DGEQRF in the first k columns of its array argument A. */
+/*          A is modified by the routine but restored on exit. */
+
+/*  LDA     (input) INTEGER */
+/*          The leading dimension of the array A. */
+/*          If SIDE = 'L', LDA >= max(1,M); */
+/*          if SIDE = 'R', LDA >= max(1,N). */
+
+/*  TAU     (input) DOUBLE PRECISION array, dimension (K) */
+/*          TAU(i) must contain the scalar factor of the elementary */
+/*          reflector H(i), as returned by DGEQRF. */
+
+/*  C       (input/output) DOUBLE PRECISION array, dimension (LDC,N) */
+/*          On entry, the M-by-N matrix C. */
+/*          On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q. */
+
+/*  LDC     (input) INTEGER */
+/*          The leading dimension of the array C. LDC >= max(1,M). */
+
+/*  WORK    (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) */
+/*          On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */
+
+/*  LWORK   (input) INTEGER */
+/*          The dimension of the array WORK. */
+/*          If SIDE = 'L', LWORK >= max(1,N); */
+/*          if SIDE = 'R', LWORK >= max(1,M). */
+/*          For optimum performance LWORK >= N*NB if SIDE = 'L', and */
+/*          LWORK >= M*NB if SIDE = 'R', where NB is the optimal */
+/*          blocksize. */
+
+/*          If LWORK = -1, then a workspace query is assumed; the routine */
+/*          only calculates the optimal size of the WORK array, returns */
+/*          this value as the first entry of the WORK array, and no error */
+/*          message related to LWORK is issued by XERBLA. */
+
+/*  INFO    (output) INTEGER */
+/*          = 0:  successful exit */
+/*          < 0:  if INFO = -i, the i-th argument had an illegal value */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Local Arrays .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Test the input arguments */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --tau;
+    c_dim1 = *ldc;
+    c_offset = 1 + c_dim1;
+    c__ -= c_offset;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    left = lsame_(side, "L");
+    notran = lsame_(trans, "N");
+    lquery = *lwork == -1;
+
+/*     NQ is the order of Q and NW is the minimum dimension of WORK */
+
+    if (left) {
+	nq = *m;
+	nw = *n;
+    } else {
+	nq = *n;
+	nw = *m;
+    }
+    if (! left && ! lsame_(side, "R")) {
+	*info = -1;
+    } else if (! notran && ! lsame_(trans, "T")) {
+	*info = -2;
+    } else if (*m < 0) {
+	*info = -3;
+    } else if (*n < 0) {
+	*info = -4;
+    } else if (*k < 0 || *k > nq) {
+	*info = -5;
+    } else if (*lda < max(1,nq)) {
+	*info = -7;
+    } else if (*ldc < max(1,*m)) {
+	*info = -10;
+    } else if (*lwork < max(1,nw) && ! lquery) {
+	*info = -12;
+    }
+
+    if (*info == 0) {
+
+/*        Determine the block size.  NB may be at most NBMAX, where NBMAX */
+/*        is used to define the local array T. */
+
+/* Computing MIN */
+/* Writing concatenation */
+	i__3[0] = 1, a__1[0] = side;
+	i__3[1] = 1, a__1[1] = trans;
+	s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+	i__1 = 64, i__2 = ilaenv_(&c__1, "DORMQR", ch__1, m, n, k, &c_n1);
+	nb = min(i__1,i__2);
+	lwkopt = max(1,nw) * nb;
+	work[1] = (doublereal) lwkopt;
+    }
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DORMQR", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*m == 0 || *n == 0 || *k == 0) {
+	work[1] = 1.;
+	return 0;
+    }
+
+    nbmin = 2;
+    ldwork = nw;
+    if (nb > 1 && nb < *k) {
+	iws = nw * nb;
+	if (*lwork < iws) {
+	    nb = *lwork / ldwork;
+/* Computing MAX */
+/* Writing concatenation */
+	    i__3[0] = 1, a__1[0] = side;
+	    i__3[1] = 1, a__1[1] = trans;
+	    s_cat(ch__1, a__1, i__3, &c__2, (ftnlen)2);
+	    i__1 = 2, i__2 = ilaenv_(&c__2, "DORMQR", ch__1, m, n, k, &c_n1);
+	    nbmin = max(i__1,i__2);
+	}
+    } else {
+	iws = nw;
+    }
+
+    if (nb < nbmin || nb >= *k) {
+
+/*        Use unblocked code */
+
+	dorm2r_(side, trans, m, n, k, &a[a_offset], lda, &tau[1], &c__[
+		c_offset], ldc, &work[1], &iinfo);
+    } else {
+
+/*        Use blocked code */
+
+	if (left && ! notran || ! left && notran) {
+	    i1 = 1;
+	    i2 = *k;
+	    i3 = nb;
+	} else {
+	    i1 = (*k - 1) / nb * nb + 1;
+	    i2 = 1;
+	    i3 = -nb;
+	}
+
+	if (left) {
+	    ni = *n;
+	    jc = 1;
+	} else {
+	    mi = *m;
+	    ic = 1;
+	}
+
+	i__1 = i2;
+	i__2 = i3;
+	for (i__ = i1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+/* Computing MIN */
+	    i__4 = nb, i__5 = *k - i__ + 1;
+	    ib = min(i__4,i__5);
+
+/*           Form the triangular factor of the block reflector */
+/*           H = H(i) H(i+1) . . . H(i+ib-1) */
+
+	    i__4 = nq - i__ + 1;
+	    dlarft_("Forward", "Columnwise", &i__4, &ib, &a[i__ + i__ * 
+		    a_dim1], lda, &tau[i__], t, &c__65)
+		    ;
+	    if (left) {
+
+/*              H or H' is applied to C(i:m,1:n) */
+
+		mi = *m - i__ + 1;
+		ic = i__;
+	    } else {
+
+/*              H or H' is applied to C(1:m,i:n) */
+
+		ni = *n - i__ + 1;
+		jc = i__;
+	    }
+
+/*           Apply H or H' */
+
+	    dlarfb_(side, trans, "Forward", "Columnwise", &mi, &ni, &ib, &a[
+		    i__ + i__ * a_dim1], lda, t, &c__65, &c__[ic + jc * 
+		    c_dim1], ldc, &work[1], &ldwork);
+/* L10: */
+	}
+    }
+    work[1] = (doublereal) lwkopt;
+    return 0;
+
+/*     End of DORMQR */
+
+} /* dormqr_ */

+ 96 - 0
min-dgels/additional/dscal.c

@@ -0,0 +1,96 @@
+/* dscal.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dscal_(integer *n, doublereal *da, doublereal *dx, 
+	integer *incx)
+{
+    /* System generated locals */
+    integer i__1, i__2;
+
+    /* Local variables */
+    integer i__, m, mp1, nincx;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+/* * */
+/*     scales a vector by a constant. */
+/*     uses unrolled loops for increment equal to one. */
+/*     jack dongarra, linpack, 3/11/78. */
+/*     modified 3/93 to return if incx .le. 0. */
+/*     modified 12/3/93, array(1) declarations changed to array(*) */
+
+
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+    /* Parameter adjustments */
+    --dx;
+
+    /* Function Body */
+    if (*n <= 0 || *incx <= 0) {
+	return 0;
+    }
+    if (*incx == 1) {
+	goto L20;
+    }
+
+/*        code for increment not equal to 1 */
+
+    nincx = *n * *incx;
+    i__1 = nincx;
+    i__2 = *incx;
+    for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+	dx[i__] = *da * dx[i__];
+/* L10: */
+    }
+    return 0;
+
+/*        code for increment equal to 1 */
+
+
+/*        clean-up loop */
+
+L20:
+    m = *n % 5;
+    if (m == 0) {
+	goto L40;
+    }
+    i__2 = m;
+    for (i__ = 1; i__ <= i__2; ++i__) {
+	dx[i__] = *da * dx[i__];
+/* L30: */
+    }
+    if (*n < 5) {
+	return 0;
+    }
+L40:
+    mp1 = m + 1;
+    i__2 = *n;
+    for (i__ = mp1; i__ <= i__2; i__ += 5) {
+	dx[i__] = *da * dx[i__];
+	dx[i__ + 1] = *da * dx[i__ + 1];
+	dx[i__ + 2] = *da * dx[i__ + 2];
+	dx[i__ + 3] = *da * dx[i__ + 3];
+	dx[i__ + 4] = *da * dx[i__ + 4];
+/* L50: */
+    }
+    return 0;
+} /* dscal_ */

+ 453 - 0
min-dgels/additional/dtrmm.c

@@ -0,0 +1,453 @@
+/* dtrmm.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dtrmm_(char *side, char *uplo, char *transa, char *diag, 
+	integer *m, integer *n, doublereal *alpha, doublereal *a, integer *
+	lda, doublereal *b, integer *ldb)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3;
+
+    /* Local variables */
+    integer i__, j, k, info;
+    doublereal temp;
+    logical lside;
+    extern logical lsame_(char *, char *);
+    integer nrowa;
+    logical upper;
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    logical nounit;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DTRMM  performs one of the matrix-matrix operations */
+
+/*     B := alpha*op( A )*B,   or   B := alpha*B*op( A ), */
+
+/*  where  alpha  is a scalar,  B  is an m by n matrix,  A  is a unit, or */
+/*  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of */
+
+/*     op( A ) = A   or   op( A ) = A'. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  SIDE   - CHARACTER*1. */
+/*           On entry,  SIDE specifies whether  op( A ) multiplies B from */
+/*           the left or right as follows: */
+
+/*              SIDE = 'L' or 'l'   B := alpha*op( A )*B. */
+
+/*              SIDE = 'R' or 'r'   B := alpha*B*op( A ). */
+
+/*           Unchanged on exit. */
+
+/*  UPLO   - CHARACTER*1. */
+/*           On entry, UPLO specifies whether the matrix A is an upper or */
+/*           lower triangular matrix as follows: */
+
+/*              UPLO = 'U' or 'u'   A is an upper triangular matrix. */
+
+/*              UPLO = 'L' or 'l'   A is a lower triangular matrix. */
+
+/*           Unchanged on exit. */
+
+/*  TRANSA - CHARACTER*1. */
+/*           On entry, TRANSA specifies the form of op( A ) to be used in */
+/*           the matrix multiplication as follows: */
+
+/*              TRANSA = 'N' or 'n'   op( A ) = A. */
+
+/*              TRANSA = 'T' or 't'   op( A ) = A'. */
+
+/*              TRANSA = 'C' or 'c'   op( A ) = A'. */
+
+/*           Unchanged on exit. */
+
+/*  DIAG   - CHARACTER*1. */
+/*           On entry, DIAG specifies whether or not A is unit triangular */
+/*           as follows: */
+
+/*              DIAG = 'U' or 'u'   A is assumed to be unit triangular. */
+
+/*              DIAG = 'N' or 'n'   A is not assumed to be unit */
+/*                                  triangular. */
+
+/*           Unchanged on exit. */
+
+/*  M      - INTEGER. */
+/*           On entry, M specifies the number of rows of B. M must be at */
+/*           least zero. */
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the number of columns of B.  N must be */
+/*           at least zero. */
+/*           Unchanged on exit. */
+
+/*  ALPHA  - DOUBLE PRECISION. */
+/*           On entry,  ALPHA specifies the scalar  alpha. When  alpha is */
+/*           zero then  A is not referenced and  B need not be set before */
+/*           entry. */
+/*           Unchanged on exit. */
+
+/*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m */
+/*           when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'. */
+/*           Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k */
+/*           upper triangular part of the array  A must contain the upper */
+/*           triangular matrix  and the strictly lower triangular part of */
+/*           A is not referenced. */
+/*           Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k */
+/*           lower triangular part of the array  A must contain the lower */
+/*           triangular matrix  and the strictly upper triangular part of */
+/*           A is not referenced. */
+/*           Note that when  DIAG = 'U' or 'u',  the diagonal elements of */
+/*           A  are not referenced either,  but are assumed to be  unity. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program.  When  SIDE = 'L' or 'l'  then */
+/*           LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r' */
+/*           then LDA must be at least max( 1, n ). */
+/*           Unchanged on exit. */
+
+/*  B      - DOUBLE PRECISION array of DIMENSION ( LDB, n ). */
+/*           Before entry,  the leading  m by n part of the array  B must */
+/*           contain the matrix  B,  and  on exit  is overwritten  by the */
+/*           transformed matrix. */
+
+/*  LDB    - INTEGER. */
+/*           On entry, LDB specifies the first dimension of B as declared */
+/*           in  the  calling  (sub)  program.   LDB  must  be  at  least */
+/*           max( 1, m ). */
+/*           Unchanged on exit. */
+
+
+/*  Level 3 Blas routine. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Parameters .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+
+    /* Function Body */
+    lside = lsame_(side, "L");
+    if (lside) {
+	nrowa = *m;
+    } else {
+	nrowa = *n;
+    }
+    nounit = lsame_(diag, "N");
+    upper = lsame_(uplo, "U");
+
+    info = 0;
+    if (! lside && ! lsame_(side, "R")) {
+	info = 1;
+    } else if (! upper && ! lsame_(uplo, "L")) {
+	info = 2;
+    } else if (! lsame_(transa, "N") && ! lsame_(transa, 
+	     "T") && ! lsame_(transa, "C")) {
+	info = 3;
+    } else if (! lsame_(diag, "U") && ! lsame_(diag, 
+	    "N")) {
+	info = 4;
+    } else if (*m < 0) {
+	info = 5;
+    } else if (*n < 0) {
+	info = 6;
+    } else if (*lda < max(1,nrowa)) {
+	info = 9;
+    } else if (*ldb < max(1,*m)) {
+	info = 11;
+    }
+    if (info != 0) {
+	xerbla_("DTRMM ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*m == 0 || *n == 0) {
+	return 0;
+    }
+
+/*     And when  alpha.eq.zero. */
+
+    if (*alpha == 0.) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *m;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		b[i__ + j * b_dim1] = 0.;
+/* L10: */
+	    }
+/* L20: */
+	}
+	return 0;
+    }
+
+/*     Start the operations. */
+
+    if (lside) {
+	if (lsame_(transa, "N")) {
+
+/*           Form  B := alpha*A*B. */
+
+	    if (upper) {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    i__2 = *m;
+		    for (k = 1; k <= i__2; ++k) {
+			if (b[k + j * b_dim1] != 0.) {
+			    temp = *alpha * b[k + j * b_dim1];
+			    i__3 = k - 1;
+			    for (i__ = 1; i__ <= i__3; ++i__) {
+				b[i__ + j * b_dim1] += temp * a[i__ + k * 
+					a_dim1];
+/* L30: */
+			    }
+			    if (nounit) {
+				temp *= a[k + k * a_dim1];
+			    }
+			    b[k + j * b_dim1] = temp;
+			}
+/* L40: */
+		    }
+/* L50: */
+		}
+	    } else {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    for (k = *m; k >= 1; --k) {
+			if (b[k + j * b_dim1] != 0.) {
+			    temp = *alpha * b[k + j * b_dim1];
+			    b[k + j * b_dim1] = temp;
+			    if (nounit) {
+				b[k + j * b_dim1] *= a[k + k * a_dim1];
+			    }
+			    i__2 = *m;
+			    for (i__ = k + 1; i__ <= i__2; ++i__) {
+				b[i__ + j * b_dim1] += temp * a[i__ + k * 
+					a_dim1];
+/* L60: */
+			    }
+			}
+/* L70: */
+		    }
+/* L80: */
+		}
+	    }
+	} else {
+
+/*           Form  B := alpha*A'*B. */
+
+	    if (upper) {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    for (i__ = *m; i__ >= 1; --i__) {
+			temp = b[i__ + j * b_dim1];
+			if (nounit) {
+			    temp *= a[i__ + i__ * a_dim1];
+			}
+			i__2 = i__ - 1;
+			for (k = 1; k <= i__2; ++k) {
+			    temp += a[k + i__ * a_dim1] * b[k + j * b_dim1];
+/* L90: */
+			}
+			b[i__ + j * b_dim1] = *alpha * temp;
+/* L100: */
+		    }
+/* L110: */
+		}
+	    } else {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    i__2 = *m;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			temp = b[i__ + j * b_dim1];
+			if (nounit) {
+			    temp *= a[i__ + i__ * a_dim1];
+			}
+			i__3 = *m;
+			for (k = i__ + 1; k <= i__3; ++k) {
+			    temp += a[k + i__ * a_dim1] * b[k + j * b_dim1];
+/* L120: */
+			}
+			b[i__ + j * b_dim1] = *alpha * temp;
+/* L130: */
+		    }
+/* L140: */
+		}
+	    }
+	}
+    } else {
+	if (lsame_(transa, "N")) {
+
+/*           Form  B := alpha*B*A. */
+
+	    if (upper) {
+		for (j = *n; j >= 1; --j) {
+		    temp = *alpha;
+		    if (nounit) {
+			temp *= a[j + j * a_dim1];
+		    }
+		    i__1 = *m;
+		    for (i__ = 1; i__ <= i__1; ++i__) {
+			b[i__ + j * b_dim1] = temp * b[i__ + j * b_dim1];
+/* L150: */
+		    }
+		    i__1 = j - 1;
+		    for (k = 1; k <= i__1; ++k) {
+			if (a[k + j * a_dim1] != 0.) {
+			    temp = *alpha * a[k + j * a_dim1];
+			    i__2 = *m;
+			    for (i__ = 1; i__ <= i__2; ++i__) {
+				b[i__ + j * b_dim1] += temp * b[i__ + k * 
+					b_dim1];
+/* L160: */
+			    }
+			}
+/* L170: */
+		    }
+/* L180: */
+		}
+	    } else {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    temp = *alpha;
+		    if (nounit) {
+			temp *= a[j + j * a_dim1];
+		    }
+		    i__2 = *m;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			b[i__ + j * b_dim1] = temp * b[i__ + j * b_dim1];
+/* L190: */
+		    }
+		    i__2 = *n;
+		    for (k = j + 1; k <= i__2; ++k) {
+			if (a[k + j * a_dim1] != 0.) {
+			    temp = *alpha * a[k + j * a_dim1];
+			    i__3 = *m;
+			    for (i__ = 1; i__ <= i__3; ++i__) {
+				b[i__ + j * b_dim1] += temp * b[i__ + k * 
+					b_dim1];
+/* L200: */
+			    }
+			}
+/* L210: */
+		    }
+/* L220: */
+		}
+	    }
+	} else {
+
+/*           Form  B := alpha*B*A'. */
+
+	    if (upper) {
+		i__1 = *n;
+		for (k = 1; k <= i__1; ++k) {
+		    i__2 = k - 1;
+		    for (j = 1; j <= i__2; ++j) {
+			if (a[j + k * a_dim1] != 0.) {
+			    temp = *alpha * a[j + k * a_dim1];
+			    i__3 = *m;
+			    for (i__ = 1; i__ <= i__3; ++i__) {
+				b[i__ + j * b_dim1] += temp * b[i__ + k * 
+					b_dim1];
+/* L230: */
+			    }
+			}
+/* L240: */
+		    }
+		    temp = *alpha;
+		    if (nounit) {
+			temp *= a[k + k * a_dim1];
+		    }
+		    if (temp != 1.) {
+			i__2 = *m;
+			for (i__ = 1; i__ <= i__2; ++i__) {
+			    b[i__ + k * b_dim1] = temp * b[i__ + k * b_dim1];
+/* L250: */
+			}
+		    }
+/* L260: */
+		}
+	    } else {
+		for (k = *n; k >= 1; --k) {
+		    i__1 = *n;
+		    for (j = k + 1; j <= i__1; ++j) {
+			if (a[j + k * a_dim1] != 0.) {
+			    temp = *alpha * a[j + k * a_dim1];
+			    i__2 = *m;
+			    for (i__ = 1; i__ <= i__2; ++i__) {
+				b[i__ + j * b_dim1] += temp * b[i__ + k * 
+					b_dim1];
+/* L270: */
+			    }
+			}
+/* L280: */
+		    }
+		    temp = *alpha;
+		    if (nounit) {
+			temp *= a[k + k * a_dim1];
+		    }
+		    if (temp != 1.) {
+			i__1 = *m;
+			for (i__ = 1; i__ <= i__1; ++i__) {
+			    b[i__ + k * b_dim1] = temp * b[i__ + k * b_dim1];
+/* L290: */
+			}
+		    }
+/* L300: */
+		}
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of DTRMM . */
+
+} /* dtrmm_ */

+ 345 - 0
min-dgels/additional/dtrmv.c

@@ -0,0 +1,345 @@
+/* dtrmv.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dtrmv_(char *uplo, char *trans, char *diag, integer *n, 
+	doublereal *a, integer *lda, doublereal *x, integer *incx)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2;
+
+    /* Local variables */
+    integer i__, j, ix, jx, kx, info;
+    doublereal temp;
+    extern logical lsame_(char *, char *);
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    logical nounit;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DTRMV  performs one of the matrix-vector operations */
+
+/*     x := A*x,   or   x := A'*x, */
+
+/*  where x is an n element vector and  A is an n by n unit, or non-unit, */
+/*  upper or lower triangular matrix. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  UPLO   - CHARACTER*1. */
+/*           On entry, UPLO specifies whether the matrix is an upper or */
+/*           lower triangular matrix as follows: */
+
+/*              UPLO = 'U' or 'u'   A is an upper triangular matrix. */
+
+/*              UPLO = 'L' or 'l'   A is a lower triangular matrix. */
+
+/*           Unchanged on exit. */
+
+/*  TRANS  - CHARACTER*1. */
+/*           On entry, TRANS specifies the operation to be performed as */
+/*           follows: */
+
+/*              TRANS = 'N' or 'n'   x := A*x. */
+
+/*              TRANS = 'T' or 't'   x := A'*x. */
+
+/*              TRANS = 'C' or 'c'   x := A'*x. */
+
+/*           Unchanged on exit. */
+
+/*  DIAG   - CHARACTER*1. */
+/*           On entry, DIAG specifies whether or not A is unit */
+/*           triangular as follows: */
+
+/*              DIAG = 'U' or 'u'   A is assumed to be unit triangular. */
+
+/*              DIAG = 'N' or 'n'   A is not assumed to be unit */
+/*                                  triangular. */
+
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the order of the matrix A. */
+/*           N must be at least zero. */
+/*           Unchanged on exit. */
+
+/*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
+/*           Before entry with  UPLO = 'U' or 'u', the leading n by n */
+/*           upper triangular part of the array A must contain the upper */
+/*           triangular matrix and the strictly lower triangular part of */
+/*           A is not referenced. */
+/*           Before entry with UPLO = 'L' or 'l', the leading n by n */
+/*           lower triangular part of the array A must contain the lower */
+/*           triangular matrix and the strictly upper triangular part of */
+/*           A is not referenced. */
+/*           Note that when  DIAG = 'U' or 'u', the diagonal elements of */
+/*           A are not referenced either, but are assumed to be unity. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program. LDA must be at least */
+/*           max( 1, n ). */
+/*           Unchanged on exit. */
+
+/*  X      - DOUBLE PRECISION array of dimension at least */
+/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
+/*           Before entry, the incremented array X must contain the n */
+/*           element vector x. On exit, X is overwritten with the */
+/*           tranformed vector x. */
+
+/*  INCX   - INTEGER. */
+/*           On entry, INCX specifies the increment for the elements of */
+/*           X. INCX must not be zero. */
+/*           Unchanged on exit. */
+
+
+/*  Level 2 Blas routine. */
+
+/*  -- Written on 22-October-1986. */
+/*     Jack Dongarra, Argonne National Lab. */
+/*     Jeremy Du Croz, Nag Central Office. */
+/*     Sven Hammarling, Nag Central Office. */
+/*     Richard Hanson, Sandia National Labs. */
+
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --x;
+
+    /* Function Body */
+    info = 0;
+    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+	info = 1;
+    } else if (! lsame_(trans, "N") && ! lsame_(trans, 
+	    "T") && ! lsame_(trans, "C")) {
+	info = 2;
+    } else if (! lsame_(diag, "U") && ! lsame_(diag, 
+	    "N")) {
+	info = 3;
+    } else if (*n < 0) {
+	info = 4;
+    } else if (*lda < max(1,*n)) {
+	info = 6;
+    } else if (*incx == 0) {
+	info = 8;
+    }
+    if (info != 0) {
+	xerbla_("DTRMV ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+	return 0;
+    }
+
+    nounit = lsame_(diag, "N");
+
+/*     Set up the start point in X if the increment is not unity. This */
+/*     will be  ( N - 1 )*INCX  too small for descending loops. */
+
+    if (*incx <= 0) {
+	kx = 1 - (*n - 1) * *incx;
+    } else if (*incx != 1) {
+	kx = 1;
+    }
+
+/*     Start the operations. In this version the elements of A are */
+/*     accessed sequentially with one pass through A. */
+
+    if (lsame_(trans, "N")) {
+
+/*        Form  x := A*x. */
+
+	if (lsame_(uplo, "U")) {
+	    if (*incx == 1) {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    if (x[j] != 0.) {
+			temp = x[j];
+			i__2 = j - 1;
+			for (i__ = 1; i__ <= i__2; ++i__) {
+			    x[i__] += temp * a[i__ + j * a_dim1];
+/* L10: */
+			}
+			if (nounit) {
+			    x[j] *= a[j + j * a_dim1];
+			}
+		    }
+/* L20: */
+		}
+	    } else {
+		jx = kx;
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    if (x[jx] != 0.) {
+			temp = x[jx];
+			ix = kx;
+			i__2 = j - 1;
+			for (i__ = 1; i__ <= i__2; ++i__) {
+			    x[ix] += temp * a[i__ + j * a_dim1];
+			    ix += *incx;
+/* L30: */
+			}
+			if (nounit) {
+			    x[jx] *= a[j + j * a_dim1];
+			}
+		    }
+		    jx += *incx;
+/* L40: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    if (x[j] != 0.) {
+			temp = x[j];
+			i__1 = j + 1;
+			for (i__ = *n; i__ >= i__1; --i__) {
+			    x[i__] += temp * a[i__ + j * a_dim1];
+/* L50: */
+			}
+			if (nounit) {
+			    x[j] *= a[j + j * a_dim1];
+			}
+		    }
+/* L60: */
+		}
+	    } else {
+		kx += (*n - 1) * *incx;
+		jx = kx;
+		for (j = *n; j >= 1; --j) {
+		    if (x[jx] != 0.) {
+			temp = x[jx];
+			ix = kx;
+			i__1 = j + 1;
+			for (i__ = *n; i__ >= i__1; --i__) {
+			    x[ix] += temp * a[i__ + j * a_dim1];
+			    ix -= *incx;
+/* L70: */
+			}
+			if (nounit) {
+			    x[jx] *= a[j + j * a_dim1];
+			}
+		    }
+		    jx -= *incx;
+/* L80: */
+		}
+	    }
+	}
+    } else {
+
+/*        Form  x := A'*x. */
+
+	if (lsame_(uplo, "U")) {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    temp = x[j];
+		    if (nounit) {
+			temp *= a[j + j * a_dim1];
+		    }
+		    for (i__ = j - 1; i__ >= 1; --i__) {
+			temp += a[i__ + j * a_dim1] * x[i__];
+/* L90: */
+		    }
+		    x[j] = temp;
+/* L100: */
+		}
+	    } else {
+		jx = kx + (*n - 1) * *incx;
+		for (j = *n; j >= 1; --j) {
+		    temp = x[jx];
+		    ix = jx;
+		    if (nounit) {
+			temp *= a[j + j * a_dim1];
+		    }
+		    for (i__ = j - 1; i__ >= 1; --i__) {
+			ix -= *incx;
+			temp += a[i__ + j * a_dim1] * x[ix];
+/* L110: */
+		    }
+		    x[jx] = temp;
+		    jx -= *incx;
+/* L120: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    temp = x[j];
+		    if (nounit) {
+			temp *= a[j + j * a_dim1];
+		    }
+		    i__2 = *n;
+		    for (i__ = j + 1; i__ <= i__2; ++i__) {
+			temp += a[i__ + j * a_dim1] * x[i__];
+/* L130: */
+		    }
+		    x[j] = temp;
+/* L140: */
+		}
+	    } else {
+		jx = kx;
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    temp = x[jx];
+		    ix = jx;
+		    if (nounit) {
+			temp *= a[j + j * a_dim1];
+		    }
+		    i__2 = *n;
+		    for (i__ = j + 1; i__ <= i__2; ++i__) {
+			ix += *incx;
+			temp += a[i__ + j * a_dim1] * x[ix];
+/* L150: */
+		    }
+		    x[jx] = temp;
+		    jx += *incx;
+/* L160: */
+		}
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of DTRMV . */
+
+} /* dtrmv_ */

+ 490 - 0
min-dgels/additional/dtrsm.c

@@ -0,0 +1,490 @@
+/* dtrsm.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Subroutine */ int dtrsm_(char *side, char *uplo, char *transa, char *diag, 
+	integer *m, integer *n, doublereal *alpha, doublereal *a, integer *
+	lda, doublereal *b, integer *ldb)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3;
+
+    /* Local variables */
+    integer i__, j, k, info;
+    doublereal temp;
+    logical lside;
+    extern logical lsame_(char *, char *);
+    integer nrowa;
+    logical upper;
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    logical nounit;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DTRSM  solves one of the matrix equations */
+
+/*     op( A )*X = alpha*B,   or   X*op( A ) = alpha*B, */
+
+/*  where alpha is a scalar, X and B are m by n matrices, A is a unit, or */
+/*  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of */
+
+/*     op( A ) = A   or   op( A ) = A'. */
+
+/*  The matrix X is overwritten on B. */
+
+/*  Arguments */
+/*  ========== */
+
+/*  SIDE   - CHARACTER*1. */
+/*           On entry, SIDE specifies whether op( A ) appears on the left */
+/*           or right of X as follows: */
+
+/*              SIDE = 'L' or 'l'   op( A )*X = alpha*B. */
+
+/*              SIDE = 'R' or 'r'   X*op( A ) = alpha*B. */
+
+/*           Unchanged on exit. */
+
+/*  UPLO   - CHARACTER*1. */
+/*           On entry, UPLO specifies whether the matrix A is an upper or */
+/*           lower triangular matrix as follows: */
+
+/*              UPLO = 'U' or 'u'   A is an upper triangular matrix. */
+
+/*              UPLO = 'L' or 'l'   A is a lower triangular matrix. */
+
+/*           Unchanged on exit. */
+
+/*  TRANSA - CHARACTER*1. */
+/*           On entry, TRANSA specifies the form of op( A ) to be used in */
+/*           the matrix multiplication as follows: */
+
+/*              TRANSA = 'N' or 'n'   op( A ) = A. */
+
+/*              TRANSA = 'T' or 't'   op( A ) = A'. */
+
+/*              TRANSA = 'C' or 'c'   op( A ) = A'. */
+
+/*           Unchanged on exit. */
+
+/*  DIAG   - CHARACTER*1. */
+/*           On entry, DIAG specifies whether or not A is unit triangular */
+/*           as follows: */
+
+/*              DIAG = 'U' or 'u'   A is assumed to be unit triangular. */
+
+/*              DIAG = 'N' or 'n'   A is not assumed to be unit */
+/*                                  triangular. */
+
+/*           Unchanged on exit. */
+
+/*  M      - INTEGER. */
+/*           On entry, M specifies the number of rows of B. M must be at */
+/*           least zero. */
+/*           Unchanged on exit. */
+
+/*  N      - INTEGER. */
+/*           On entry, N specifies the number of columns of B.  N must be */
+/*           at least zero. */
+/*           Unchanged on exit. */
+
+/*  ALPHA  - DOUBLE PRECISION. */
+/*           On entry,  ALPHA specifies the scalar  alpha. When  alpha is */
+/*           zero then  A is not referenced and  B need not be set before */
+/*           entry. */
+/*           Unchanged on exit. */
+
+/*  A      - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m */
+/*           when  SIDE = 'L' or 'l'  and is  n  when  SIDE = 'R' or 'r'. */
+/*           Before entry  with  UPLO = 'U' or 'u',  the  leading  k by k */
+/*           upper triangular part of the array  A must contain the upper */
+/*           triangular matrix  and the strictly lower triangular part of */
+/*           A is not referenced. */
+/*           Before entry  with  UPLO = 'L' or 'l',  the  leading  k by k */
+/*           lower triangular part of the array  A must contain the lower */
+/*           triangular matrix  and the strictly upper triangular part of */
+/*           A is not referenced. */
+/*           Note that when  DIAG = 'U' or 'u',  the diagonal elements of */
+/*           A  are not referenced either,  but are assumed to be  unity. */
+/*           Unchanged on exit. */
+
+/*  LDA    - INTEGER. */
+/*           On entry, LDA specifies the first dimension of A as declared */
+/*           in the calling (sub) program.  When  SIDE = 'L' or 'l'  then */
+/*           LDA  must be at least  max( 1, m ),  when  SIDE = 'R' or 'r' */
+/*           then LDA must be at least max( 1, n ). */
+/*           Unchanged on exit. */
+
+/*  B      - DOUBLE PRECISION array of DIMENSION ( LDB, n ). */
+/*           Before entry,  the leading  m by n part of the array  B must */
+/*           contain  the  right-hand  side  matrix  B,  and  on exit  is */
+/*           overwritten by the solution matrix  X. */
+
+/*  LDB    - INTEGER. */
+/*           On entry, LDB specifies the first dimension of B as declared */
+/*           in  the  calling  (sub)  program.   LDB  must  be  at  least */
+/*           max( 1, m ). */
+/*           Unchanged on exit. */
+
+
+/*  Level 3 Blas routine. */
+
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Parameters .. */
+/*     .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+
+    /* Function Body */
+    lside = lsame_(side, "L");
+    if (lside) {
+	nrowa = *m;
+    } else {
+	nrowa = *n;
+    }
+    nounit = lsame_(diag, "N");
+    upper = lsame_(uplo, "U");
+
+    info = 0;
+    if (! lside && ! lsame_(side, "R")) {
+	info = 1;
+    } else if (! upper && ! lsame_(uplo, "L")) {
+	info = 2;
+    } else if (! lsame_(transa, "N") && ! lsame_(transa, 
+	     "T") && ! lsame_(transa, "C")) {
+	info = 3;
+    } else if (! lsame_(diag, "U") && ! lsame_(diag, 
+	    "N")) {
+	info = 4;
+    } else if (*m < 0) {
+	info = 5;
+    } else if (*n < 0) {
+	info = 6;
+    } else if (*lda < max(1,nrowa)) {
+	info = 9;
+    } else if (*ldb < max(1,*m)) {
+	info = 11;
+    }
+    if (info != 0) {
+	xerbla_("DTRSM ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*m == 0 || *n == 0) {
+	return 0;
+    }
+
+/*     And when  alpha.eq.zero. */
+
+    if (*alpha == 0.) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *m;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		b[i__ + j * b_dim1] = 0.;
+/* L10: */
+	    }
+/* L20: */
+	}
+	return 0;
+    }
+
+/*     Start the operations. */
+
+    if (lside) {
+	if (lsame_(transa, "N")) {
+
+/*           Form  B := alpha*inv( A )*B. */
+
+	    if (upper) {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    if (*alpha != 1.) {
+			i__2 = *m;
+			for (i__ = 1; i__ <= i__2; ++i__) {
+			    b[i__ + j * b_dim1] = *alpha * b[i__ + j * b_dim1]
+				    ;
+/* L30: */
+			}
+		    }
+		    for (k = *m; k >= 1; --k) {
+			if (b[k + j * b_dim1] != 0.) {
+			    if (nounit) {
+				b[k + j * b_dim1] /= a[k + k * a_dim1];
+			    }
+			    i__2 = k - 1;
+			    for (i__ = 1; i__ <= i__2; ++i__) {
+				b[i__ + j * b_dim1] -= b[k + j * b_dim1] * a[
+					i__ + k * a_dim1];
+/* L40: */
+			    }
+			}
+/* L50: */
+		    }
+/* L60: */
+		}
+	    } else {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    if (*alpha != 1.) {
+			i__2 = *m;
+			for (i__ = 1; i__ <= i__2; ++i__) {
+			    b[i__ + j * b_dim1] = *alpha * b[i__ + j * b_dim1]
+				    ;
+/* L70: */
+			}
+		    }
+		    i__2 = *m;
+		    for (k = 1; k <= i__2; ++k) {
+			if (b[k + j * b_dim1] != 0.) {
+			    if (nounit) {
+				b[k + j * b_dim1] /= a[k + k * a_dim1];
+			    }
+			    i__3 = *m;
+			    for (i__ = k + 1; i__ <= i__3; ++i__) {
+				b[i__ + j * b_dim1] -= b[k + j * b_dim1] * a[
+					i__ + k * a_dim1];
+/* L80: */
+			    }
+			}
+/* L90: */
+		    }
+/* L100: */
+		}
+	    }
+	} else {
+
+/*           Form  B := alpha*inv( A' )*B. */
+
+	    if (upper) {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    i__2 = *m;
+		    for (i__ = 1; i__ <= i__2; ++i__) {
+			temp = *alpha * b[i__ + j * b_dim1];
+			i__3 = i__ - 1;
+			for (k = 1; k <= i__3; ++k) {
+			    temp -= a[k + i__ * a_dim1] * b[k + j * b_dim1];
+/* L110: */
+			}
+			if (nounit) {
+			    temp /= a[i__ + i__ * a_dim1];
+			}
+			b[i__ + j * b_dim1] = temp;
+/* L120: */
+		    }
+/* L130: */
+		}
+	    } else {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    for (i__ = *m; i__ >= 1; --i__) {
+			temp = *alpha * b[i__ + j * b_dim1];
+			i__2 = *m;
+			for (k = i__ + 1; k <= i__2; ++k) {
+			    temp -= a[k + i__ * a_dim1] * b[k + j * b_dim1];
+/* L140: */
+			}
+			if (nounit) {
+			    temp /= a[i__ + i__ * a_dim1];
+			}
+			b[i__ + j * b_dim1] = temp;
+/* L150: */
+		    }
+/* L160: */
+		}
+	    }
+	}
+    } else {
+	if (lsame_(transa, "N")) {
+
+/*           Form  B := alpha*B*inv( A ). */
+
+	    if (upper) {
+		i__1 = *n;
+		for (j = 1; j <= i__1; ++j) {
+		    if (*alpha != 1.) {
+			i__2 = *m;
+			for (i__ = 1; i__ <= i__2; ++i__) {
+			    b[i__ + j * b_dim1] = *alpha * b[i__ + j * b_dim1]
+				    ;
+/* L170: */
+			}
+		    }
+		    i__2 = j - 1;
+		    for (k = 1; k <= i__2; ++k) {
+			if (a[k + j * a_dim1] != 0.) {
+			    i__3 = *m;
+			    for (i__ = 1; i__ <= i__3; ++i__) {
+				b[i__ + j * b_dim1] -= a[k + j * a_dim1] * b[
+					i__ + k * b_dim1];
+/* L180: */
+			    }
+			}
+/* L190: */
+		    }
+		    if (nounit) {
+			temp = 1. / a[j + j * a_dim1];
+			i__2 = *m;
+			for (i__ = 1; i__ <= i__2; ++i__) {
+			    b[i__ + j * b_dim1] = temp * b[i__ + j * b_dim1];
+/* L200: */
+			}
+		    }
+/* L210: */
+		}
+	    } else {
+		for (j = *n; j >= 1; --j) {
+		    if (*alpha != 1.) {
+			i__1 = *m;
+			for (i__ = 1; i__ <= i__1; ++i__) {
+			    b[i__ + j * b_dim1] = *alpha * b[i__ + j * b_dim1]
+				    ;
+/* L220: */
+			}
+		    }
+		    i__1 = *n;
+		    for (k = j + 1; k <= i__1; ++k) {
+			if (a[k + j * a_dim1] != 0.) {
+			    i__2 = *m;
+			    for (i__ = 1; i__ <= i__2; ++i__) {
+				b[i__ + j * b_dim1] -= a[k + j * a_dim1] * b[
+					i__ + k * b_dim1];
+/* L230: */
+			    }
+			}
+/* L240: */
+		    }
+		    if (nounit) {
+			temp = 1. / a[j + j * a_dim1];
+			i__1 = *m;
+			for (i__ = 1; i__ <= i__1; ++i__) {
+			    b[i__ + j * b_dim1] = temp * b[i__ + j * b_dim1];
+/* L250: */
+			}
+		    }
+/* L260: */
+		}
+	    }
+	} else {
+
+/*           Form  B := alpha*B*inv( A' ). */
+
+	    if (upper) {
+		for (k = *n; k >= 1; --k) {
+		    if (nounit) {
+			temp = 1. / a[k + k * a_dim1];
+			i__1 = *m;
+			for (i__ = 1; i__ <= i__1; ++i__) {
+			    b[i__ + k * b_dim1] = temp * b[i__ + k * b_dim1];
+/* L270: */
+			}
+		    }
+		    i__1 = k - 1;
+		    for (j = 1; j <= i__1; ++j) {
+			if (a[j + k * a_dim1] != 0.) {
+			    temp = a[j + k * a_dim1];
+			    i__2 = *m;
+			    for (i__ = 1; i__ <= i__2; ++i__) {
+				b[i__ + j * b_dim1] -= temp * b[i__ + k * 
+					b_dim1];
+/* L280: */
+			    }
+			}
+/* L290: */
+		    }
+		    if (*alpha != 1.) {
+			i__1 = *m;
+			for (i__ = 1; i__ <= i__1; ++i__) {
+			    b[i__ + k * b_dim1] = *alpha * b[i__ + k * b_dim1]
+				    ;
+/* L300: */
+			}
+		    }
+/* L310: */
+		}
+	    } else {
+		i__1 = *n;
+		for (k = 1; k <= i__1; ++k) {
+		    if (nounit) {
+			temp = 1. / a[k + k * a_dim1];
+			i__2 = *m;
+			for (i__ = 1; i__ <= i__2; ++i__) {
+			    b[i__ + k * b_dim1] = temp * b[i__ + k * b_dim1];
+/* L320: */
+			}
+		    }
+		    i__2 = *n;
+		    for (j = k + 1; j <= i__2; ++j) {
+			if (a[j + k * a_dim1] != 0.) {
+			    temp = a[j + k * a_dim1];
+			    i__3 = *m;
+			    for (i__ = 1; i__ <= i__3; ++i__) {
+				b[i__ + j * b_dim1] -= temp * b[i__ + k * 
+					b_dim1];
+/* L330: */
+			    }
+			}
+/* L340: */
+		    }
+		    if (*alpha != 1.) {
+			i__2 = *m;
+			for (i__ = 1; i__ <= i__2; ++i__) {
+			    b[i__ + k * b_dim1] = *alpha * b[i__ + k * b_dim1]
+				    ;
+/* L350: */
+			}
+		    }
+/* L360: */
+		}
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of DTRSM . */
+
+} /* dtrsm_ */

+ 183 - 0
min-dgels/additional/dtrtrs.c

@@ -0,0 +1,183 @@
+/* dtrtrs.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+/* Table of constant values */
+
+static doublereal c_b12 = 1.;
+
+/* Subroutine */ int dtrtrs_(char *uplo, char *trans, char *diag, integer *n, 
+	integer *nrhs, doublereal *a, integer *lda, doublereal *b, integer *
+	ldb, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* Local variables */
+    extern logical lsame_(char *, char *);
+    extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, 
+	    integer *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, integer *), xerbla_(
+	    char *, integer *);
+    logical nounit;
+
+
+/*  -- LAPACK routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  DTRTRS solves a triangular system of the form */
+
+/*     A * X = B  or  A**T * X = B, */
+
+/*  where A is a triangular matrix of order N, and B is an N-by-NRHS */
+/*  matrix.  A check is made to verify that A is nonsingular. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  UPLO    (input) CHARACTER*1 */
+/*          = 'U':  A is upper triangular; */
+/*          = 'L':  A is lower triangular. */
+
+/*  TRANS   (input) CHARACTER*1 */
+/*          Specifies the form of the system of equations: */
+/*          = 'N':  A * X = B  (No transpose) */
+/*          = 'T':  A**T * X = B  (Transpose) */
+/*          = 'C':  A**H * X = B  (Conjugate transpose = Transpose) */
+
+/*  DIAG    (input) CHARACTER*1 */
+/*          = 'N':  A is non-unit triangular; */
+/*          = 'U':  A is unit triangular. */
+
+/*  N       (input) INTEGER */
+/*          The order of the matrix A.  N >= 0. */
+
+/*  NRHS    (input) INTEGER */
+/*          The number of right hand sides, i.e., the number of columns */
+/*          of the matrix B.  NRHS >= 0. */
+
+/*  A       (input) DOUBLE PRECISION array, dimension (LDA,N) */
+/*          The triangular matrix A.  If UPLO = 'U', the leading N-by-N */
+/*          upper triangular part of the array A contains the upper */
+/*          triangular matrix, and the strictly lower triangular part of */
+/*          A is not referenced.  If UPLO = 'L', the leading N-by-N lower */
+/*          triangular part of the array A contains the lower triangular */
+/*          matrix, and the strictly upper triangular part of A is not */
+/*          referenced.  If DIAG = 'U', the diagonal elements of A are */
+/*          also not referenced and are assumed to be 1. */
+
+/*  LDA     (input) INTEGER */
+/*          The leading dimension of the array A.  LDA >= max(1,N). */
+
+/*  B       (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS) */
+/*          On entry, the right hand side matrix B. */
+/*          On exit, if INFO = 0, the solution matrix X. */
+
+/*  LDB     (input) INTEGER */
+/*          The leading dimension of the array B.  LDB >= max(1,N). */
+
+/*  INFO    (output) INTEGER */
+/*          = 0:  successful exit */
+/*          < 0: if INFO = -i, the i-th argument had an illegal value */
+/*          > 0: if INFO = i, the i-th diagonal element of A is zero, */
+/*               indicating that the matrix is singular and the solutions */
+/*               X have not been computed. */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. External Subroutines .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+
+    /* Function Body */
+    *info = 0;
+    nounit = lsame_(diag, "N");
+    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+	*info = -1;
+    } else if (! lsame_(trans, "N") && ! lsame_(trans, 
+	    "T") && ! lsame_(trans, "C")) {
+	*info = -2;
+    } else if (! nounit && ! lsame_(diag, "U")) {
+	*info = -3;
+    } else if (*n < 0) {
+	*info = -4;
+    } else if (*nrhs < 0) {
+	*info = -5;
+    } else if (*lda < max(1,*n)) {
+	*info = -7;
+    } else if (*ldb < max(1,*n)) {
+	*info = -9;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DTRTRS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*n == 0) {
+	return 0;
+    }
+
+/*     Check for singularity. */
+
+    if (nounit) {
+	i__1 = *n;
+	for (*info = 1; *info <= i__1; ++(*info)) {
+	    if (a[*info + *info * a_dim1] == 0.) {
+		return 0;
+	    }
+/* L10: */
+	}
+    }
+    *info = 0;
+
+/*     Solve A * x = b  or  A' * x = b. */
+
+    dtrsm_("Left", uplo, trans, diag, n, nrhs, &c_b12, &a[a_offset], lda, &b[
+	    b_offset], ldb);
+
+    return 0;
+
+/*     End of DTRTRS */
+
+} /* dtrtrs_ */

+ 223 - 0
min-dgels/additional/f2c.h

@@ -0,0 +1,223 @@
+/* f2c.h  --  Standard Fortran to C header file */
+
+/**  barf  [ba:rf]  2.  "He suggested using FORTRAN, and everybody barfed."
+
+	- From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */
+
+#ifndef F2C_INCLUDE
+#define F2C_INCLUDE
+
+typedef long int integer;
+typedef unsigned long int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+typedef long int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+#ifdef INTEGER_STAR_8	/* Adjust for integer*8. */
+typedef long long longint;		/* system-dependent */
+typedef unsigned long long ulongint;	/* system-dependent */
+#define qbit_clear(a,b)	((a) & ~((ulongint)1 << (b)))
+#define qbit_set(a,b)	((a) |  ((ulongint)1 << (b)))
+#endif
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+#ifdef f2c_i2
+/* for -i2 */
+typedef short flag;
+typedef short ftnlen;
+typedef short ftnint;
+#else
+typedef long int flag;
+typedef long int ftnlen;
+typedef long int ftnint;
+#endif
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+/*typedef long int Long;*/	/* No longer used; formerly in Namelist */
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (doublereal)abs(x)
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (doublereal)min(a,b)
+#define dmax(a,b) (doublereal)max(a,b)
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef int /* Unknown procedure type */ (*U_fp)(...);
+typedef shortint (*J_fp)(...);
+typedef integer (*I_fp)(...);
+typedef real (*R_fp)(...);
+typedef doublereal (*D_fp)(...), (*E_fp)(...);
+typedef /* Complex */ VOID (*C_fp)(...);
+typedef /* Double Complex */ VOID (*Z_fp)(...);
+typedef logical (*L_fp)(...);
+typedef shortlogical (*K_fp)(...);
+typedef /* Character */ VOID (*H_fp)(...);
+typedef /* Subroutine */ int (*S_fp)(...);
+#else
+typedef int /* Unknown procedure type */ (*U_fp)();
+typedef shortint (*J_fp)();
+typedef integer (*I_fp)();
+typedef real (*R_fp)();
+typedef doublereal (*D_fp)(), (*E_fp)();
+typedef /* Complex */ VOID (*C_fp)();
+typedef /* Double Complex */ VOID (*Z_fp)();
+typedef logical (*L_fp)();
+typedef shortlogical (*K_fp)();
+typedef /* Character */ VOID (*H_fp)();
+typedef /* Subroutine */ int (*S_fp)();
+#endif
+/* E_fp is for real functions when -R is not specified */
+typedef VOID C_f;	/* complex function */
+typedef VOID H_f;	/* character function */
+typedef VOID Z_f;	/* double complex function */
+typedef doublereal E_f;	/* real function with -R not specified */
+
+/* undef any lower-case symbols that your C compiler predefines, e.g.: */
+
+#ifndef Skip_f2c_Undefs
+#undef cray
+#undef gcos
+#undef mc68010
+#undef mc68020
+#undef mips
+#undef pdp11
+#undef sgi
+#undef sparc
+#undef sun
+#undef sun2
+#undef sun3
+#undef sun4
+#undef u370
+#undef u3b
+#undef u3b2
+#undef u3b5
+#undef unix
+#undef vax
+#endif
+#endif

+ 141 - 0
min-dgels/additional/fio.h

@@ -0,0 +1,141 @@
+#ifndef SYSDEP_H_INCLUDED
+#include "sysdep1.h"
+#endif
+#include "stdio.h"
+#include "errno.h"
+#ifndef NULL
+/* ANSI C */
+#include "stddef.h"
+#endif
+
+#ifndef SEEK_SET
+#define SEEK_SET 0
+#define SEEK_CUR 1
+#define SEEK_END 2
+#endif
+
+#ifndef FOPEN
+#define FOPEN fopen
+#endif
+
+#ifndef FREOPEN
+#define FREOPEN freopen
+#endif
+
+#ifndef FSEEK
+#define FSEEK fseek
+#endif
+
+#ifndef FSTAT
+#define FSTAT fstat
+#endif
+
+#ifndef FTELL
+#define FTELL ftell
+#endif
+
+#ifndef OFF_T
+#define OFF_T long
+#endif
+
+#ifndef STAT_ST
+#define STAT_ST stat
+#endif
+
+#ifndef STAT
+#define STAT stat
+#endif
+
+#ifdef MSDOS
+#ifndef NON_UNIX_STDIO
+#define NON_UNIX_STDIO
+#endif
+#endif
+
+#ifdef UIOLEN_int
+typedef int uiolen;
+#else
+typedef long uiolen;
+#endif
+
+/*units*/
+typedef struct
+{	FILE *ufd;	/*0=unconnected*/
+	char *ufnm;
+#ifndef MSDOS
+	long uinode;
+	int udev;
+#endif
+	int url;	/*0=sequential*/
+	flag useek;	/*true=can backspace, use dir, ...*/
+	flag ufmt;
+	flag urw;	/* (1 for can read) | (2 for can write) */
+	flag ublnk;
+	flag uend;
+	flag uwrt;	/*last io was write*/
+	flag uscrtch;
+} unit;
+
+#undef Void
+#ifdef KR_headers
+#define Void /*void*/
+extern int (*f__getn)();	/* for formatted input */
+extern void (*f__putn)();	/* for formatted output */
+extern void x_putc();
+extern long f__inode();
+extern VOID sig_die();
+extern int (*f__donewrec)(), t_putc(), x_wSL();
+extern int c_sfe(), err__fl(), xrd_SL(), f__putbuf();
+#else
+#define Void void
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern int (*f__getn)(void);	/* for formatted input */
+extern void (*f__putn)(int);	/* for formatted output */
+extern void x_putc(int);
+extern long f__inode(char*,int*);
+extern void sig_die(const char*,int);
+extern void f__fatal(int, const char*);
+extern int t_runc(alist*);
+extern int f__nowreading(unit*), f__nowwriting(unit*);
+extern int fk_open(int,int,ftnint);
+extern int en_fio(void);
+extern void f_init(void);
+extern int (*f__donewrec)(void), t_putc(int), x_wSL(void);
+extern void b_char(const char*,char*,ftnlen), g_char(const char*,ftnlen,char*);
+extern int c_sfe(cilist*), z_rnew(void);
+extern int err__fl(int,int,const char*);
+extern int xrd_SL(void);
+extern int f__putbuf(int);
+#endif
+extern flag f__init;
+extern cilist *f__elist;	/*active external io list*/
+extern flag f__reading,f__external,f__sequential,f__formatted;
+extern int (*f__doend)(Void);
+extern FILE *f__cf;	/*current file*/
+extern unit *f__curunit;	/*current unit*/
+extern unit f__units[];
+#define err(f,m,s) {if(f) errno= m; else f__fatal(m,s); return(m);}
+#define errfl(f,m,s) return err__fl((int)f,m,s)
+
+/*Table sizes*/
+#define MXUNIT 100
+
+extern int f__recpos;	/*position in current record*/
+extern OFF_T f__cursor;	/* offset to move to */
+extern OFF_T f__hiwater;	/* so TL doesn't confuse us */
+#ifdef __cplusplus
+	}
+#endif
+
+#define WRITE	1
+#define READ	2
+#define SEQ	3
+#define DIR	4
+#define FMT	5
+#define UNF	6
+#define EXT	7
+#define INT	8
+
+#define buf_end(x) (x->_flag & _IONBF ? x->_ptr : x->_base + BUFSIZ)

+ 530 - 0
min-dgels/additional/fmt.c

@@ -0,0 +1,530 @@
+#include "f2c.h"
+#include "fio.h"
+#include "fmt.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#define skip(s) while(*s==' ') s++
+#ifdef interdata
+#define SYLMX 300
+#endif
+#ifdef pdp11
+#define SYLMX 300
+#endif
+#ifdef vax
+#define SYLMX 300
+#endif
+#ifndef SYLMX
+#define SYLMX 300
+#endif
+#define GLITCH '\2'
+	/* special quote character for stu */
+extern flag f__cblank,f__cplus;	/*blanks in I and compulsory plus*/
+static struct syl f__syl[SYLMX];
+int f__parenlvl,f__pc,f__revloc;
+#ifdef KR_headers
+#define Const /*nothing*/
+#else
+#define Const const
+#endif
+
+ static
+#ifdef KR_headers
+char *ap_end(s) char *s;
+#else
+const char *ap_end(const char *s)
+#endif
+{	char quote;
+	quote= *s++;
+	for(;*s;s++)
+	{	if(*s!=quote) continue;
+		if(*++s!=quote) return(s);
+	}
+	if(f__elist->cierr) {
+		errno = 100;
+		return(NULL);
+	}
+	f__fatal(100, "bad string");
+	/*NOTREACHED*/ return 0;
+}
+ static int
+#ifdef KR_headers
+op_gen(a,b,c,d)
+#else
+op_gen(int a, int b, int c, int d)
+#endif
+{	struct syl *p= &f__syl[f__pc];
+	if(f__pc>=SYLMX)
+	{	fprintf(stderr,"format too complicated:\n");
+		sig_die(f__fmtbuf, 1);
+	}
+	p->op=a;
+	p->p1=b;
+	p->p2.i[0]=c;
+	p->p2.i[1]=d;
+	return(f__pc++);
+}
+#ifdef KR_headers
+static char *f_list();
+static char *gt_num(s,n,n1) char *s; int *n, n1;
+#else
+static const char *f_list(const char*);
+static const char *gt_num(const char *s, int *n, int n1)
+#endif
+{	int m=0,f__cnt=0;
+	char c;
+	for(c= *s;;c = *s)
+	{	if(c==' ')
+		{	s++;
+			continue;
+		}
+		if(c>'9' || c<'0') break;
+		m=10*m+c-'0';
+		f__cnt++;
+		s++;
+	}
+	if(f__cnt==0) {
+		if (!n1)
+			s = 0;
+		*n=n1;
+		}
+	else *n=m;
+	return(s);
+}
+
+ static
+#ifdef KR_headers
+char *f_s(s,curloc) char *s;
+#else
+const char *f_s(const char *s, int curloc)
+#endif
+{
+	skip(s);
+	if(*s++!='(')
+	{
+		return(NULL);
+	}
+	if(f__parenlvl++ ==1) f__revloc=curloc;
+	if(op_gen(RET1,curloc,0,0)<0 ||
+		(s=f_list(s))==NULL)
+	{
+		return(NULL);
+	}
+	skip(s);
+	return(s);
+}
+
+ static int
+#ifdef KR_headers
+ne_d(s,p) char *s,**p;
+#else
+ne_d(const char *s, const char **p)
+#endif
+{	int n,x,sign=0;
+	struct syl *sp;
+	switch(*s)
+	{
+	default:
+		return(0);
+	case ':': (void) op_gen(COLON,0,0,0); break;
+	case '$':
+		(void) op_gen(NONL, 0, 0, 0); break;
+	case 'B':
+	case 'b':
+		if(*++s=='z' || *s == 'Z') (void) op_gen(BZ,0,0,0);
+		else (void) op_gen(BN,0,0,0);
+		break;
+	case 'S':
+	case 's':
+		if(*(s+1)=='s' || *(s+1) == 'S')
+		{	x=SS;
+			s++;
+		}
+		else if(*(s+1)=='p' || *(s+1) == 'P')
+		{	x=SP;
+			s++;
+		}
+		else x=S;
+		(void) op_gen(x,0,0,0);
+		break;
+	case '/': (void) op_gen(SLASH,0,0,0); break;
+	case '-': sign=1;
+	case '+':	s++;	/*OUTRAGEOUS CODING TRICK*/
+	case '0': case '1': case '2': case '3': case '4':
+	case '5': case '6': case '7': case '8': case '9':
+		if (!(s=gt_num(s,&n,0))) {
+ bad:			*p = 0;
+			return 1;
+			}
+		switch(*s)
+		{
+		default:
+			return(0);
+		case 'P':
+		case 'p': if(sign) n= -n; (void) op_gen(P,n,0,0); break;
+		case 'X':
+		case 'x': (void) op_gen(X,n,0,0); break;
+		case 'H':
+		case 'h':
+			sp = &f__syl[op_gen(H,n,0,0)];
+			sp->p2.s = (char*)s + 1;
+			s+=n;
+			break;
+		}
+		break;
+	case GLITCH:
+	case '"':
+	case '\'':
+		sp = &f__syl[op_gen(APOS,0,0,0)];
+		sp->p2.s = (char*)s;
+		if((*p = ap_end(s)) == NULL)
+			return(0);
+		return(1);
+	case 'T':
+	case 't':
+		if(*(s+1)=='l' || *(s+1) == 'L')
+		{	x=TL;
+			s++;
+		}
+		else if(*(s+1)=='r'|| *(s+1) == 'R')
+		{	x=TR;
+			s++;
+		}
+		else x=T;
+		if (!(s=gt_num(s+1,&n,0)))
+			goto bad;
+		s--;
+		(void) op_gen(x,n,0,0);
+		break;
+	case 'X':
+	case 'x': (void) op_gen(X,1,0,0); break;
+	case 'P':
+	case 'p': (void) op_gen(P,1,0,0); break;
+	}
+	s++;
+	*p=s;
+	return(1);
+}
+
+ static int
+#ifdef KR_headers
+e_d(s,p) char *s,**p;
+#else
+e_d(const char *s, const char **p)
+#endif
+{	int i,im,n,w,d,e,found=0,x=0;
+	Const char *sv=s;
+	s=gt_num(s,&n,1);
+	(void) op_gen(STACK,n,0,0);
+	switch(*s++)
+	{
+	default: break;
+	case 'E':
+	case 'e':	x=1;
+	case 'G':
+	case 'g':
+		found=1;
+		if (!(s=gt_num(s,&w,0))) {
+ bad:
+			*p = 0;
+			return 1;
+			}
+		if(w==0) break;
+		if(*s=='.') {
+			if (!(s=gt_num(s+1,&d,0)))
+				goto bad;
+			}
+		else d=0;
+		if(*s!='E' && *s != 'e')
+			(void) op_gen(x==1?E:G,w,d,0);	/* default is Ew.dE2 */
+		else {
+			if (!(s=gt_num(s+1,&e,0)))
+				goto bad;
+			(void) op_gen(x==1?EE:GE,w,d,e);
+			}
+		break;
+	case 'O':
+	case 'o':
+		i = O;
+		im = OM;
+		goto finish_I;
+	case 'Z':
+	case 'z':
+		i = Z;
+		im = ZM;
+		goto finish_I;
+	case 'L':
+	case 'l':
+		found=1;
+		if (!(s=gt_num(s,&w,0)))
+			goto bad;
+		if(w==0) break;
+		(void) op_gen(L,w,0,0);
+		break;
+	case 'A':
+	case 'a':
+		found=1;
+		skip(s);
+		if(*s>='0' && *s<='9')
+		{	s=gt_num(s,&w,1);
+			if(w==0) break;
+			(void) op_gen(AW,w,0,0);
+			break;
+		}
+		(void) op_gen(A,0,0,0);
+		break;
+	case 'F':
+	case 'f':
+		if (!(s=gt_num(s,&w,0)))
+			goto bad;
+		found=1;
+		if(w==0) break;
+		if(*s=='.') {
+			if (!(s=gt_num(s+1,&d,0)))
+				goto bad;
+			}
+		else d=0;
+		(void) op_gen(F,w,d,0);
+		break;
+	case 'D':
+	case 'd':
+		found=1;
+		if (!(s=gt_num(s,&w,0)))
+			goto bad;
+		if(w==0) break;
+		if(*s=='.') {
+			if (!(s=gt_num(s+1,&d,0)))
+				goto bad;
+			}
+		else d=0;
+		(void) op_gen(D,w,d,0);
+		break;
+	case 'I':
+	case 'i':
+		i = I;
+		im = IM;
+ finish_I:
+		if (!(s=gt_num(s,&w,0)))
+			goto bad;
+		found=1;
+		if(w==0) break;
+		if(*s!='.')
+		{	(void) op_gen(i,w,0,0);
+			break;
+		}
+		if (!(s=gt_num(s+1,&d,0)))
+			goto bad;
+		(void) op_gen(im,w,d,0);
+		break;
+	}
+	if(found==0)
+	{	f__pc--; /*unSTACK*/
+		*p=sv;
+		return(0);
+	}
+	*p=s;
+	return(1);
+}
+ static
+#ifdef KR_headers
+char *i_tem(s) char *s;
+#else
+const char *i_tem(const char *s)
+#endif
+{	const char *t;
+	int n,curloc;
+	if(*s==')') return(s);
+	if(ne_d(s,&t)) return(t);
+	if(e_d(s,&t)) return(t);
+	s=gt_num(s,&n,1);
+	if((curloc=op_gen(STACK,n,0,0))<0) return(NULL);
+	return(f_s(s,curloc));
+}
+
+ static
+#ifdef KR_headers
+char *f_list(s) char *s;
+#else
+const char *f_list(const char *s)
+#endif
+{
+	for(;*s!=0;)
+	{	skip(s);
+		if((s=i_tem(s))==NULL) return(NULL);
+		skip(s);
+		if(*s==',') s++;
+		else if(*s==')')
+		{	if(--f__parenlvl==0)
+			{
+				(void) op_gen(REVERT,f__revloc,0,0);
+				return(++s);
+			}
+			(void) op_gen(GOTO,0,0,0);
+			return(++s);
+		}
+	}
+	return(NULL);
+}
+
+ int
+#ifdef KR_headers
+pars_f(s) char *s;
+#else
+pars_f(const char *s)
+#endif
+{
+	f__parenlvl=f__revloc=f__pc=0;
+	if(f_s(s,0) == NULL)
+	{
+		return(-1);
+	}
+	return(0);
+}
+#define STKSZ 10
+int f__cnt[STKSZ],f__ret[STKSZ],f__cp,f__rp;
+flag f__workdone, f__nonl;
+
+ static int
+#ifdef KR_headers
+type_f(n)
+#else
+type_f(int n)
+#endif
+{
+	switch(n)
+	{
+	default:
+		return(n);
+	case RET1:
+		return(RET1);
+	case REVERT: return(REVERT);
+	case GOTO: return(GOTO);
+	case STACK: return(STACK);
+	case X:
+	case SLASH:
+	case APOS: case H:
+	case T: case TL: case TR:
+		return(NED);
+	case F:
+	case I:
+	case IM:
+	case A: case AW:
+	case O: case OM:
+	case L:
+	case E: case EE: case D:
+	case G: case GE:
+	case Z: case ZM:
+		return(ED);
+	}
+}
+#ifdef KR_headers
+integer do_fio(number,ptr,len) ftnint *number; ftnlen len; char *ptr;
+#else
+integer do_fio(ftnint *number, char *ptr, ftnlen len)
+#endif
+{	struct syl *p;
+	int n,i;
+	for(i=0;i<*number;i++,ptr+=len)
+	{
+loop:	switch(type_f((p= &f__syl[f__pc])->op))
+	{
+	default:
+		fprintf(stderr,"unknown code in do_fio: %d\n%s\n",
+			p->op,f__fmtbuf);
+		err(f__elist->cierr,100,"do_fio");
+	case NED:
+		if((*f__doned)(p))
+		{	f__pc++;
+			goto loop;
+		}
+		f__pc++;
+		continue;
+	case ED:
+		if(f__cnt[f__cp]<=0)
+		{	f__cp--;
+			f__pc++;
+			goto loop;
+		}
+		if(ptr==NULL)
+			return((*f__doend)());
+		f__cnt[f__cp]--;
+		f__workdone=1;
+		if((n=(*f__doed)(p,ptr,len))>0)
+			errfl(f__elist->cierr,errno,"fmt");
+		if(n<0)
+			err(f__elist->ciend,(EOF),"fmt");
+		continue;
+	case STACK:
+		f__cnt[++f__cp]=p->p1;
+		f__pc++;
+		goto loop;
+	case RET1:
+		f__ret[++f__rp]=p->p1;
+		f__pc++;
+		goto loop;
+	case GOTO:
+		if(--f__cnt[f__cp]<=0)
+		{	f__cp--;
+			f__rp--;
+			f__pc++;
+			goto loop;
+		}
+		f__pc=1+f__ret[f__rp--];
+		goto loop;
+	case REVERT:
+		f__rp=f__cp=0;
+		f__pc = p->p1;
+		if(ptr==NULL)
+			return((*f__doend)());
+		if(!f__workdone) return(0);
+		if((n=(*f__dorevert)()) != 0) return(n);
+		goto loop;
+	case COLON:
+		if(ptr==NULL)
+			return((*f__doend)());
+		f__pc++;
+		goto loop;
+	case NONL:
+		f__nonl = 1;
+		f__pc++;
+		goto loop;
+	case S:
+	case SS:
+		f__cplus=0;
+		f__pc++;
+		goto loop;
+	case SP:
+		f__cplus = 1;
+		f__pc++;
+		goto loop;
+	case P:	f__scale=p->p1;
+		f__pc++;
+		goto loop;
+	case BN:
+		f__cblank=0;
+		f__pc++;
+		goto loop;
+	case BZ:
+		f__cblank=1;
+		f__pc++;
+		goto loop;
+	}
+	}
+	return(0);
+}
+
+ int
+en_fio(Void)
+{	ftnint one=1;
+	return(do_fio(&one,(char *)NULL,(ftnint)0));
+}
+
+ VOID
+fmt_bg(Void)
+{
+	f__workdone=f__cp=f__rp=f__pc=f__cursor=0;
+	f__cnt[0]=f__ret[0]=0;
+}
+#ifdef __cplusplus
+}
+#endif

+ 105 - 0
min-dgels/additional/fmt.h

@@ -0,0 +1,105 @@
+struct syl
+{	int op;
+	int p1;
+	union { int i[2]; char *s;} p2;
+	};
+#define RET1 1
+#define REVERT 2
+#define GOTO 3
+#define X 4
+#define SLASH 5
+#define STACK 6
+#define I 7
+#define ED 8
+#define NED 9
+#define IM 10
+#define APOS 11
+#define H 12
+#define TL 13
+#define TR 14
+#define T 15
+#define COLON 16
+#define S 17
+#define SP 18
+#define SS 19
+#define P 20
+#define BN 21
+#define BZ 22
+#define F 23
+#define E 24
+#define EE 25
+#define D 26
+#define G 27
+#define GE 28
+#define L 29
+#define A 30
+#define AW 31
+#define O 32
+#define NONL 33
+#define OM 34
+#define Z 35
+#define ZM 36
+typedef union
+{	real pf;
+	doublereal pd;
+} ufloat;
+typedef union
+{	short is;
+#ifndef KR_headers
+	signed
+#endif
+		char ic;
+	integer il;
+#ifdef Allow_TYQUAD
+	longint ili;
+#endif
+} Uint;
+#ifdef KR_headers
+extern int (*f__doed)(),(*f__doned)();
+extern int (*f__dorevert)();
+extern int rd_ed(),rd_ned();
+extern int w_ed(),w_ned();
+extern int signbit_f2c();
+extern char *f__fmtbuf;
+#else
+#ifdef __cplusplus
+extern "C" {
+#define Cextern extern "C"
+#else
+#define Cextern extern
+#endif
+extern const char *f__fmtbuf;
+extern int (*f__doed)(struct syl*, char*, ftnlen),(*f__doned)(struct syl*);
+extern int (*f__dorevert)(void);
+extern void fmt_bg(void);
+extern int pars_f(const char*);
+extern int rd_ed(struct syl*, char*, ftnlen),rd_ned(struct syl*);
+extern int signbit_f2c(double*);
+extern int w_ed(struct syl*, char*, ftnlen),w_ned(struct syl*);
+extern int wrt_E(ufloat*, int, int, int, ftnlen);
+extern int wrt_F(ufloat*, int, int, ftnlen);
+extern int wrt_L(Uint*, int, ftnlen);
+#endif
+extern int f__pc,f__parenlvl,f__revloc;
+extern flag f__cblank,f__cplus,f__workdone, f__nonl;
+extern int f__scale;
+#ifdef __cplusplus
+	}
+#endif
+#define GET(x) if((x=(*f__getn)())<0) return(x)
+#define VAL(x) (x!='\n'?x:' ')
+#define PUT(x) (*f__putn)(x)
+
+#undef TYQUAD
+#ifndef Allow_TYQUAD
+#undef longint
+#define longint long
+#else
+#define TYQUAD 14
+#endif
+
+#ifdef KR_headers
+extern char *f__icvt();
+#else
+Cextern char *f__icvt(longint, int*, int*, int);
+#endif

+ 166 - 0
min-dgels/additional/ieeeck.c

@@ -0,0 +1,166 @@
+/* ieeeck.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+integer ieeeck_(integer *ispec, real *zero, real *one)
+{
+    /* System generated locals */
+    integer ret_val;
+
+    /* Local variables */
+    real nan1, nan2, nan3, nan4, nan5, nan6, neginf, posinf, negzro, newzro;
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  IEEECK is called from the ILAENV to verify that Infinity and */
+/*  possibly NaN arithmetic is safe (i.e. will not trap). */
+
+/*  Arguments */
+/*  ========= */
+
+/*  ISPEC   (input) INTEGER */
+/*          Specifies whether to test just for inifinity arithmetic */
+/*          or whether to test for infinity and NaN arithmetic. */
+/*          = 0: Verify infinity arithmetic only. */
+/*          = 1: Verify infinity and NaN arithmetic. */
+
+/*  ZERO    (input) REAL */
+/*          Must contain the value 0.0 */
+/*          This is passed to prevent the compiler from optimizing */
+/*          away this code. */
+
+/*  ONE     (input) REAL */
+/*          Must contain the value 1.0 */
+/*          This is passed to prevent the compiler from optimizing */
+/*          away this code. */
+
+/*  RETURN VALUE:  INTEGER */
+/*          = 0:  Arithmetic failed to produce the correct answers */
+/*          = 1:  Arithmetic produced the correct answers */
+
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Executable Statements .. */
+    ret_val = 1;
+
+    posinf = *one / *zero;
+    if (posinf <= *one) {
+	ret_val = 0;
+	return ret_val;
+    }
+
+    neginf = -(*one) / *zero;
+    if (neginf >= *zero) {
+	ret_val = 0;
+	return ret_val;
+    }
+
+    negzro = *one / (neginf + *one);
+    if (negzro != *zero) {
+	ret_val = 0;
+	return ret_val;
+    }
+
+    neginf = *one / negzro;
+    if (neginf >= *zero) {
+	ret_val = 0;
+	return ret_val;
+    }
+
+    newzro = negzro + *zero;
+    if (newzro != *zero) {
+	ret_val = 0;
+	return ret_val;
+    }
+
+    posinf = *one / newzro;
+    if (posinf <= *one) {
+	ret_val = 0;
+	return ret_val;
+    }
+
+    neginf *= posinf;
+    if (neginf >= *zero) {
+	ret_val = 0;
+	return ret_val;
+    }
+
+    posinf *= posinf;
+    if (posinf <= *one) {
+	ret_val = 0;
+	return ret_val;
+    }
+
+
+
+
+/*     Return if we were only asked to check infinity arithmetic */
+
+    if (*ispec == 0) {
+	return ret_val;
+    }
+
+    nan1 = posinf + neginf;
+
+    nan2 = posinf / neginf;
+
+    nan3 = posinf / posinf;
+
+    nan4 = posinf * *zero;
+
+    nan5 = neginf * negzro;
+
+    nan6 = nan5 * 0.f;
+
+    if (nan1 == nan1) {
+	ret_val = 0;
+	return ret_val;
+    }
+
+    if (nan2 == nan2) {
+	ret_val = 0;
+	return ret_val;
+    }
+
+    if (nan3 == nan3) {
+	ret_val = 0;
+	return ret_val;
+    }
+
+    if (nan4 == nan4) {
+	ret_val = 0;
+	return ret_val;
+    }
+
+    if (nan5 == nan5) {
+	ret_val = 0;
+	return ret_val;
+    }
+
+    if (nan6 == nan6) {
+	ret_val = 0;
+	return ret_val;
+    }
+
+    return ret_val;
+} /* ieeeck_ */

+ 88 - 0
min-dgels/additional/iladlc.c

@@ -0,0 +1,88 @@
+/* iladlc.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+integer iladlc_(integer *m, integer *n, doublereal *a, integer *lda)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, ret_val, i__1;
+
+    /* Local variables */
+    integer i__;
+
+
+/*  -- LAPACK auxiliary routine (version 3.2.1)                        -- */
+
+/*  -- April 2009                                                      -- */
+
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  ILADLC scans A for its last non-zero column. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix A. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix A. */
+
+/*  A       (input) DOUBLE PRECISION array, dimension (LDA,N) */
+/*          The m by n matrix A. */
+
+/*  LDA     (input) INTEGER */
+/*          The leading dimension of the array A. LDA >= max(1,M). */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Quick test for the common case where one corner is non-zero. */
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+
+    /* Function Body */
+    if (*n == 0) {
+	ret_val = *n;
+    } else if (a[*n * a_dim1 + 1] != 0. || a[*m + *n * a_dim1] != 0.) {
+	ret_val = *n;
+    } else {
+/*     Now scan each column from the end, returning with the first non-zero. */
+	for (ret_val = *n; ret_val >= 1; --ret_val) {
+	    i__1 = *m;
+	    for (i__ = 1; i__ <= i__1; ++i__) {
+		if (a[i__ + ret_val * a_dim1] != 0.) {
+		    return ret_val;
+		}
+	    }
+	}
+    }
+    return ret_val;
+} /* iladlc_ */

+ 90 - 0
min-dgels/additional/iladlr.c

@@ -0,0 +1,90 @@
+/* iladlr.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+integer iladlr_(integer *m, integer *n, doublereal *a, integer *lda)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, ret_val, i__1;
+
+    /* Local variables */
+    integer i__, j;
+
+
+/*  -- LAPACK auxiliary routine (version 3.2.1)                        -- */
+
+/*  -- April 2009                                                      -- */
+
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  ILADLR scans A for its last non-zero row. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  M       (input) INTEGER */
+/*          The number of rows of the matrix A. */
+
+/*  N       (input) INTEGER */
+/*          The number of columns of the matrix A. */
+
+/*  A       (input) DOUBLE PRECISION array, dimension (LDA,N) */
+/*          The m by n matrix A. */
+
+/*  LDA     (input) INTEGER */
+/*          The leading dimension of the array A. LDA >= max(1,M). */
+
+/*  ===================================================================== */
+
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Quick test for the common case where one corner is non-zero. */
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+
+    /* Function Body */
+    if (*m == 0) {
+	ret_val = *m;
+    } else if (a[*m + a_dim1] != 0. || a[*m + *n * a_dim1] != 0.) {
+	ret_val = *m;
+    } else {
+/*     Scan up each column tracking the last zero row seen. */
+	ret_val = 0;
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    for (i__ = *m; i__ >= 1; --i__) {
+		if (a[i__ + j * a_dim1] != 0.) {
+		    break;
+		}
+	    }
+	    ret_val = max(ret_val,i__);
+	}
+    }
+    return ret_val;
+} /* iladlr_ */

+ 654 - 0
min-dgels/additional/ilaenv.c

@@ -0,0 +1,654 @@
+/* ilaenv.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+#include "string.h"
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static real c_b163 = 0.f;
+static real c_b164 = 1.f;
+static integer c__0 = 0;
+
+integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1, 
+	integer *n2, integer *n3, integer *n4)
+{
+    /* System generated locals */
+    integer ret_val;
+
+    /* Builtin functions */
+    /* Subroutine */ int s_copy(char *, char *, ftnlen, ftnlen);
+    integer s_cmp(char *, char *, ftnlen, ftnlen);
+
+    /* Local variables */
+    integer i__;
+    char c1[1], c2[1], c3[1], c4[1];
+    integer ic, nb, iz, nx;
+    logical cname;
+    integer nbmin;
+    logical sname;
+    extern integer ieeeck_(integer *, real *, real *);
+    char subnam[1];
+    extern integer iparmq_(integer *, char *, char *, integer *, integer *, 
+	    integer *, integer *);
+
+    ftnlen name_len, opts_len;
+
+    name_len = strlen (name__);
+    opts_len = strlen (opts);
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     January 2007 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  ILAENV is called from the LAPACK routines to choose problem-dependent */
+/*  parameters for the local environment.  See ISPEC for a description of */
+/*  the parameters. */
+
+/*  ILAENV returns an INTEGER */
+/*  if ILAENV >= 0: ILAENV returns the value of the parameter specified by ISPEC */
+/*  if ILAENV < 0:  if ILAENV = -k, the k-th argument had an illegal value. */
+
+/*  This version provides a set of parameters which should give good, */
+/*  but not optimal, performance on many of the currently available */
+/*  computers.  Users are encouraged to modify this subroutine to set */
+/*  the tuning parameters for their particular machine using the option */
+/*  and problem size information in the arguments. */
+
+/*  This routine will not function correctly if it is converted to all */
+/*  lower case.  Converting it to all upper case is allowed. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  ISPEC   (input) INTEGER */
+/*          Specifies the parameter to be returned as the value of */
+/*          ILAENV. */
+/*          = 1: the optimal blocksize; if this value is 1, an unblocked */
+/*               algorithm will give the best performance. */
+/*          = 2: the minimum block size for which the block routine */
+/*               should be used; if the usable block size is less than */
+/*               this value, an unblocked routine should be used. */
+/*          = 3: the crossover point (in a block routine, for N less */
+/*               than this value, an unblocked routine should be used) */
+/*          = 4: the number of shifts, used in the nonsymmetric */
+/*               eigenvalue routines (DEPRECATED) */
+/*          = 5: the minimum column dimension for blocking to be used; */
+/*               rectangular blocks must have dimension at least k by m, */
+/*               where k is given by ILAENV(2,...) and m by ILAENV(5,...) */
+/*          = 6: the crossover point for the SVD (when reducing an m by n */
+/*               matrix to bidiagonal form, if max(m,n)/min(m,n) exceeds */
+/*               this value, a QR factorization is used first to reduce */
+/*               the matrix to a triangular form.) */
+/*          = 7: the number of processors */
+/*          = 8: the crossover point for the multishift QR method */
+/*               for nonsymmetric eigenvalue problems (DEPRECATED) */
+/*          = 9: maximum size of the subproblems at the bottom of the */
+/*               computation tree in the divide-and-conquer algorithm */
+/*               (used by xGELSD and xGESDD) */
+/*          =10: ieee NaN arithmetic can be trusted not to trap */
+/*          =11: infinity arithmetic can be trusted not to trap */
+/*          12 <= ISPEC <= 16: */
+/*               xHSEQR or one of its subroutines, */
+/*               see IPARMQ for detailed explanation */
+
+/*  NAME    (input) CHARACTER*(*) */
+/*          The name of the calling subroutine, in either upper case or */
+/*          lower case. */
+
+/*  OPTS    (input) CHARACTER*(*) */
+/*          The character options to the subroutine NAME, concatenated */
+/*          into a single character string.  For example, UPLO = 'U', */
+/*          TRANS = 'T', and DIAG = 'N' for a triangular routine would */
+/*          be specified as OPTS = 'UTN'. */
+
+/*  N1      (input) INTEGER */
+/*  N2      (input) INTEGER */
+/*  N3      (input) INTEGER */
+/*  N4      (input) INTEGER */
+/*          Problem dimensions for the subroutine NAME; these may not all */
+/*          be required. */
+
+/*  Further Details */
+/*  =============== */
+
+/*  The following conventions have been used when calling ILAENV from the */
+/*  LAPACK routines: */
+/*  1)  OPTS is a concatenation of all of the character options to */
+/*      subroutine NAME, in the same order that they appear in the */
+/*      argument list for NAME, even if they are not used in determining */
+/*      the value of the parameter specified by ISPEC. */
+/*  2)  The problem dimensions N1, N2, N3, N4 are specified in the order */
+/*      that they appear in the argument list for NAME.  N1 is used */
+/*      first, N2 second, and so on, and unused problem dimensions are */
+/*      passed a value of -1. */
+/*  3)  The parameter value returned by ILAENV is checked for validity in */
+/*      the calling subroutine.  For example, ILAENV is used to retrieve */
+/*      the optimal blocksize for STRTRI as follows: */
+
+/*      NB = ILAENV( 1, 'STRTRI', UPLO // DIAG, N, -1, -1, -1 ) */
+/*      IF( NB.LE.1 ) NB = MAX( 1, N ) */
+
+/*  ===================================================================== */
+
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. External Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+    switch (*ispec) {
+	case 1:  goto L10;
+	case 2:  goto L10;
+	case 3:  goto L10;
+	case 4:  goto L80;
+	case 5:  goto L90;
+	case 6:  goto L100;
+	case 7:  goto L110;
+	case 8:  goto L120;
+	case 9:  goto L130;
+	case 10:  goto L140;
+	case 11:  goto L150;
+	case 12:  goto L160;
+	case 13:  goto L160;
+	case 14:  goto L160;
+	case 15:  goto L160;
+	case 16:  goto L160;
+    }
+
+/*     Invalid value for ISPEC */
+
+    ret_val = -1;
+    return ret_val;
+
+L10:
+
+/*     Convert NAME to upper case if the first character is lower case. */
+
+    ret_val = 1;
+    s_copy(subnam, name__, (ftnlen)1, name_len);
+    ic = *(unsigned char *)subnam;
+    iz = 'Z';
+    if (iz == 90 || iz == 122) {
+
+/*        ASCII character set */
+
+	if (ic >= 97 && ic <= 122) {
+	    *(unsigned char *)subnam = (char) (ic - 32);
+	    for (i__ = 2; i__ <= 6; ++i__) {
+		ic = *(unsigned char *)&subnam[i__ - 1];
+		if (ic >= 97 && ic <= 122) {
+		    *(unsigned char *)&subnam[i__ - 1] = (char) (ic - 32);
+		}
+/* L20: */
+	    }
+	}
+
+    } else if (iz == 233 || iz == 169) {
+
+/*        EBCDIC character set */
+
+	if (ic >= 129 && ic <= 137 || ic >= 145 && ic <= 153 || ic >= 162 && 
+		ic <= 169) {
+	    *(unsigned char *)subnam = (char) (ic + 64);
+	    for (i__ = 2; i__ <= 6; ++i__) {
+		ic = *(unsigned char *)&subnam[i__ - 1];
+		if (ic >= 129 && ic <= 137 || ic >= 145 && ic <= 153 || ic >= 
+			162 && ic <= 169) {
+		    *(unsigned char *)&subnam[i__ - 1] = (char) (ic + 64);
+		}
+/* L30: */
+	    }
+	}
+
+    } else if (iz == 218 || iz == 250) {
+
+/*        Prime machines:  ASCII+128 */
+
+	if (ic >= 225 && ic <= 250) {
+	    *(unsigned char *)subnam = (char) (ic - 32);
+	    for (i__ = 2; i__ <= 6; ++i__) {
+		ic = *(unsigned char *)&subnam[i__ - 1];
+		if (ic >= 225 && ic <= 250) {
+		    *(unsigned char *)&subnam[i__ - 1] = (char) (ic - 32);
+		}
+/* L40: */
+	    }
+	}
+    }
+
+    *(unsigned char *)c1 = *(unsigned char *)subnam;
+    sname = *(unsigned char *)c1 == 'S' || *(unsigned char *)c1 == 'D';
+    cname = *(unsigned char *)c1 == 'C' || *(unsigned char *)c1 == 'Z';
+    if (! (cname || sname)) {
+	return ret_val;
+    }
+    s_copy(c2, subnam + 1, (ftnlen)1, (ftnlen)2);
+    s_copy(c3, subnam + 3, (ftnlen)1, (ftnlen)3);
+    s_copy(c4, c3 + 1, (ftnlen)1, (ftnlen)2);
+
+    switch (*ispec) {
+	case 1:  goto L50;
+	case 2:  goto L60;
+	case 3:  goto L70;
+    }
+
+L50:
+
+/*     ISPEC = 1:  block size */
+
+/*     In these examples, separate code is provided for setting NB for */
+/*     real and complex.  We assume that NB will take the same value in */
+/*     single or double precision. */
+
+    nb = 1;
+
+    if (s_cmp(c2, "GE", (ftnlen)1, (ftnlen)2) == 0) {
+	if (s_cmp(c3, "TRF", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		nb = 64;
+	    } else {
+		nb = 64;
+	    }
+	} else if (s_cmp(c3, "QRF", (ftnlen)1, (ftnlen)3) == 0 || s_cmp(c3, 
+		"RQF", (ftnlen)1, (ftnlen)3) == 0 || s_cmp(c3, "LQF", (ftnlen)
+		1, (ftnlen)3) == 0 || s_cmp(c3, "QLF", (ftnlen)1, (ftnlen)3) 
+		== 0) {
+	    if (sname) {
+		nb = 32;
+	    } else {
+		nb = 32;
+	    }
+	} else if (s_cmp(c3, "HRD", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		nb = 32;
+	    } else {
+		nb = 32;
+	    }
+	} else if (s_cmp(c3, "BRD", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		nb = 32;
+	    } else {
+		nb = 32;
+	    }
+	} else if (s_cmp(c3, "TRI", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		nb = 64;
+	    } else {
+		nb = 64;
+	    }
+	}
+    } else if (s_cmp(c2, "PO", (ftnlen)1, (ftnlen)2) == 0) {
+	if (s_cmp(c3, "TRF", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		nb = 64;
+	    } else {
+		nb = 64;
+	    }
+	}
+    } else if (s_cmp(c2, "SY", (ftnlen)1, (ftnlen)2) == 0) {
+	if (s_cmp(c3, "TRF", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		nb = 64;
+	    } else {
+		nb = 64;
+	    }
+	} else if (sname && s_cmp(c3, "TRD", (ftnlen)1, (ftnlen)3) == 0) {
+	    nb = 32;
+	} else if (sname && s_cmp(c3, "GST", (ftnlen)1, (ftnlen)3) == 0) {
+	    nb = 64;
+	}
+    } else if (cname && s_cmp(c2, "HE", (ftnlen)1, (ftnlen)2) == 0) {
+	if (s_cmp(c3, "TRF", (ftnlen)1, (ftnlen)3) == 0) {
+	    nb = 64;
+	} else if (s_cmp(c3, "TRD", (ftnlen)1, (ftnlen)3) == 0) {
+	    nb = 32;
+	} else if (s_cmp(c3, "GST", (ftnlen)1, (ftnlen)3) == 0) {
+	    nb = 64;
+	}
+    } else if (sname && s_cmp(c2, "OR", (ftnlen)1, (ftnlen)2) == 0) {
+	if (*(unsigned char *)c3 == 'G') {
+	    if (s_cmp(c4, "QR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "RQ", 
+		    (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)1, (
+		    ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)1, (ftnlen)2) ==
+		     0 || s_cmp(c4, "HR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(
+		    c4, "TR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+		    ftnlen)1, (ftnlen)2) == 0) {
+		nb = 32;
+	    }
+	} else if (*(unsigned char *)c3 == 'M') {
+	    if (s_cmp(c4, "QR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "RQ", 
+		    (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)1, (
+		    ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)1, (ftnlen)2) ==
+		     0 || s_cmp(c4, "HR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(
+		    c4, "TR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+		    ftnlen)1, (ftnlen)2) == 0) {
+		nb = 32;
+	    }
+	}
+    } else if (cname && s_cmp(c2, "UN", (ftnlen)1, (ftnlen)2) == 0) {
+	if (*(unsigned char *)c3 == 'G') {
+	    if (s_cmp(c4, "QR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "RQ", 
+		    (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)1, (
+		    ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)1, (ftnlen)2) ==
+		     0 || s_cmp(c4, "HR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(
+		    c4, "TR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+		    ftnlen)1, (ftnlen)2) == 0) {
+		nb = 32;
+	    }
+	} else if (*(unsigned char *)c3 == 'M') {
+	    if (s_cmp(c4, "QR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "RQ", 
+		    (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)1, (
+		    ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)1, (ftnlen)2) ==
+		     0 || s_cmp(c4, "HR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(
+		    c4, "TR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+		    ftnlen)1, (ftnlen)2) == 0) {
+		nb = 32;
+	    }
+	}
+    } else if (s_cmp(c2, "GB", (ftnlen)1, (ftnlen)2) == 0) {
+	if (s_cmp(c3, "TRF", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		if (*n4 <= 64) {
+		    nb = 1;
+		} else {
+		    nb = 32;
+		}
+	    } else {
+		if (*n4 <= 64) {
+		    nb = 1;
+		} else {
+		    nb = 32;
+		}
+	    }
+	}
+    } else if (s_cmp(c2, "PB", (ftnlen)1, (ftnlen)2) == 0) {
+	if (s_cmp(c3, "TRF", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		if (*n2 <= 64) {
+		    nb = 1;
+		} else {
+		    nb = 32;
+		}
+	    } else {
+		if (*n2 <= 64) {
+		    nb = 1;
+		} else {
+		    nb = 32;
+		}
+	    }
+	}
+    } else if (s_cmp(c2, "TR", (ftnlen)1, (ftnlen)2) == 0) {
+	if (s_cmp(c3, "TRI", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		nb = 64;
+	    } else {
+		nb = 64;
+	    }
+	}
+    } else if (s_cmp(c2, "LA", (ftnlen)1, (ftnlen)2) == 0) {
+	if (s_cmp(c3, "UUM", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		nb = 64;
+	    } else {
+		nb = 64;
+	    }
+	}
+    } else if (sname && s_cmp(c2, "ST", (ftnlen)1, (ftnlen)2) == 0) {
+	if (s_cmp(c3, "EBZ", (ftnlen)1, (ftnlen)3) == 0) {
+	    nb = 1;
+	}
+    }
+    ret_val = nb;
+    return ret_val;
+
+L60:
+
+/*     ISPEC = 2:  minimum block size */
+
+    nbmin = 2;
+    if (s_cmp(c2, "GE", (ftnlen)1, (ftnlen)2) == 0) {
+	if (s_cmp(c3, "QRF", (ftnlen)1, (ftnlen)3) == 0 || s_cmp(c3, "RQF", (
+		ftnlen)1, (ftnlen)3) == 0 || s_cmp(c3, "LQF", (ftnlen)1, (
+		ftnlen)3) == 0 || s_cmp(c3, "QLF", (ftnlen)1, (ftnlen)3) == 0)
+		 {
+	    if (sname) {
+		nbmin = 2;
+	    } else {
+		nbmin = 2;
+	    }
+	} else if (s_cmp(c3, "HRD", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		nbmin = 2;
+	    } else {
+		nbmin = 2;
+	    }
+	} else if (s_cmp(c3, "BRD", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		nbmin = 2;
+	    } else {
+		nbmin = 2;
+	    }
+	} else if (s_cmp(c3, "TRI", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		nbmin = 2;
+	    } else {
+		nbmin = 2;
+	    }
+	}
+    } else if (s_cmp(c2, "SY", (ftnlen)1, (ftnlen)2) == 0) {
+	if (s_cmp(c3, "TRF", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		nbmin = 8;
+	    } else {
+		nbmin = 8;
+	    }
+	} else if (sname && s_cmp(c3, "TRD", (ftnlen)1, (ftnlen)3) == 0) {
+	    nbmin = 2;
+	}
+    } else if (cname && s_cmp(c2, "HE", (ftnlen)1, (ftnlen)2) == 0) {
+	if (s_cmp(c3, "TRD", (ftnlen)1, (ftnlen)3) == 0) {
+	    nbmin = 2;
+	}
+    } else if (sname && s_cmp(c2, "OR", (ftnlen)1, (ftnlen)2) == 0) {
+	if (*(unsigned char *)c3 == 'G') {
+	    if (s_cmp(c4, "QR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "RQ", 
+		    (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)1, (
+		    ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)1, (ftnlen)2) ==
+		     0 || s_cmp(c4, "HR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(
+		    c4, "TR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+		    ftnlen)1, (ftnlen)2) == 0) {
+		nbmin = 2;
+	    }
+	} else if (*(unsigned char *)c3 == 'M') {
+	    if (s_cmp(c4, "QR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "RQ", 
+		    (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)1, (
+		    ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)1, (ftnlen)2) ==
+		     0 || s_cmp(c4, "HR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(
+		    c4, "TR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+		    ftnlen)1, (ftnlen)2) == 0) {
+		nbmin = 2;
+	    }
+	}
+    } else if (cname && s_cmp(c2, "UN", (ftnlen)1, (ftnlen)2) == 0) {
+	if (*(unsigned char *)c3 == 'G') {
+	    if (s_cmp(c4, "QR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "RQ", 
+		    (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)1, (
+		    ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)1, (ftnlen)2) ==
+		     0 || s_cmp(c4, "HR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(
+		    c4, "TR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+		    ftnlen)1, (ftnlen)2) == 0) {
+		nbmin = 2;
+	    }
+	} else if (*(unsigned char *)c3 == 'M') {
+	    if (s_cmp(c4, "QR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "RQ", 
+		    (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)1, (
+		    ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)1, (ftnlen)2) ==
+		     0 || s_cmp(c4, "HR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(
+		    c4, "TR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+		    ftnlen)1, (ftnlen)2) == 0) {
+		nbmin = 2;
+	    }
+	}
+    }
+    ret_val = nbmin;
+    return ret_val;
+
+L70:
+
+/*     ISPEC = 3:  crossover point */
+
+    nx = 0;
+    if (s_cmp(c2, "GE", (ftnlen)1, (ftnlen)2) == 0) {
+	if (s_cmp(c3, "QRF", (ftnlen)1, (ftnlen)3) == 0 || s_cmp(c3, "RQF", (
+		ftnlen)1, (ftnlen)3) == 0 || s_cmp(c3, "LQF", (ftnlen)1, (
+		ftnlen)3) == 0 || s_cmp(c3, "QLF", (ftnlen)1, (ftnlen)3) == 0)
+		 {
+	    if (sname) {
+		nx = 128;
+	    } else {
+		nx = 128;
+	    }
+	} else if (s_cmp(c3, "HRD", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		nx = 128;
+	    } else {
+		nx = 128;
+	    }
+	} else if (s_cmp(c3, "BRD", (ftnlen)1, (ftnlen)3) == 0) {
+	    if (sname) {
+		nx = 128;
+	    } else {
+		nx = 128;
+	    }
+	}
+    } else if (s_cmp(c2, "SY", (ftnlen)1, (ftnlen)2) == 0) {
+	if (sname && s_cmp(c3, "TRD", (ftnlen)1, (ftnlen)3) == 0) {
+	    nx = 32;
+	}
+    } else if (cname && s_cmp(c2, "HE", (ftnlen)1, (ftnlen)2) == 0) {
+	if (s_cmp(c3, "TRD", (ftnlen)1, (ftnlen)3) == 0) {
+	    nx = 32;
+	}
+    } else if (sname && s_cmp(c2, "OR", (ftnlen)1, (ftnlen)2) == 0) {
+	if (*(unsigned char *)c3 == 'G') {
+	    if (s_cmp(c4, "QR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "RQ", 
+		    (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)1, (
+		    ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)1, (ftnlen)2) ==
+		     0 || s_cmp(c4, "HR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(
+		    c4, "TR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+		    ftnlen)1, (ftnlen)2) == 0) {
+		nx = 128;
+	    }
+	}
+    } else if (cname && s_cmp(c2, "UN", (ftnlen)1, (ftnlen)2) == 0) {
+	if (*(unsigned char *)c3 == 'G') {
+	    if (s_cmp(c4, "QR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "RQ", 
+		    (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "LQ", (ftnlen)1, (
+		    ftnlen)2) == 0 || s_cmp(c4, "QL", (ftnlen)1, (ftnlen)2) ==
+		     0 || s_cmp(c4, "HR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(
+		    c4, "TR", (ftnlen)1, (ftnlen)2) == 0 || s_cmp(c4, "BR", (
+		    ftnlen)1, (ftnlen)2) == 0) {
+		nx = 128;
+	    }
+	}
+    }
+    ret_val = nx;
+    return ret_val;
+
+L80:
+
+/*     ISPEC = 4:  number of shifts (used by xHSEQR) */
+
+    ret_val = 6;
+    return ret_val;
+
+L90:
+
+/*     ISPEC = 5:  minimum column dimension (not used) */
+
+    ret_val = 2;
+    return ret_val;
+
+L100:
+
+/*     ISPEC = 6:  crossover point for SVD (used by xGELSS and xGESVD) */
+
+    ret_val = (integer) ((real) min(*n1,*n2) * 1.6f);
+    return ret_val;
+
+L110:
+
+/*     ISPEC = 7:  number of processors (not used) */
+
+    ret_val = 1;
+    return ret_val;
+
+L120:
+
+/*     ISPEC = 8:  crossover point for multishift (used by xHSEQR) */
+
+    ret_val = 50;
+    return ret_val;
+
+L130:
+
+/*     ISPEC = 9:  maximum size of the subproblems at the bottom of the */
+/*                 computation tree in the divide-and-conquer algorithm */
+/*                 (used by xGELSD and xGESDD) */
+
+    ret_val = 25;
+    return ret_val;
+
+L140:
+
+/*     ISPEC = 10: ieee NaN arithmetic can be trusted not to trap */
+
+/*     ILAENV = 0 */
+    ret_val = 1;
+    if (ret_val == 1) {
+	ret_val = ieeeck_(&c__1, &c_b163, &c_b164);
+    }
+    return ret_val;
+
+L150:
+
+/*     ISPEC = 11: infinity arithmetic can be trusted not to trap */
+
+/*     ILAENV = 0 */
+    ret_val = 1;
+    if (ret_val == 1) {
+	ret_val = ieeeck_(&c__0, &c_b163, &c_b164);
+    }
+    return ret_val;
+
+L160:
+
+/*     12 <= ISPEC <= 16: xHSEQR or one of its subroutines. */
+
+    ret_val = iparmq_(ispec, name__, opts, n1, n2, n3, n4)
+	    ;
+    return ret_val;
+
+/*     End of ILAENV */
+
+} /* ilaenv_ */

+ 282 - 0
min-dgels/additional/iparmq.c

@@ -0,0 +1,282 @@
+/* iparmq.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+integer iparmq_(integer *ispec, char *name__, char *opts, integer *n, integer 
+	*ilo, integer *ihi, integer *lwork)
+{
+    /* System generated locals */
+    integer ret_val, i__1, i__2;
+    real r__1;
+
+    /* Builtin functions */
+    double log(doublereal);
+    integer i_nint(real *);
+
+    /* Local variables */
+    integer nh, ns;
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*       This program sets problem and machine dependent parameters */
+/*       useful for xHSEQR and its subroutines. It is called whenever */
+/*       ILAENV is called with 12 <= ISPEC <= 16 */
+
+/*  Arguments */
+/*  ========= */
+
+/*       ISPEC  (input) integer scalar */
+/*              ISPEC specifies which tunable parameter IPARMQ should */
+/*              return. */
+
+/*              ISPEC=12: (INMIN)  Matrices of order nmin or less */
+/*                        are sent directly to xLAHQR, the implicit */
+/*                        double shift QR algorithm.  NMIN must be */
+/*                        at least 11. */
+
+/*              ISPEC=13: (INWIN)  Size of the deflation window. */
+/*                        This is best set greater than or equal to */
+/*                        the number of simultaneous shifts NS. */
+/*                        Larger matrices benefit from larger deflation */
+/*                        windows. */
+
+/*              ISPEC=14: (INIBL) Determines when to stop nibbling and */
+/*                        invest in an (expensive) multi-shift QR sweep. */
+/*                        If the aggressive early deflation subroutine */
+/*                        finds LD converged eigenvalues from an order */
+/*                        NW deflation window and LD.GT.(NW*NIBBLE)/100, */
+/*                        then the next QR sweep is skipped and early */
+/*                        deflation is applied immediately to the */
+/*                        remaining active diagonal block.  Setting */
+/*                        IPARMQ(ISPEC=14) = 0 causes TTQRE to skip a */
+/*                        multi-shift QR sweep whenever early deflation */
+/*                        finds a converged eigenvalue.  Setting */
+/*                        IPARMQ(ISPEC=14) greater than or equal to 100 */
+/*                        prevents TTQRE from skipping a multi-shift */
+/*                        QR sweep. */
+
+/*              ISPEC=15: (NSHFTS) The number of simultaneous shifts in */
+/*                        a multi-shift QR iteration. */
+
+/*              ISPEC=16: (IACC22) IPARMQ is set to 0, 1 or 2 with the */
+/*                        following meanings. */
+/*                        0:  During the multi-shift QR sweep, */
+/*                            xLAQR5 does not accumulate reflections and */
+/*                            does not use matrix-matrix multiply to */
+/*                            update the far-from-diagonal matrix */
+/*                            entries. */
+/*                        1:  During the multi-shift QR sweep, */
+/*                            xLAQR5 and/or xLAQRaccumulates reflections and uses */
+/*                            matrix-matrix multiply to update the */
+/*                            far-from-diagonal matrix entries. */
+/*                        2:  During the multi-shift QR sweep. */
+/*                            xLAQR5 accumulates reflections and takes */
+/*                            advantage of 2-by-2 block structure during */
+/*                            matrix-matrix multiplies. */
+/*                        (If xTRMM is slower than xGEMM, then */
+/*                        IPARMQ(ISPEC=16)=1 may be more efficient than */
+/*                        IPARMQ(ISPEC=16)=2 despite the greater level of */
+/*                        arithmetic work implied by the latter choice.) */
+
+/*       NAME    (input) character string */
+/*               Name of the calling subroutine */
+
+/*       OPTS    (input) character string */
+/*               This is a concatenation of the string arguments to */
+/*               TTQRE. */
+
+/*       N       (input) integer scalar */
+/*               N is the order of the Hessenberg matrix H. */
+
+/*       ILO     (input) INTEGER */
+/*       IHI     (input) INTEGER */
+/*               It is assumed that H is already upper triangular */
+/*               in rows and columns 1:ILO-1 and IHI+1:N. */
+
+/*       LWORK   (input) integer scalar */
+/*               The amount of workspace available. */
+
+/*  Further Details */
+/*  =============== */
+
+/*       Little is known about how best to choose these parameters. */
+/*       It is possible to use different values of the parameters */
+/*       for each of CHSEQR, DHSEQR, SHSEQR and ZHSEQR. */
+
+/*       It is probably best to choose different parameters for */
+/*       different matrices and different parameters at different */
+/*       times during the iteration, but this has not been */
+/*       implemented --- yet. */
+
+
+/*       The best choices of most of the parameters depend */
+/*       in an ill-understood way on the relative execution */
+/*       rate of xLAQR3 and xLAQR5 and on the nature of each */
+/*       particular eigenvalue problem.  Experiment may be the */
+/*       only practical way to determine which choices are most */
+/*       effective. */
+
+/*       Following is a list of default values supplied by IPARMQ. */
+/*       These defaults may be adjusted in order to attain better */
+/*       performance in any particular computational environment. */
+
+/*       IPARMQ(ISPEC=12) The xLAHQR vs xLAQR0 crossover point. */
+/*                        Default: 75. (Must be at least 11.) */
+
+/*       IPARMQ(ISPEC=13) Recommended deflation window size. */
+/*                        This depends on ILO, IHI and NS, the */
+/*                        number of simultaneous shifts returned */
+/*                        by IPARMQ(ISPEC=15).  The default for */
+/*                        (IHI-ILO+1).LE.500 is NS.  The default */
+/*                        for (IHI-ILO+1).GT.500 is 3*NS/2. */
+
+/*       IPARMQ(ISPEC=14) Nibble crossover point.  Default: 14. */
+
+/*       IPARMQ(ISPEC=15) Number of simultaneous shifts, NS. */
+/*                        a multi-shift QR iteration. */
+
+/*                        If IHI-ILO+1 is ... */
+
+/*                        greater than      ...but less    ... the */
+/*                        or equal to ...      than        default is */
+
+/*                                0               30       NS =   2+ */
+/*                               30               60       NS =   4+ */
+/*                               60              150       NS =  10 */
+/*                              150              590       NS =  ** */
+/*                              590             3000       NS =  64 */
+/*                             3000             6000       NS = 128 */
+/*                             6000             infinity   NS = 256 */
+
+/*                    (+)  By default matrices of this order are */
+/*                         passed to the implicit double shift routine */
+/*                         xLAHQR.  See IPARMQ(ISPEC=12) above.   These */
+/*                         values of NS are used only in case of a rare */
+/*                         xLAHQR failure. */
+
+/*                    (**) The asterisks (**) indicate an ad-hoc */
+/*                         function increasing from 10 to 64. */
+
+/*       IPARMQ(ISPEC=16) Select structured matrix multiply. */
+/*                        (See ISPEC=16 above for details.) */
+/*                        Default: 3. */
+
+/*     ================================================================ */
+/*     .. Parameters .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+    if (*ispec == 15 || *ispec == 13 || *ispec == 16) {
+
+/*        ==== Set the number simultaneous shifts ==== */
+
+	nh = *ihi - *ilo + 1;
+	ns = 2;
+	if (nh >= 30) {
+	    ns = 4;
+	}
+	if (nh >= 60) {
+	    ns = 10;
+	}
+	if (nh >= 150) {
+/* Computing MAX */
+	    r__1 = log((real) nh) / log(2.f);
+	    i__1 = 10, i__2 = nh / i_nint(&r__1);
+	    ns = max(i__1,i__2);
+	}
+	if (nh >= 590) {
+	    ns = 64;
+	}
+	if (nh >= 3000) {
+	    ns = 128;
+	}
+	if (nh >= 6000) {
+	    ns = 256;
+	}
+/* Computing MAX */
+	i__1 = 2, i__2 = ns - ns % 2;
+	ns = max(i__1,i__2);
+    }
+
+    if (*ispec == 12) {
+
+
+/*        ===== Matrices of order smaller than NMIN get sent */
+/*        .     to xLAHQR, the classic double shift algorithm. */
+/*        .     This must be at least 11. ==== */
+
+	ret_val = 75;
+
+    } else if (*ispec == 14) {
+
+/*        ==== INIBL: skip a multi-shift qr iteration and */
+/*        .    whenever aggressive early deflation finds */
+/*        .    at least (NIBBLE*(window size)/100) deflations. ==== */
+
+	ret_val = 14;
+
+    } else if (*ispec == 15) {
+
+/*        ==== NSHFTS: The number of simultaneous shifts ===== */
+
+	ret_val = ns;
+
+    } else if (*ispec == 13) {
+
+/*        ==== NW: deflation window size.  ==== */
+
+	if (nh <= 500) {
+	    ret_val = ns;
+	} else {
+	    ret_val = ns * 3 / 2;
+	}
+
+    } else if (*ispec == 16) {
+
+/*        ==== IACC22: Whether to accumulate reflections */
+/*        .     before updating the far-from-diagonal elements */
+/*        .     and whether to use 2-by-2 block structure while */
+/*        .     doing it.  A small amount of work could be saved */
+/*        .     by making this choice dependent also upon the */
+/*        .     NH=IHI-ILO+1. */
+
+	ret_val = 0;
+	if (ns >= 14) {
+	    ret_val = 1;
+	}
+	if (ns >= 14) {
+	    ret_val = 2;
+	}
+
+    } else {
+/*        ===== invalid value of ispec ===== */
+	ret_val = -1;
+
+    }
+
+/*     ==== End of IPARMQ ==== */
+
+    return ret_val;
+} /* iparmq_ */

+ 117 - 0
min-dgels/additional/lsame.c

@@ -0,0 +1,117 @@
+/* lsame.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+logical lsame_(char *ca, char *cb)
+{
+    /* System generated locals */
+    logical ret_val;
+
+    /* Local variables */
+    integer inta, intb, zcode;
+
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  LSAME returns .TRUE. if CA is the same letter as CB regardless of */
+/*  case. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  CA      (input) CHARACTER*1 */
+/*  CB      (input) CHARACTER*1 */
+/*          CA and CB specify the single characters to be compared. */
+
+/* ===================================================================== */
+
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+/*     Test if the characters are equal */
+
+    ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
+    if (ret_val) {
+	return ret_val;
+    }
+
+/*     Now test for equivalence if both characters are alphabetic. */
+
+    zcode = 'Z';
+
+/*     Use 'Z' rather than 'A' so that ASCII can be detected on Prime */
+/*     machines, on which ICHAR returns a value with bit 8 set. */
+/*     ICHAR('A') on Prime machines returns 193 which is the same as */
+/*     ICHAR('A') on an EBCDIC machine. */
+
+    inta = *(unsigned char *)ca;
+    intb = *(unsigned char *)cb;
+
+    if (zcode == 90 || zcode == 122) {
+
+/*        ASCII is assumed - ZCODE is the ASCII code of either lower or */
+/*        upper case 'Z'. */
+
+	if (inta >= 97 && inta <= 122) {
+	    inta += -32;
+	}
+	if (intb >= 97 && intb <= 122) {
+	    intb += -32;
+	}
+
+    } else if (zcode == 233 || zcode == 169) {
+
+/*        EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */
+/*        upper case 'Z'. */
+
+	if (inta >= 129 && inta <= 137 || inta >= 145 && inta <= 153 || inta 
+		>= 162 && inta <= 169) {
+	    inta += 64;
+	}
+	if (intb >= 129 && intb <= 137 || intb >= 145 && intb <= 153 || intb 
+		>= 162 && intb <= 169) {
+	    intb += 64;
+	}
+
+    } else if (zcode == 218 || zcode == 250) {
+
+/*        ASCII is assumed, on Prime machines - ZCODE is the ASCII code */
+/*        plus 128 of either lower or upper case 'Z'. */
+
+	if (inta >= 225 && inta <= 250) {
+	    inta += -32;
+	}
+	if (intb >= 225 && intb <= 250) {
+	    intb += -32;
+	}
+    }
+    ret_val = inta == intb;
+
+/*     RETURN */
+
+/*     End of LSAME */
+
+    return ret_val;
+} /* lsame_ */

+ 8 - 0
min-dgels/additional/mindgels.h

@@ -0,0 +1,8 @@
+#ifndef DGELS_H 
+#define DGELS_H
+
+#include "f2c.h"
+
+int dgels_(char *trans, integer *m, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *work, integer *lwork, integer *info);
+
+#endif

+ 41 - 0
min-dgels/additional/pow_di.c

@@ -0,0 +1,41 @@
+#include "f2c.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef KR_headers
+double pow_di(ap, bp) doublereal *ap; integer *bp;
+#else
+double pow_di(doublereal *ap, integer *bp)
+#endif
+{
+double pow, x;
+integer n;
+unsigned long u;
+
+pow = 1;
+x = *ap;
+n = *bp;
+
+if(n != 0)
+	{
+	if(n < 0)
+		{
+		n = -n;
+		x = 1/x;
+		}
+	for(u = n; ; )
+		{
+		if(u & 01)
+			pow *= x;
+		if(u >>= 1)
+			x *= x;
+		else
+			break;
+		}
+	}
+return(pow);
+}
+#ifdef __cplusplus
+}
+#endif

+ 86 - 0
min-dgels/additional/s_cat.c

@@ -0,0 +1,86 @@
+/* Unless compiled with -DNO_OVERWRITE, this variant of s_cat allows the
+ * target of a concatenation to appear on its right-hand side (contrary
+ * to the Fortran 77 Standard, but in accordance with Fortran 90).
+ */
+
+#include "f2c.h"
+#ifndef NO_OVERWRITE
+#include "stdio.h"
+#undef abs
+#ifdef KR_headers
+ extern char *F77_aloc();
+ extern void free();
+ extern void exit_();
+#else
+#undef min
+#undef max
+#include "stdlib.h"
+extern
+#ifdef __cplusplus
+	"C"
+#endif
+	char *F77_aloc(ftnlen, const char*);
+#endif
+#include "string.h"
+#endif /* NO_OVERWRITE */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ VOID
+#ifdef KR_headers
+s_cat(lp, rpp, rnp, np, ll) char *lp, *rpp[]; ftnint rnp[], *np; ftnlen ll;
+#else
+s_cat(char *lp, char *rpp[], ftnint rnp[], ftnint *np, ftnlen ll)
+#endif
+{
+	ftnlen i, nc;
+	char *rp;
+	ftnlen n = *np;
+#ifndef NO_OVERWRITE
+	ftnlen L, m;
+	char *lp0, *lp1;
+
+	lp0 = 0;
+	lp1 = lp;
+	L = ll;
+	i = 0;
+	while(i < n) {
+		rp = rpp[i];
+		m = rnp[i++];
+		if (rp >= lp1 || rp + m <= lp) {
+			if ((L -= m) <= 0) {
+				n = i;
+				break;
+				}
+			lp1 += m;
+			continue;
+			}
+		lp0 = lp;
+		lp = lp1 = F77_aloc(L = ll, "s_cat");
+		break;
+		}
+	lp1 = lp;
+#endif /* NO_OVERWRITE */
+	for(i = 0 ; i < n ; ++i) {
+		nc = ll;
+		if(rnp[i] < nc)
+			nc = rnp[i];
+		ll -= nc;
+		rp = rpp[i];
+		while(--nc >= 0)
+			*lp++ = *rp++;
+		}
+	while(--ll >= 0)
+		*lp++ = ' ';
+#ifndef NO_OVERWRITE
+	if (lp0) {
+		memcpy(lp0, lp1, L);
+		free(lp1);
+		}
+#endif
+	}
+#ifdef __cplusplus
+}
+#endif

+ 66 - 0
min-dgels/additional/sysdep1.h

@@ -0,0 +1,66 @@
+#ifndef SYSDEP_H_INCLUDED
+#define SYSDEP_H_INCLUDED
+#undef USE_LARGEFILE
+#ifndef NO_LONG_LONG
+
+#ifdef __sun__
+#define USE_LARGEFILE
+#define OFF_T off64_t
+#endif
+
+#ifdef __linux__
+#define USE_LARGEFILE
+#define OFF_T __off64_t
+#endif
+
+#ifdef _AIX43
+#define _LARGE_FILES
+#define _LARGE_FILE_API
+#define USE_LARGEFILE
+#endif /*_AIX43*/
+
+#ifdef __hpux
+#define _FILE64
+#define _LARGEFILE64_SOURCE
+#define USE_LARGEFILE
+#endif /*__hpux*/
+
+#ifdef __sgi
+#define USE_LARGEFILE
+#endif /*__sgi*/
+
+#ifdef __FreeBSD__
+#define OFF_T off_t
+#define FSEEK fseeko
+#define FTELL ftello
+#endif
+
+#ifdef USE_LARGEFILE
+#ifndef OFF_T
+#define OFF_T off64_t
+#endif
+#define _LARGEFILE_SOURCE
+#define _LARGEFILE64_SOURCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#define FOPEN fopen64
+#define FREOPEN freopen64
+#define FSEEK fseeko64
+#define FSTAT fstat64
+#define FTELL ftello64
+#define FTRUNCATE ftruncate64
+#define STAT stat64
+#define STAT_ST stat64
+#endif /*USE_LARGEFILE*/
+#endif /*NO_LONG_LONG*/
+
+#ifndef NON_UNIX_STDIO
+#ifndef USE_LARGEFILE
+#define _INCLUDE_POSIX_SOURCE	/* for HP-UX */
+#define _INCLUDE_XOPEN_SOURCE	/* for HP-UX */
+#include "sys/types.h"
+#include "sys/stat.h"
+#endif
+#endif
+
+#endif /*SYSDEP_H_INCLUDED*/

+ 78 - 0
min-dgels/additional/wsfe.c

@@ -0,0 +1,78 @@
+/*write sequential formatted external*/
+#include "f2c.h"
+#include "fio.h"
+#include "fmt.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ int
+x_wSL(Void)
+{
+	int n = f__putbuf('\n');
+	f__hiwater = f__recpos = f__cursor = 0;
+	return(n == 0);
+}
+
+ static int
+xw_end(Void)
+{
+	int n;
+
+	if(f__nonl) {
+		f__putbuf(n = 0);
+		fflush(f__cf);
+		}
+	else
+		n = f__putbuf('\n');
+	f__hiwater = f__recpos = f__cursor = 0;
+	return n;
+}
+
+ static int
+xw_rev(Void)
+{
+	int n = 0;
+	if(f__workdone) {
+		n = f__putbuf('\n');
+		f__workdone = 0;
+		}
+	f__hiwater = f__recpos = f__cursor = 0;
+	return n;
+}
+
+#ifdef KR_headers
+integer s_wsfe(a) cilist *a;	/*start*/
+#else
+integer s_wsfe(cilist *a)	/*start*/
+#endif
+{	int n;
+	if(!f__init) f_init();
+	f__reading=0;
+	f__sequential=1;
+	f__formatted=1;
+	f__external=1;
+	if(n=c_sfe(a)) return(n);
+	f__elist=a;
+	f__hiwater = f__cursor=f__recpos=0;
+	f__nonl = 0;
+	f__scale=0;
+	f__fmtbuf=a->cifmt;
+	f__cf=f__curunit->ufd;
+	if(pars_f(f__fmtbuf)<0) err(a->cierr,100,"startio");
+	f__putn= x_putc;
+	f__doed= w_ed;
+	f__doned= w_ned;
+	f__doend=xw_end;
+	f__dorevert=xw_rev;
+	f__donewrec=x_wSL;
+	fmt_bg();
+	f__cplus=0;
+	f__cblank=f__curunit->ublnk;
+	if(f__curunit->uwrt != 1 && f__nowwriting(f__curunit))
+		err(a->cierr,errno,"write start");
+	return(0);
+}
+#ifdef __cplusplus
+}
+#endif

+ 65 - 0
min-dgels/additional/xerbla.c

@@ -0,0 +1,65 @@
+/* xerbla.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+#include "stdio.h"
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+
+/* Subroutine */ int xerbla_(char *srname, integer *info)
+{
+    
+
+/*  -- LAPACK auxiliary routine (version 3.2) -- */
+/*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
+/*     November 2006 */
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*  XERBLA  is an error handler for the LAPACK routines. */
+/*  It is called by an LAPACK routine if an input parameter has an */
+/*  invalid value.  A message is printed and execution stops. */
+
+/*  Installers may consider modifying the STOP statement in order to */
+/*  call system-specific exception-handling facilities. */
+
+/*  Arguments */
+/*  ========= */
+
+/*  SRNAME  (input) CHARACTER*(*) */
+/*          The name of the routine which called XERBLA. */
+
+/*  INFO    (input) INTEGER */
+/*          The position of the invalid parameter in the parameter list */
+/*          of the calling routine. */
+
+/* ===================================================================== */
+
+/*     .. Intrinsic Functions .. */
+/*     .. */
+/*     .. Executable Statements .. */
+
+    printf("** On entry to %s, parameter number %ld had an illegal value\n",
+		srname, *info);
+
+
+/*     End of XERBLA */
+
+    return 0;
+} /* xerbla_ */

+ 115 - 0
min-dgels/base/BLAS/SRC/Makefile

@@ -0,0 +1,115 @@
+TOPDIR=../..
+include $(TOPDIR)/make.inc
+
+#######################################################################
+#  This is the makefile to create a library for the BLAS.
+#  The files are grouped as follows:
+#
+#       SBLAS1 -- Single precision real BLAS routines
+#       CBLAS1 -- Single precision complex BLAS routines
+#       DBLAS1 -- Double precision real BLAS routines
+#       ZBLAS1 -- Double precision complex BLAS routines
+#
+#       CB1AUX -- Real BLAS routines called by complex routines
+#       ZB1AUX -- D.P. real BLAS routines called by d.p. complex
+#                 routines
+#
+#      ALLBLAS -- Auxiliary routines for Level 2 and 3 BLAS
+#
+#       SBLAS2 -- Single precision real BLAS2 routines
+#       CBLAS2 -- Single precision complex BLAS2 routines
+#       DBLAS2 -- Double precision real BLAS2 routines
+#       ZBLAS2 -- Double precision complex BLAS2 routines
+#
+#       SBLAS3 -- Single precision real BLAS3 routines
+#       CBLAS3 -- Single precision complex BLAS3 routines
+#       DBLAS3 -- Double precision real BLAS3 routines
+#       ZBLAS3 -- Double precision complex BLAS3 routines
+#
+#  The library can be set up to include routines for any combination
+#  of the four precisions.  To create or add to the library, enter make
+#  followed by one or more of the precisions desired.  Some examples:
+#       make single
+#       make single complex
+#       make single double complex complex16
+#  Note that these commands are not safe for parallel builds.
+#
+#  Alternatively, the commands
+#       make all
+#  or
+#       make
+#  without any arguments creates a library of all four precisions.
+#  The name of the library is held in BLASLIB, which is set in the
+#  top-level make.inc
+#
+#  To remove the object files after the library is created, enter
+#       make clean
+#  To force the source files to be recompiled, enter, for example,
+#       make single FRC=FRC
+#
+#---------------------------------------------------------------------
+#
+#  Edward Anderson, University of Tennessee
+#  March 26, 1990
+#  Susan Ostrouchov, Last updated September 30, 1994
+#  ejr, May 2006.
+#
+#######################################################################
+
+all: $(BLASLIB)
+ 
+#---------------------------------------------------------
+#  Comment out the next 6 definitions if you already have
+#  the Level 1 BLAS.
+#---------------------------------------------------------
+
+DBLAS1 = idamax.o dasum.o daxpy.o dcopy.o ddot.o dnrm2.o \
+	drot.o drotg.o dscal.o dsdot.o dswap.o drotmg.o drotm.o
+$(DBLAS1): $(FRC)
+
+#---------------------------------------------------------------------
+#  The following line defines auxiliary routines needed by both the
+#  Level 2 and Level 3 BLAS.  Comment it out only if you already have
+#  both the Level 2 and 3 BLAS.
+#---------------------------------------------------------------------
+
+ALLBLAS  = lsame.o xerbla.o xerbla_array.o
+$(ALLBLAS) : $(FRC)
+
+#---------------------------------------------------------
+#  Comment out the next 4 definitions if you already have
+#  the Level 2 BLAS.
+#---------------------------------------------------------
+
+DBLAS2 = dgemv.o dgbmv.o dsymv.o dsbmv.o dspmv.o \
+	dtrmv.o dtbmv.o dtpmv.o dtrsv.o dtbsv.o dtpsv.o \
+	dger.o dsyr.o dspr.o dsyr2.o dspr2.o
+$(DBLAS2): $(FRC)
+
+#---------------------------------------------------------
+#  Comment out the next 4 definitions if you already have
+#  the Level 3 BLAS.
+#---------------------------------------------------------
+
+DBLAS3 = dgemm.o dsymm.o dsyrk.o dsyr2k.o dtrmm.o dtrsm.o
+$(DBLAS3): $(FRC)
+
+ALLOBJ= $(DBLAS1) $(DBLAS2) $(DBLAS3) $(ALLBLAS)
+
+$(BLASLIB): $(ALLOBJ)
+	$(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ)
+	$(RANLIB) $@
+
+double: $(DBLAS1) $(ALLBLAS) $(DBLAS2) $(DBLAS3)
+	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $(DBLAS1) $(ALLBLAS) \
+	$(DBLAS2) $(DBLAS3)
+	$(RANLIB) $(BLASLIB)
+
+FRC:
+	@FRC=$(FRC)
+
+clean:
+	rm -f *.o
+
+.c.o: 
+	$(CC) $(CFLAGS) -c $< -o $@

+ 101 - 0
min-dgels/base/BLAS/SRC/dasum.c

@@ -0,0 +1,101 @@
+/* dasum.f -- translated by f2c (version 20061008).
+   You must link the resulting object file with libf2c:
+	on Microsoft Windows system, link with libf2c.lib;
+	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+	or, if you install libf2c.a in a standard place, with -lf2c -lm
+	-- in that order, at the end of the command line, as in
+		cc *.o -lf2c -lm
+	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+		http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+#include "blaswrap.h"
+
+doublereal dasum_(integer *n, doublereal *dx, integer *incx)
+{
+    /* System generated locals */
+    integer i__1, i__2;
+    doublereal ret_val, d__1, d__2, d__3, d__4, d__5, d__6;
+
+    /* Local variables */
+    integer i__, m, mp1;
+    doublereal dtemp;
+    integer nincx;
+
+/*     .. Scalar Arguments .. */
+/*     .. */
+/*     .. Array Arguments .. */
+/*     .. */
+
+/*  Purpose */
+/*  ======= */
+
+/*     takes the sum of the absolute values. */
+/*     jack dongarra, linpack, 3/11/78. */
+/*     modified 3/93 to return if incx .le. 0. */
+/*     modified 12/3/93, array(1) declarations changed to array(*) */
+
+
+/*     .. Local Scalars .. */
+/*     .. */
+/*     .. Intrinsic Functions .. */
+/*     .. */
+    /* Parameter adjustments */
+    --dx;
+
+    /* Function Body */
+    ret_val = 0.;
+    dtemp = 0.;
+    if (*n <= 0 || *incx <= 0) {
+	return ret_val;
+    }
+    if (*incx == 1) {
+	goto L20;
+    }
+
+/*        code for increment not equal to 1 */
+
+    nincx = *n * *incx;
+    i__1 = nincx;
+    i__2 = *incx;
+    for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
+	dtemp += (d__1 = dx[i__], abs(d__1));
+/* L10: */
+    }
+    ret_val = dtemp;
+    return ret_val;
+
+/*        code for increment equal to 1 */
+
+
+/*        clean-up loop */
+
+L20:
+    m = *n % 6;
+    if (m == 0) {
+	goto L40;
+    }
+    i__2 = m;
+    for (i__ = 1; i__ <= i__2; ++i__) {
+	dtemp += (d__1 = dx[i__], abs(d__1));
+/* L30: */
+    }
+    if (*n < 6) {
+	goto L60;
+    }
+L40:
+    mp1 = m + 1;
+    i__2 = *n;
+    for (i__ = mp1; i__ <= i__2; i__ += 6) {
+	dtemp = dtemp + (d__1 = dx[i__], abs(d__1)) + (d__2 = dx[i__ + 1], 
+		abs(d__2)) + (d__3 = dx[i__ + 2], abs(d__3)) + (d__4 = dx[i__ 
+		+ 3], abs(d__4)) + (d__5 = dx[i__ + 4], abs(d__5)) + (d__6 = 
+		dx[i__ + 5], abs(d__6));
+/* L50: */
+    }
+L60:
+    ret_val = dtemp;
+    return ret_val;
+} /* dasum_ */

+ 0 - 0
min-dgels/base/BLAS/SRC/daxpy.c


Некоторые файлы не были показаны из-за большого количества измененных файлов