Просмотр исходного кода

julia: Add support for BLAS and axpy example.

Pierre Huchant лет назад: 5
Родитель
Сommit
0df0551acb

+ 4 - 0
julia/examples/Makefile.am

@@ -100,3 +100,7 @@ SHELL_TESTS			+=	callback/callback.sh
 SHELL_TESTS			+=	dependency/tag_dep.sh
 SHELL_TESTS			+=	dependency/task_dep.sh
 SHELL_TESTS			+=	dependency/end_dep.sh
+
+if !NO_BLAS_LIB
+SHELL_TESTS			+= axpy/axpy.sh
+endif

+ 89 - 0
julia/examples/axpy/axpy.jl

@@ -0,0 +1,89 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+using StarPU
+
+const EPSILON = 1e-6
+
+function check(alpha, X, Y)
+    for i in 1:length(X)
+        expected_value = alpha * X[i] + 4.0
+        if abs(Y[i] - expected_value) > expected_value * EPSILON
+            error("at ", i, ", ", alpha, "*", X[i], "+4.0=", Y[i], ", expected ", expected_value)
+        end
+    end
+end
+
+function main()
+    N = 16 * 1024 * 1024
+    NBLOCKS = 8
+    alpha = 3.41
+
+    starpu_init()
+    starpu_cublas_init()
+
+    X = Array(fill(1.0f0, N))
+    Y = Array(fill(4.0f0, N))
+
+    starpu_memory_pin(X)
+    starpu_memory_pin(Y)
+
+    println("BEFORE x[0] = ", X[1])
+    println("BEFORE y[0] = ", Y[1])
+
+    block_filter = starpu_data_filter(STARPU_VECTOR_FILTER_BLOCK, NBLOCKS)
+
+    perfmodel = starpu_perfmodel(
+        perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
+        symbol = "history_perf"
+    )
+
+    cl = starpu_codelet(
+        cpu_func = STARPU_SAXPY,
+        cuda_func = STARPU_SAXPY,
+        modes = [STARPU_R, STARPU_RW],
+        perfmodel = perfmodel
+    )
+
+    @starpu_block let
+        hX,hY = starpu_data_register(X, Y)
+
+        starpu_data_partition(hX, block_filter)
+        starpu_data_partition(hY, block_filter)
+
+        t_start = time_ns()
+
+        for b in 1:NBLOCKS
+            task = starpu_task(cl = cl, handles = [hX[b],hY[b]], cl_arg=(Float32(alpha),),
+                               tag=starpu_tag_t(b))
+            starpu_task_submit(task)
+        end
+        starpu_task_wait_for_all()
+
+        t_end = time_ns()
+        timing = (t_end - t_start) / 1000
+
+        println("timing -> ", timing, " us ", 3*N*4/timing, "MB/s")
+
+    end
+
+    println("AFTER y[0] = ", Y[1], " (ALPHA=", alpha, ")")
+
+    check(alpha, X, Y)
+
+    starpu_shutdown()
+end
+
+main()

+ 19 - 0
julia/examples/axpy/axpy.sh

@@ -0,0 +1,19 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+$(dirname $0)/../execute.sh axpy/axpy.jl
+

+ 3 - 1
julia/src/Makefile.am

@@ -33,4 +33,6 @@ libstarpujulia_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined
   -version-info $(LIBSTARPUJULIA_INTERFACE_CURRENT):$(LIBSTARPUJULIA_INTERFACE_REVISION):$(LIBSTARPUJULIA_INTERFACE_AGE)
 
 libstarpujulia_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
-	callback_wrapper.c
+	callback_wrapper.c \
+	blas_wrapper.c \
+	blas.c

+ 7 - 0
julia/src/StarPU.jl

@@ -39,6 +39,7 @@ include("linked_list.jl")
 include("destructible.jl")
 include("perfmodel.jl")
 include("data.jl")
+include("blas.jl")
 include("task.jl")
 include("task_dep.jl")
 include("init.jl")
@@ -52,10 +53,12 @@ export @starpu_sync_tasks
 # enum / define
 export STARPU_CPU
 export STARPU_CUDA
+export STARPU_CUDA_ASYNC
 export STARPU_OPENCL
 export STARPU_MAIN_RAM
 export StarpuDataFilterFunc
 export STARPU_MATRIX_FILTER_VERTICAL_BLOCK, STARPU_MATRIX_FILTER_BLOCK
+export STARPU_VECTOR_FILTER_BLOCK
 export STARPU_PERFMODEL_INVALID, STARPU_PER_ARCH, STARPU_COMMON
 export STARPU_HISTORY_BASED, STARPU_REGRESSION_BASED
 export STARPU_NL_REGRESSION_BASED, STARPU_MULTIPLE_REGRESSION_BASED
@@ -64,7 +67,11 @@ export STARPU_NONE,STARPU_R,STARPU_W,STARPU_RW, STARPU_SCRATCH
 export STARPU_REDUX,STARPU_COMMUTE, STARPU_SSEND, STARPU_LOCALITY
 export STARPU_ACCESS_MODE_MAX
 
+# BLAS
+export STARPU_SAXPY
+
 # functions
+export starpu_cublas_init
 export starpu_init
 export starpu_shutdown
 export starpu_memory_pin

+ 516 - 0
julia/src/blas.c

@@ -0,0 +1,516 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <ctype.h>
+#include <stdio.h>
+
+#include <starpu.h>
+#include "blas.h"
+
+/*
+    This files contains BLAS wrappers for the different BLAS implementations
+  (eg. REFBLAS, ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
+  libraries do not supply C-based ordering.
+ */
+
+#ifdef STARPU_ATLAS
+
+inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
+			float alpha, const float *A, int lda, const float *B, int ldb, 
+			float beta, float *C, int ldc)
+{
+	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
+	enum CBLAS_TRANSPOSE tb = (toupper(transb[0]) == 'N')?CblasNoTrans:CblasTrans;
+
+	cblas_sgemm(CblasColMajor, ta, tb,
+			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
+}
+
+inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
+			double alpha, double *A, int lda, double *B, int ldb, 
+			double beta, double *C, int ldc)
+{
+	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
+	enum CBLAS_TRANSPOSE tb = (toupper(transb[0]) == 'N')?CblasNoTrans:CblasTrans;
+
+	cblas_dgemm(CblasColMajor, ta, tb,
+			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
+}
+
+inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda, float *X, int incX, float beta, float *Y, int incY)
+{
+	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
+
+	cblas_sgemv(CblasColMajor, ta, M, N, alpha, A, lda,
+					X, incX, beta, Y, incY);
+}
+
+inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda, double *X, int incX, double beta, double *Y, int incY)
+{
+	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
+
+	cblas_dgemv(CblasColMajor, ta, M, N, alpha, A, lda,
+					X, incX, beta, Y, incY);
+}
+
+inline float STARPU_SASUM(int N, float *X, int incX)
+{
+	return cblas_sasum(N, X, incX);
+}
+
+inline double STARPU_DASUM(int N, double *X, int incX)
+{
+	return cblas_dasum(N, X, incX);
+}
+
+void STARPU_SSCAL(int N, float alpha, float *X, int incX)
+{
+	cblas_sscal(N, alpha, X, incX);
+}
+
+void STARPU_DSCAL(int N, double alpha, double *X, int incX)
+{
+	cblas_dscal(N, alpha, X, incX);
+}
+
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const float alpha, const float *A, const int lda,
+                   float *B, const int ldb)
+{
+	enum CBLAS_SIDE side_ = (toupper(side[0]) == 'L')?CblasLeft:CblasRight;
+	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
+	enum CBLAS_TRANSPOSE transa_ = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
+	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
+
+	cblas_strsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
+}
+
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const double alpha, const double *A, const int lda,
+                   double *B, const int ldb)
+{
+	enum CBLAS_SIDE side_ = (toupper(side[0]) == 'L')?CblasLeft:CblasRight;
+	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
+	enum CBLAS_TRANSPOSE transa_ = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
+	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
+
+	cblas_dtrsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
+}
+
+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
+                  const float *x, const int incx, float *A, const int lda)
+{
+	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
+
+	cblas_ssyr(CblasColMajor, uplo_, n, alpha, x, incx, A, lda); 
+}
+
+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
+                   const int k, const float alpha, const float *A,
+                   const int lda, const float beta, float *C,
+                   const int ldc)
+{
+	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
+	enum CBLAS_TRANSPOSE trans_ = (toupper(trans[0]) == 'N')?CblasNoTrans:CblasTrans;
+	
+	cblas_ssyrk(CblasColMajor, uplo_, trans_, n, k, alpha, A, lda, beta, C, ldc); 
+}
+
+void STARPU_SGER(const int m, const int n, const float alpha,
+                  const float *x, const int incx, const float *y,
+                  const int incy, float *A, const int lda)
+{
+	cblas_sger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void STARPU_DGER(const int m, const int n, const double alpha,
+                  const double *x, const int incx, const double *y,
+                  const int incy, double *A, const int lda)
+{
+	cblas_dger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
+                   const int n, const float *A, const int lda, float *x, 
+                   const int incx)
+{
+	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
+	enum CBLAS_TRANSPOSE trans_ = (toupper(trans[0]) == 'N')?CblasNoTrans:CblasTrans;
+	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
+
+	cblas_strsv(CblasColMajor, uplo_, trans_, diag_, n, A, lda, x, incx);
+}
+
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const float alpha, const float *A, const int lda,
+                 float *B, const int ldb)
+{
+	enum CBLAS_SIDE side_ = (toupper(side[0]) == 'L')?CblasLeft:CblasRight;
+	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
+	enum CBLAS_TRANSPOSE transA_ = (toupper(transA[0]) == 'N')?CblasNoTrans:CblasTrans;
+	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
+
+	cblas_strmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
+}
+
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const double alpha, const double *A, const int lda,
+                 double *B, const int ldb)
+{
+	enum CBLAS_SIDE side_ = (toupper(side[0]) == 'L')?CblasLeft:CblasRight;
+	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
+	enum CBLAS_TRANSPOSE transA_ = (toupper(transA[0]) == 'N')?CblasNoTrans:CblasTrans;
+	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
+
+	cblas_dtrmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
+}
+
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
+                 const int n, const float *A, const int lda, float *X,
+                 const int incX)
+{
+	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
+	enum CBLAS_TRANSPOSE transA_ = (toupper(transA[0]) == 'N')?CblasNoTrans:CblasTrans;
+	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
+
+	cblas_strmv(CblasColMajor, uplo_, transA_, diag_, n, A, lda, X, incX);
+}
+
+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
+{
+	cblas_saxpy(n, alpha, X, incX, Y, incY);
+}
+
+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
+{
+	cblas_daxpy(n, alpha, X, incX, Y, incY);
+}
+
+int STARPU_ISAMAX (const int n, float *X, const int incX)
+{
+    int retVal;
+    retVal = cblas_isamax(n, X, incX);
+    return retVal;
+}
+
+int STARPU_IDAMAX (const int n, double *X, const int incX)
+{
+    int retVal;
+    retVal = cblas_idamax(n, X, incX);
+    return retVal;
+}
+
+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
+{
+	return cblas_sdot(n, x, incx, y, incy);
+}
+
+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
+{
+	return cblas_ddot(n, x, incx, y, incy);
+}
+
+void STARPU_SSWAP(const int n, float *x, const int incx, float *y, const int incy)
+{
+	cblas_sswap(n, x, incx, y, incy);
+}
+
+void STARPU_DSWAP(const int n, double *x, const int incx, double *y, const int incy)
+{
+	cblas_dswap(n, x, incx, y, incy);
+}
+
+#elif defined(STARPU_GOTO) || defined(STARPU_OPENBLAS) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL) || defined(STARPU_ARMPL)
+
+inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
+			float alpha, const float *A, int lda, const float *B, int ldb, 
+			float beta, float *C, int ldc)
+{
+	sgemm_(transa, transb, &M, &N, &K, &alpha,
+			 A, &lda, B, &ldb,
+			 &beta, C, &ldc);	
+}
+
+inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
+			double alpha, double *A, int lda, double *B, int ldb, 
+			double beta, double *C, int ldc)
+{
+	dgemm_(transa, transb, &M, &N, &K, &alpha,
+			 A, &lda, B, &ldb,
+			 &beta, C, &ldc);	
+}
+
+
+inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
+		float *X, int incX, float beta, float *Y, int incY)
+{
+	sgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+}
+
+inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
+		double *X, int incX, double beta, double *Y, int incY)
+{
+	dgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+}
+
+inline float STARPU_SASUM(int N, float *X, int incX)
+{
+	return sasum_(&N, X, &incX);
+}
+
+inline double STARPU_DASUM(int N, double *X, int incX)
+{
+	return dasum_(&N, X, &incX);
+}
+
+void STARPU_SSCAL(int N, float alpha, float *X, int incX)
+{
+	sscal_(&N, &alpha, X, &incX);
+}
+
+void STARPU_DSCAL(int N, double alpha, double *X, int incX)
+{
+	dscal_(&N, &alpha, X, &incX);
+}
+
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const float alpha, const float *A, const int lda,
+                   float *B, const int ldb)
+{
+	strsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const double alpha, const double *A, const int lda,
+                   double *B, const int ldb)
+{
+	dtrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
+                  const float *x, const int incx, float *A, const int lda)
+{
+	ssyr_(uplo, &n, &alpha, x, &incx, A, &lda); 
+}
+
+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
+                   const int k, const float alpha, const float *A,
+                   const int lda, const float beta, float *C,
+                   const int ldc)
+{
+	ssyrk_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
+}
+
+void STARPU_SGER(const int m, const int n, const float alpha,
+                  const float *x, const int incx, const float *y,
+                  const int incy, float *A, const int lda)
+{
+	sger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+}
+
+void STARPU_DGER(const int m, const int n, const double alpha,
+                  const double *x, const int incx, const double *y,
+                  const int incy, double *A, const int lda)
+{
+	dger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+}
+
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
+                   const int n, const float *A, const int lda, float *x, 
+                   const int incx)
+{
+	strsv_(uplo, trans, diag, &n, A, &lda, x, &incx);
+}
+
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const float alpha, const float *A, const int lda,
+                 float *B, const int ldb)
+{
+	strmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const double alpha, const double *A, const int lda,
+                 double *B, const int ldb)
+{
+	dtrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
+                 const int n, const float *A, const int lda, float *X,
+                 const int incX)
+{
+	strmv_(uplo, transA, diag, &n, A, &lda, X, &incX);
+}
+
+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
+{
+	saxpy_(&n, &alpha, X, &incX, Y, &incY);
+}
+
+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
+{
+	daxpy_(&n, &alpha, X, &incX, Y, &incY);
+}
+
+int STARPU_ISAMAX (const int n, float *X, const int incX)
+{
+    int retVal;
+    retVal = isamax_ (&n, X, &incX);
+    return retVal;
+}
+
+int STARPU_IDAMAX (const int n, double *X, const int incX)
+{
+    int retVal;
+    retVal = idamax_ (&n, X, &incX);
+    return retVal;
+}
+
+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
+{
+	float retVal = 0;
+
+	/* GOTOBLAS will return a FLOATRET which is a double, not a float */
+	retVal = (float)sdot_(&n, x, &incx, y, &incy);
+
+	return retVal;
+}
+
+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy)
+{
+	return ddot_(&n, x, &incx, y, &incy);
+}
+
+void STARPU_SSWAP(const int n, float *X, const int incX, float *Y, const int incY)
+{
+	sswap_(&n, X, &incX, Y, &incY);
+}
+
+void STARPU_DSWAP(const int n, double *X, const int incX, double *Y, const int incY)
+{
+	dswap_(&n, X, &incX, Y, &incY);
+}
+
+#if defined(STARPU_MKL) || defined(STARPU_ARMPL)
+void STARPU_SPOTRF(const char*uplo, const int n, float *a, const int lda)
+{
+	int info = 0;
+	spotrf_(uplo, &n, a, &lda, &info);
+}
+
+void STARPU_DPOTRF(const char*uplo, const int n, double *a, const int lda)
+{
+	int info = 0;
+	dpotrf_(uplo, &n, a, &lda, &info);
+}
+#endif
+
+#elif defined(STARPU_SIMGRID)
+inline void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, 
+			float alpha, const float *A, int lda, const float *B, int ldb, 
+			float beta, float *C, int ldc) { }
+
+inline void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
+			double alpha, double *A, int lda, double *B, int ldb, 
+			double beta, double *C, int ldc) { }
+
+inline void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
+		float *X, int incX, float beta, float *Y, int incY) { }
+
+inline void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
+		double *X, int incX, double beta, double *Y, int incY) { }
+
+inline float STARPU_SASUM(int N, float *X, int incX) { return 0.; }
+
+inline double STARPU_DASUM(int N, double *X, int incX) { return 0.; }
+
+void STARPU_SSCAL(int N, float alpha, float *X, int incX) { }
+
+void STARPU_DSCAL(int N, double alpha, double *X, int incX) { }
+
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const float alpha, const float *A, const int lda,
+                   float *B, const int ldb) { }
+
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const double alpha, const double *A, const int lda,
+                   double *B, const int ldb) { }
+
+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
+                  const float *x, const int incx, float *A, const int lda) { }
+
+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
+                   const int k, const float alpha, const float *A,
+                   const int lda, const float beta, float *C,
+                   const int ldc) { }
+
+void STARPU_SGER(const int m, const int n, const float alpha,
+                  const float *x, const int incx, const float *y,
+                  const int incy, float *A, const int lda) { }
+
+void STARPU_DGER(const int m, const int n, const double alpha,
+                  const double *x, const int incx, const double *y,
+                  const int incy, double *A, const int lda) { }
+
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
+                   const int n, const float *A, const int lda, float *x, 
+                   const int incx) { }
+
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const float alpha, const float *A, const int lda,
+                 float *B, const int ldb) { }
+
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const double alpha, const double *A, const int lda,
+                 double *B, const int ldb) { }
+
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
+                 const int n, const float *A, const int lda, float *X,
+                 const int incX) { }
+
+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY) { }
+
+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY) { }
+
+int STARPU_ISAMAX (const int n, float *X, const int incX) { return 0; }
+
+int STARPU_IDAMAX (const int n, double *X, const int incX) { return 0; }
+
+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy) { return 0.; }
+
+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy) { return 0.; }
+
+void STARPU_SSWAP(const int n, float *X, const int incX, float *Y, const int incY) { }
+
+void STARPU_DSWAP(const int n, double *X, const int incX, double *Y, const int incY) { }
+
+void STARPU_SPOTRF(const char*uplo, const int n, float *a, const int lda) { }
+
+void STARPU_DPOTRF(const char*uplo, const int n, double *a, const int lda) { }
+#endif

+ 166 - 0
julia/src/blas.h

@@ -0,0 +1,166 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BLAS_H__
+#define __BLAS_H__
+
+#include <starpu.h>
+
+#if defined(STARPU_ATLAS) || defined(STARPU_HAVE_CBLAS_H)
+#include <cblas.h>
+#endif
+
+void STARPU_SGEMM(char *transa, char *transb, int M, int N, int K, float alpha, const float *A, int lda, 
+		const float *B, int ldb, float beta, float *C, int ldc);
+void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, double alpha, double *A, int lda, 
+		double *B, int ldb, double beta, double *C, int ldc);
+void STARPU_SGEMV(char *transa, int M, int N, float alpha, float *A, int lda,
+		float *X, int incX, float beta, float *Y, int incY);
+void STARPU_DGEMV(char *transa, int M, int N, double alpha, double *A, int lda,
+		double *X, int incX, double beta, double *Y, int incY);
+float STARPU_SASUM(int N, float *X, int incX);
+double STARPU_DASUM(int N, double *X, int incX);
+void STARPU_SSCAL(int N, float alpha, float *X, int incX);
+void STARPU_DSCAL(int N, double alpha, double *X, int incX);
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const float alpha, const float *A, const int lda,
+                   float *B, const int ldb);
+void STARPU_DTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const double alpha, const double *A, const int lda,
+                   double *B, const int ldb);
+void STARPU_DGEMM(char *transa, char *transb, int M, int N, int K, 
+			double alpha, double *A, int lda, double *B, int ldb, 
+			double beta, double *C, int ldc);
+void STARPU_SSYR (const char *uplo, const int n, const float alpha,
+                  const float *x, const int incx, float *A, const int lda);
+void STARPU_SSYRK (const char *uplo, const char *trans, const int n,
+                   const int k, const float alpha, const float *A,
+                   const int lda, const float beta, float *C,
+                   const int ldc);
+void STARPU_SGER (const int m, const int n, const float alpha,
+                  const float *x, const int incx, const float *y,
+                  const int incy, float *A, const int lda);
+void STARPU_DGER(const int m, const int n, const double alpha,
+                  const double *x, const int incx, const double *y,
+                  const int incy, double *A, const int lda);
+void STARPU_STRSV (const char *uplo, const char *trans, const char *diag, 
+                   const int n, const float *A, const int lda, float *x, 
+                   const int incx);
+void STARPU_STRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const float alpha, const float *A, const int lda,
+                 float *B, const int ldb);
+void STARPU_DTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const double alpha, const double *A, const int lda,
+                 double *B, const int ldb);
+void STARPU_STRMV(const char *uplo, const char *transA, const char *diag,
+                 const int n, const float *A, const int lda, float *X,
+                 const int incX);
+void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incy);
+void STARPU_DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY);
+int STARPU_ISAMAX (const int n, float *X, const int incX);
+int STARPU_IDAMAX (const int n, double *X, const int incX);
+float STARPU_SDOT(const int n, const float *x, const int incx, const float *y, const int incy);
+double STARPU_DDOT(const int n, const double *x, const int incx, const double *y, const int incy);
+void STARPU_SSWAP(const int n, float *x, const int incx, float *y, const int incy);
+void STARPU_DSWAP(const int n, double *x, const int incx, double *y, const int incy);
+
+#if defined(STARPU_MKL) || defined(STARPU_ARMPL)
+void STARPU_SPOTRF(const char*uplo, const int n, float *a, const int lda);
+void STARPU_DPOTRF(const char*uplo, const int n, double *a, const int lda);
+#endif
+
+#if defined(STARPU_GOTO) || defined(STARPU_OPENBLAS) || defined(STARPU_SYSTEM_BLAS) || defined(STARPU_MKL) || defined(STARPU_ARMPL)
+
+extern void sgemm_ (const char *transa, const char *transb, const int *m,
+                   const int *n, const int *k, const float *alpha, 
+                   const float *A, const int *lda, const float *B, 
+                   const int *ldb, const float *beta, float *C, 
+                   const int *ldc);
+extern void dgemm_ (const char *transa, const char *transb, const int *m,
+                   const int *n, const int *k, const double *alpha, 
+                   const double *A, const int *lda, const double *B, 
+                   const int *ldb, const double *beta, double *C, 
+                   const int *ldc);
+extern void sgemv_(const char *trans, const int *m, const int *n, const float *alpha,
+                   const float *a, const int *lda, const float *x, const int *incx, 
+                   const float *beta, float *y, const int *incy);
+extern void dgemv_(const char *trans, const int *m, const int *n, const double *alpha,
+                   const double *a, const int *lda, const double *x, const int *incx,
+                   const double *beta, double *y, const int *incy);
+extern void ssyr_ (const char *uplo, const int *n, const float *alpha,
+                  const float *x, const int *incx, float *A, const int *lda);
+extern void ssyrk_ (const char *uplo, const char *trans, const int *n,
+                   const int *k, const float *alpha, const float *A,
+                   const int *lda, const float *beta, float *C,
+                   const int *ldc);
+extern void strsm_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const int *m, const int *n,
+                   const float *alpha, const float *A, const int *lda,
+                   float *B, const int *ldb);
+extern void dtrsm_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const int *m, const int *n,
+                   const double *alpha, const double *A, const int *lda,
+                   double *B, const int *ldb);
+extern double sasum_ (const int *n, const float *x, const int *incx);
+extern double dasum_ (const int *n, const double *x, const int *incx);
+extern void sscal_ (const int *n, const float *alpha, float *x,
+                   const int *incx);
+extern void dscal_ (const int *n, const double *alpha, double *x,
+                   const int *incx);
+extern void sger_(const int *m, const int *n, const float *alpha,
+                  const float *x, const int *incx, const float *y,
+                  const int *incy, float *A, const int *lda);
+extern void dger_(const int *m, const int *n, const double *alpha,
+                  const double *x, const int *incx, const double *y,
+                  const int *incy, double *A, const int *lda);
+extern void strsv_ (const char *uplo, const char *trans, const char *diag, 
+                   const int *n, const float *A, const int *lda, float *x, 
+                   const int *incx);
+extern void strmm_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int *m, const int *n,
+                 const float *alpha, const float *A, const int *lda,
+                 float *B, const int *ldb);
+extern void dtrmm_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int *m, const int *n,
+                 const double *alpha, const double *A, const int *lda,
+                 double *B, const int *ldb);
+extern void strmv_(const char *uplo, const char *transA, const char *diag,
+                 const int *n, const float *A, const int *lda, float *X,
+                 const int *incX);
+extern void saxpy_(const int *n, const float *alpha, const float *X, const int *incX,
+		float *Y, const int *incy);
+extern void daxpy_(const int *n, const double *alpha, const double *X, const int *incX,
+		double *Y, const int *incy);
+extern int isamax_(const int *n, const float *X, const int *incX);
+extern int idamax_(const int *n, const double *X, const int *incX);
+/* for some reason, FLOATRET is not a float but a double in GOTOBLAS */
+extern double sdot_(const int *n, const float *x, const int *incx, const float *y, const int *incy);
+extern double ddot_(const int *n, const double *x, const int *incx, const double *y, const int *incy);
+extern void sswap_(const int *n, float *x, const int *incx, float *y, const int *incy);
+extern void dswap_(const int *n, double *x, const int *incx, double *y, const int *incy);
+
+#if (defined STARPU_MKL) || (defined STARPU_ARMPL)
+extern void spotrf_(const char*uplo, const int *n, float *a, const int *lda, int *info);
+extern void dpotrf_(const char*uplo, const int *n, double *a, const int *lda, int *info);
+#endif
+
+#endif
+
+#endif /* __BLAS_H__ */

+ 6 - 0
julia/src/blas.jl

@@ -0,0 +1,6 @@
+@enum STARPU_BLAS begin
+    STARPU_SAXPY
+end
+
+cuda_blas_codelets = Dict(STARPU_SAXPY => "julia_saxpy_cuda_codelet")
+cpu_blas_codelets = Dict(STARPU_SAXPY => "julia_saxpy_cpu_codelet")

+ 35 - 0
julia/src/blas_wrapper.c

@@ -0,0 +1,35 @@
+#include <starpu.h>
+#include <blas.h>
+
+#if defined(STARPU_ATLAS) || defined(STARPU_OPENBLAS) || defined(STARPU_MKL)
+void julia_saxpy_cpu_codelet(void *descr[], void *arg)
+{
+	float alpha = *((float *)arg);
+
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	float *block_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+	float *block_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	STARPU_SAXPY((int)n, alpha, block_x, 1, block_y, 1);
+}
+#endif
+
+#ifdef STARPU_USE_CUDA
+
+#include <starpu_cublas_v2.h>
+
+void julia_saxpy_cuda_codelet(void *descr[], void *arg)
+{
+	float alpha = *((float *)arg);
+
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	float *block_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+	float *block_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	cublasStatus_t status = cublasSaxpy(starpu_cublas_get_local_handle(), (int)n, &alpha, block_x, 1, block_y, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+}
+#endif

+ 21 - 7
julia/src/task.jl

@@ -15,11 +15,11 @@
 #
 using ThreadPools
 
-struct jl_starpu_codelet
+mutable struct jl_starpu_codelet
     c_codelet :: starpu_codelet
     perfmodel :: starpu_perfmodel
-    cpu_func :: String
-    cuda_func :: String
+    cpu_func :: Union{String, STARPU_BLAS}
+    cuda_func :: Union{String, STARPU_BLAS}
     opencl_func :: String
     modes
 end
@@ -27,8 +27,8 @@ end
 global codelet_list = Vector{jl_starpu_codelet}()
 
 function starpu_codelet(;
-                        cpu_func :: String = "",
-                        cuda_func :: String = "",
+                        cpu_func :: Union{String, STARPU_BLAS} = "",
+                        cuda_func :: Union{String, STARPU_BLAS} = "",
                         opencl_func :: String = "",
                         modes = [],
                         perfmodel :: starpu_perfmodel,
@@ -58,8 +58,22 @@ function starpu_codelet(;
     output.c_codelet.nbuffers = length(modes)
     output.c_codelet.model = pointer_from_objref(perfmodel)
     output.c_codelet.color = color
-    output.c_codelet.cpu_func = load_starpu_function_pointer(cpu_func)
-    output.c_codelet.cuda_func = load_starpu_function_pointer(cuda_func)
+
+    if typeof(cpu_func) == STARPU_BLAS
+        output.cpu_func = cpu_blas_codelets[cpu_func]
+        output.c_codelet.cpu_func = load_wrapper_function_pointer(output.cpu_func)
+    else
+        output.c_codelet.cpu_func = load_starpu_function_pointer(cpu_func)
+    end
+
+    if typeof(cuda_func) == STARPU_BLAS
+        output.cuda_func = cuda_blas_codelets[cuda_func]
+        output.c_codelet.cuda_func = load_wrapper_function_pointer(output.cuda_func)
+        output.c_codelet.cuda_flags[1] = STARPU_CUDA_ASYNC
+    else
+        output.c_codelet.cuda_func = load_starpu_function_pointer(cuda_func)
+    end
+
     output.c_codelet.opencl_func = load_starpu_function_pointer(opencl_func)
 
     # Codelets must not be garbage collected before starpu shutdown is called.

+ 2 - 0
julia/src/translate_headers.jl

@@ -35,6 +35,7 @@ function starpu_translate_headers()
     end
 
     only_select_symbols = Set(["starpu_task",
+                               "starpu_cublas_init",
                                "starpu_codelet",
                                "starpu_data_filter",
                                "starpu_tag_t",
@@ -75,6 +76,7 @@ function starpu_translate_headers()
                                "starpu_iteration_pop",
                                "STARPU_CPU",
                                "STARPU_CUDA",
+                               "STARPU_CUDA_ASYNC",
                                "STARPU_OPENCL",
                                "STARPU_MAIN_RAM",
                                "STARPU_NMAXBUFS"])