Browse Source

julia: Add support for cublas.

Pierre Huchant 5 years ago
parent
commit
080efafb6e
3 changed files with 187 additions and 34 deletions
  1. 9 3
      julia/examples/axpy/axpy.jl
  2. 166 25
      julia/src/compiler/cuda.jl
  3. 12 6
      julia/src/compiler/file_generation.jl

+ 9 - 3
julia/examples/axpy/axpy.jl

@@ -26,6 +26,12 @@ function check(alpha, X, Y)
     end
 end
 
+@target STARPU_CPU+STARPU_CUDA
+@codelet function axpy(X :: Vector{Float32}, Y :: Vector{Float32}, alpha ::Float32) :: Nothing
+    STARPU_SAXPY(length(X), alpha, X, 1, Y, 1)
+    return
+end
+
 function axpy(N, NBLOCKS, alpha, display = true)
     X = Array(fill(1.0f0, N))
     Y = Array(fill(4.0f0, N))
@@ -41,8 +47,9 @@ function axpy(N, NBLOCKS, alpha, display = true)
     )
 
     cl = starpu_codelet(
-        cpu_func = STARPU_SAXPY,
-        cuda_func = STARPU_SAXPY,
+        cpu_func = CPU_CODELETS["axpy"],
+        cuda_func = CUDA_CODELETS["axpy"],
+        #cuda_func = STARPU_SAXPY,
         modes = [STARPU_R, STARPU_RW],
         perfmodel = perfmodel
     )
@@ -75,7 +82,6 @@ function axpy(N, NBLOCKS, alpha, display = true)
 
     if display
         @printf("timing -> %d us %.2f MB/s\n", timing, 3*N*4/timing)
-        # println("timing -> ", timing, " us ", floor(3*N*4/timing), "MB/s")
         println("AFTER y[0] = ", Y[1], " (ALPHA=", alpha, ")")
     end
 

+ 166 - 25
julia/src/compiler/cuda.jl

@@ -144,7 +144,145 @@ function add_device_to_interval_call(expr :: StarpuExpr)
     return apply(func_to_apply, expr)
 end
 
+function translate_cublas(expr :: StarpuExpr)
+    function func_to_run(x :: StarpuExpr)
+        # STARPU_BLAS => (CUBLAS, TRANS, FILLMODE, ALPHA, SIDE, DIAG)
+        blas_to_cublas = Dict(:STARPU_SGEMM  => (:cublasSgemm, [1, 2], [], [6, 11], [], []),
+                              :STARPU_DGEMM  => (:cublasDgemm, [1, 2], [], [6, 11], [], []),
+                              :STARPU_SGEMV  => (:cublasSgemv, [1], [], [4,9], [], []),
+                              :STARPU_DGEMV  => (:cublasDgemv, [1], [], [4,9], [], []),
+                              :STARPU_SSCAL  => (:cublasSscal, [], [], [2], [], []),
+                              :STARPU_DSCAL  => (:cublasDscal, [], [], [2], [], []),
+                              :STARPU_STRSM  => (:cublasStrsm, [3], [2], [7,10], [1], [4]),
+                              :STARPU_DTRSM  => (:cublasDtrsm, [3], [2], [7,10], [1], [4]),
+                              :STARPU_SSYR   => (:cublasSsyr, [], [1], [3], [], []),
+                              :STARPU_SSYRK  => (:cublasSsyrk, [2], [1], [5,8], [], []),
+                              :STARPU_SGER   => (:cublasSger, [], [], [3], [], []),
+                              :STARPU_DGER   => (:cublasDger, [], [], [3], [], []),
+                              :STARPU_STRSV  => (:cublasStrsv, [2], [1], [], [], [3]),
+                              :STARPU_STRMM  => (:cublasStrmm, [3], [2], [7], [1], [4]),
+                              :STARPU_DTRMM  => (:cublasDtrmm, [3], [2], [7], [1], [4]),
+                              :STARPU_STRMV  => (:cublasStrmv, [2], [1], [], [], [3]),
+                              :STARPU_SAXPY  => (:cublasSaxpy, [], [], [2], [], []),
+                              :STARPU_DAXPY  => (:cublasDaxpy, [], [], [2], [], []),
+                              :STARPU_SSWAP  => (:cublasSswap, [], [], [], [], []),
+                              :STARPU_DSWAP  => (:cublasDswap, [], [], [], [], []))
+
+        if !(isa(x, StarpuExprCall) && x.func in keys(blas_to_cublas))
+            return x
+        end
+
+        new_args = x.args
+
+        # cublasOperation_t parameters (e.g. StarpuExprValue("N")
+        for i in blas_to_cublas[x.func][2]
+            if !isa(new_args[i], StarpuExprValue) || !isa(new_args[i].value, String)
+                error("Argument $i of ", x.func, " must be a string")
+            end
+
+            value = new_args[i].value
+
+            if value == "N" || value == "n"
+                new_args[i] = StarpuExprVar(:CUBLAS_OP_N)
+            elseif value == "T" || value == "t"
+                new_args[i] = StarpuExprVar(:CUBLAS_OP_T)
+            elseif value == "C" || value == "c"
+                new_args[i] = StarpuExprVar(:CUBLAS_OP_C)
+            else
+                error("Unhandled value for rgument $i of ", x.func, ": ", value,
+                      "expecting (\"N\", \"T\", or \"C\")")
+            end
+        end
+
+        # cublasFillMode_t parameters (e.g. StarpuExprValue("L")
+        for i in blas_to_cublas[x.func][3]
+            if !isa(new_args[i], StarpuExprValue) || !isa(new_args[i].value, String)
+                error("Argument $i of ", x.func, " must be a string")
+            end
 
+            value = new_args[i].value
+
+            if value == "L" || value == "l"
+                new_args[i] = StarpuExprVar(:CUBLAS_FILL_MODE_LOWER)
+            elseif value == "U" || value == "u"
+                new_args[i] = StarpuExprVar(:CUBLAS_FILL_MODE_UPPER)
+            else
+                error("Unhandled value for rgument $i of ", x.func, ": ", value,
+                      "expecting (\"L\" or \"U\")")
+            end
+        end
+
+        # scalar parameters (alpha, beta, ...):  alpha -> &alpha
+        for i in blas_to_cublas[x.func][4]
+            if !isa(new_args[i], StarpuExprVar)
+                error("Argument $i of ", x.func, " must be a variable")
+            end
+            var_name = new_args[i].name
+            new_args[i] = StarpuExprVar(Symbol("&$var_name"))
+        end
+
+        # cublasSideMode_t parameters (e.g. StarpuExprValue("L")
+        for i in blas_to_cublas[x.func][5]
+            if !isa(new_args[i], StarpuExprValue) || !isa(new_args[i].value, String)
+                error("Argument $i of ", x.func, " must be a string")
+            end
+
+            value = new_args[i].value
+
+            if value == "L" || value == "l"
+                new_args[i] = StarpuExprVar(:CUBLAS_SIDE_LEFT)
+            elseif value == "R" || value == "r"
+                new_args[i] = StarpuExprVar(:CUBLAS_SIDE_RIGHT)
+            else
+                error("Unhandled value for rgument $i of ", x.func, ": ", value,
+                      "expecting (\"L\" or \"R\")")
+            end
+        end
+
+        # cublasSideMode_t parameters (e.g. StarpuExprValue("L")
+        for i in blas_to_cublas[x.func][5]
+            if !isa(new_args[i], StarpuExprValue) || !isa(new_args[i].value, String)
+                error("Argument $i of ", x.func, " must be a string")
+            end
+
+            value = new_args[i].value
+
+            if value == "L" || value == "l"
+                new_args[i] = StarpuExprVar(:CUBLAS_SIDE_LEFT)
+            elseif value == "R" || value == "r"
+                new_args[i] = StarpuExprVar(:CUBLAS_SIDE_RIGHT)
+            else
+                error("Unhandled value for rgument $i of ", x.func, ": ", value,
+                      "expecting (\"L\" or \"R\")")
+            end
+        end
+
+        # cublasDiag_Typet parameters (e.g. StarpuExprValue("N")
+        for i in blas_to_cublas[x.func][6]
+            if !isa(new_args[i], StarpuExprValue) || !isa(new_args[i].value, String)
+                error("Argument $i of ", x.func, " must be a string")
+            end
+
+            value = new_args[i].value
+
+            if value == "N" || value == "n"
+                new_args[i] = StarpuExprVar(:CUBLAS_DIAG_NON_UNIT)
+            elseif value == "U" || value == "u"
+                new_args[i] = StarpuExprVar(:CUBLAS_DIAG_UNIT)
+            else
+                error("Unhandled value for rgument $i of ", x.func, ": ", value,
+                      "expecting (\"N\" or \"U\")")
+            end
+        end
+
+        new_args = [@parse(starpu_cublas_get_local_handle()), x.args...]
+
+        return StarpuExprBlock([StarpuExprCall(blas_to_cublas[x.func][1], new_args),
+                                @parse cudaStreamSynchronize(starpu_cuda_get_local_stream())])
+    end
+
+    return apply(func_to_run, expr)
+end
 
 function transform_to_cuda_kernel(func :: StarpuExprFunction)
 
@@ -152,45 +290,48 @@ function transform_to_cuda_kernel(func :: StarpuExprFunction)
 
     init, indep, finish = extract_init_indep_finish(cpu_func.body)
 
-    if indep == nothing
-        error("No independant for loop has been found") # TODO can fail because extraction is not correct yet
-    end
+    cpu_instr = init
+    kernel = nothing
 
-    prekernel_instr, kernel_args, kernel_instr = analyse_sets(indep)
+    # Generate a CUDA kernel only if there is an independent loop (@parallel macro).
+    if (indep != nothing)
+        prekernel_instr, kernel_args, kernel_instr = analyse_sets(indep)
 
-    kernel_call = StarpuExprCudaCall(:cudaKernel, (@parse nblocks), (@parse THREADS_PER_BLOCK), StarpuExpr[])
-    prekernel_instr = vcat(init, prekernel_instr)
-    kernel_instr = vcat(kernel_instr, indep.body)
+        kernel_call = StarpuExprCudaCall(:cudaKernel, (@parse nblocks), (@parse THREADS_PER_BLOCK), StarpuExpr[])
+        cpu_instr = vcat(cpu_instr, prekernel_instr)
+        kernel_instr = vcat(kernel_instr, indep.body)
 
-    indep_for_def, indep_for_undef = analyse_variable_declarations(StarpuExprBlock(kernel_instr), kernel_args)
-    prekernel_def, prekernel_undef = analyse_variable_declarations(StarpuExprBlock(prekernel_instr), cpu_func.args)
+        indep_for_def, indep_for_undef = analyse_variable_declarations(StarpuExprBlock(kernel_instr), kernel_args)
+        prekernel_def, prekernel_undef = analyse_variable_declarations(StarpuExprBlock(cpu_instr), cpu_func.args)
 
-    for undef_var in indep_for_undef
+        for undef_var in indep_for_undef
 
-        found_var = find_variable(undef_var, prekernel_def)
+            found_var = find_variable(undef_var, prekernel_def)
 
-        if found_var == nothing # TODO : error then ?
-            continue
+            if found_var == nothing # TODO : error then ?
+                continue
+            end
+
+            push!(kernel_args, found_var)
         end
 
-        push!(kernel_args, found_var)
+        call_args = map((x -> StarpuExprVar(x.name)), kernel_args)
+        kernelname=Symbol("KERNEL_",func.func);
+        cuda_call = StarpuExprCudaCall(kernelname, (@parse nblocks), (@parse THREADS_PER_BLOCK), call_args)
+        push!(cpu_instr, cuda_call)
+        push!(cpu_instr, @parse cudaStreamSynchronize(starpu_cuda_get_local_stream()))
+        kernel = StarpuExprFunction(Nothing, kernelname, kernel_args, StarpuExprBlock(kernel_instr))
+        kernel = add_device_to_interval_call(kernel)
+        kernel = flatten_blocks(kernel)
     end
 
-    call_args = map((x -> StarpuExprVar(x.name)), kernel_args)
-    kernelname=Symbol("KERNEL_",func.func);
-    cuda_call = StarpuExprCudaCall(kernelname, (@parse nblocks), (@parse THREADS_PER_BLOCK), call_args)
-    push!(prekernel_instr, cuda_call)
-    push!(prekernel_instr, @parse cudaStreamSynchronize(starpu_cuda_get_local_stream()))
-    prekernel_instr = vcat(prekernel_instr, finish)
+    cpu_instr = vcat(cpu_instr, finish)
 
     prekernel_name = Symbol("CUDA_", func.func)
-    prekernel = StarpuExprFunction(Nothing, prekernel_name, cpu_func.args, StarpuExprBlock(prekernel_instr))
+    prekernel = StarpuExprFunction(Nothing, prekernel_name, cpu_func.args, StarpuExprBlock(cpu_instr))
+    prekernel = translate_cublas(prekernel)
     prekernel = flatten_blocks(prekernel)
 
-    kernel = StarpuExprFunction(Nothing, kernelname, kernel_args, StarpuExprBlock(kernel_instr))
-    kernel = add_device_to_interval_call(kernel)
-    kernel = flatten_blocks(kernel)
-    
     return prekernel, kernel
 end
 

+ 12 - 6
julia/src/compiler/file_generation.jl

@@ -18,6 +18,8 @@ const cpu_kernel_file_start = "#include <stdio.h>
 #include <starpu.h>
 #include <math.h>
 
+#include \"blas.h\"
+
 static inline long long jlstarpu_max(long long a, long long b)
 {
 	return (a > b) ? a : b;
@@ -38,15 +40,16 @@ const cuda_kernel_file_start = "#include <stdio.h>
 #include <stdint.h>
 #include <starpu.h>
 #include <math.h>
+#include <starpu_cublas_v2.h>
 
 #define THREADS_PER_BLOCK 64
 
-static inline long long jlstarpu_max(long long a, long long b)
+__attribute__((unused)) static inline long long jlstarpu_max(long long a, long long b)
 {
 	return (a > b) ? a : b;
 }
 
-static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
+__attribute__((unused)) static inline long long jlstarpu_interval_size(long long start, long long step, long long stop)
 {
     if (stop >= start){
             return jlstarpu_max(0, (stop - start + 1) / step);
@@ -56,12 +59,12 @@ static inline long long jlstarpu_interval_size(long long start, long long step,
 }
 
 
-__device__ static inline long long jlstarpu_max__device(long long a, long long b)
+__attribute__((unused)) __device__ static inline long long jlstarpu_max__device(long long a, long long b)
 {
 	return (a > b) ? a : b;
 }
 
-__device__ static inline long long jlstarpu_interval_size__device(long long start, long long step, long long stop)
+__attribute__((unused)) __device__ static inline long long jlstarpu_interval_size__device(long long start, long long step, long long stop)
 {
 	if (stop >= start){
 		return jlstarpu_max__device(0, (stop - start + 1) / step);
@@ -70,7 +73,6 @@ __device__ static inline long long jlstarpu_interval_size__device(long long star
 	}
 }
 
-
 "
 
 """
@@ -130,7 +132,11 @@ macro codelet(x)
         kernel_file = open(generated_cuda_kernel_file_name, "w")
         debug_print("generating ", generated_cuda_kernel_file_name)
         print(kernel_file, cuda_kernel_file_start)
-        print(kernel_file, "__global__ ", kernel)
+
+        if kernel != nothing
+            print(kernel_file, "__global__ ", kernel)
+        end
+
         print(kernel_file, c_struct_param_decl)
         print(kernel_file, "\nextern \"C\" ", prekernel)
         close(kernel_file)