Forráskód Böngészése

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu

Samuel Thibault 5 éve
szülő
commit
05521004ac

+ 33 - 7
julia/StarPU.jl/src/StarPU.jl

@@ -473,7 +473,7 @@ mutable struct StarpuTask
     handles :: Vector{StarpuDataHandle}
     handle_pointers :: Vector{StarpuDataHandlePointer}
     synchronous :: Bool
-    cl_arg :: Union{Ref, Cvoid}
+    cl_arg # type depends on codelet
 
     c_task :: Ptr{Cvoid}
 
@@ -483,7 +483,7 @@ mutable struct StarpuTask
 
         Creates a new task which will run the specified codelet on handle buffers and cl_args data
     """
-    function StarpuTask(; cl :: Union{Cvoid, StarpuCodelet} = nothing, handles :: Vector{StarpuDataHandle} = StarpuDataHandle[], cl_arg :: Union{Ref, Cvoid} = nothing)
+    function StarpuTask(; cl :: Union{Cvoid, StarpuCodelet} = nothing, handles :: Vector{StarpuDataHandle} = StarpuDataHandle[], cl_arg = [])
 
         if (cl == nothing)
             error("\"cl\" field can't be empty when creating a StarpuTask")
@@ -493,7 +493,16 @@ mutable struct StarpuTask
 
         output.cl = cl
         output.handles = handles
-        output.cl_arg = cl_arg
+
+        # handle scalar_parameters
+        codelet_name = cl.cpu_func
+        scalar_parameters = CODELETS_SCALARS[codelet_name]
+        nb_scalar_required = length(scalar_parameters)
+        nb_scalar_provided = length(cl_arg)
+        if (nb_scalar_provided != nb_scalar_required)
+            error("$nb_scalar_provided scalar parameters provided but $nb_scalar_required are required by $codelet_name.")
+        end
+        output.cl_arg = create_param_struct_from_clarg(codelet_name, cl_arg)
 
         output.synchronous = false
         output.handle_pointers = StarpuDataHandlePointer[]
@@ -513,6 +522,23 @@ mutable struct StarpuTask
 
 end
 
+"""
+    create_param_struct_from_clarg(codelet_name, cl_arg)
+
+Build the parameter struct registered for `codelet_name` in `CODELETS_PARAMS_STRUCT`,
+passing the elements of `cl_arg` to its constructor in order.
+"""
+function create_param_struct_from_clarg(codelet_name, cl_arg)
+    struct_params_name = CODELETS_PARAMS_STRUCT[codelet_name]
+    # Resolve the struct by name and call its constructor with the real values.
+    # Pasting them into source text to be parsed would lose their types (a Char
+    # prints as a bare identifier) and would leave a global `output` behind.
+    # `eval`/`invokelatest` because the struct is defined at run time and may be
+    # newer than the caller's world age.
+    param_type = eval(struct_params_name)
+    return Base.invokelatest(param_type, cl_arg...)
+end
+
 """
     Structure used to update fields of the real C task structure 
 """
@@ -539,8 +565,8 @@ mutable struct StarpuTaskTranslator
             output.cl_arg = C_NULL
             output.cl_arg_size = 0
         else
-            output.cl_arg = pointer_from_objref(task.cl_arg) #TODO : Libc.malloc and cl_arg_free set to 1 ? but it should be done only when submitting
-            output.cl_arg_size = sizeof(eltype(task.cl_arg))
+            output.cl_arg = pointer_from_objref(task.cl_arg)
+            output.cl_arg_size = sizeof(task.cl_arg)
         end
 
         return output
@@ -815,7 +841,7 @@ end
     Creates and submits an asynchronous task running cl Codelet function.
     Ex : @starpu_async_cl cl(handle1, handle2)
 """
-macro starpu_async_cl(expr,modes)
+macro starpu_async_cl(expr,modes,cl_arg=[])
 
     if (!isa(expr, Expr) || expr.head != :call)
         error("Invalid task submit syntax")
@@ -839,7 +865,7 @@ macro starpu_async_cl(expr,modes)
     handles = Expr(:vect, expr.args[2:end]...)
     #dump(handles)
     quote
-        task = StarpuTask(cl = $(esc(cl)), handles = $(esc(handles)))
+        task = StarpuTask(cl = $(esc(cl)), handles = $(esc(handles)), cl_arg=cl_arg)
         starpu_task_submit(task)
     end
 end

+ 44 - 31
julia/StarPU.jl/src/compiler/c.jl

@@ -1,5 +1,3 @@
-
-
 """
     Returns the list of instruction that will be added before for loop of shape
         "for for_index_var in set ..."
@@ -56,12 +54,7 @@ function add_for_loop_declarations(expr :: StarpuExpr)
     return apply(func_to_apply, expr)
 end
 
-
-
-
-
 function transform_to_cpu_kernel(expr :: StarpuExprFunction)
-
     output = add_for_loop_declarations(expr)
     output = substitute_args(output)
     output = substitute_func_calls(output)
@@ -71,7 +64,20 @@ function transform_to_cpu_kernel(expr :: StarpuExprFunction)
     return output
 end
 
+"""
+    generate_c_struct_param_declaration(funcname)
+
+Return the C source text declaring the struct that carries the scalar
+parameters of codelet `funcname`: one field per entry of
+`CODELETS_SCALARS[funcname]`, in a struct named `CODELETS_PARAMS_STRUCT[funcname]`.
+"""
+function generate_c_struct_param_declaration(funcname)
+    scalars = CODELETS_SCALARS[funcname]
+    struct_params_name = CODELETS_PARAMS_STRUCT[funcname]
+
+    io = IOBuffer()
+    print(io, "struct $struct_params_name {\n")
+    for (arg_name, arg_type) in scalars
+        # one tab-indented "<C type> <name>;" line per scalar parameter
+        print(io, "\t", starpu_type_traduction(arg_type), " $arg_name;\n")
+    end
+    print(io, "};\n\n")
 
+    return String(take!(io))
+end
 
 function flatten_blocks(expr :: StarpuExpr)
 
@@ -130,46 +136,55 @@ end
 
 
 function substitute_args(expr :: StarpuExprFunction)
-
     new_body = expr.body
     func_id = rand_string()
     buffer_arg_name = Symbol("buffers_", func_id)
     cl_arg_name = Symbol("cl_arg_", func_id)
-    post = false
     function_start_affectations = StarpuExpr[]
 
+    buffer_id = 1
+    scalar_id = 1
+
+    # get scalar parameters and structure name
+    scalar_parameters = CODELETS_SCALARS[string(expr.func)]
+    struct_params_name = CODELETS_PARAMS_STRUCT[string(expr.func)]
+
     for i in (1 : length(expr.args))
 
         var_id = rand_string()
         ptr = Symbol(:ptr_, var_id)
         var_name = ptr
-        
+
         if (expr.args[i].typ <: Vector)
             func_interface = :STARPU_VECTOR_GET_PTR
+            type_in_arg = eltype(expr.args[i].typ)
+            new_affect = starpu_parse( :($ptr :: Ptr{$type_in_arg} = $func_interface($buffer_arg_name[$buffer_id])) )
+            push!(function_start_affectations, new_affect)
+            new_body = substitute_argument_usage(new_body, buffer_id, buffer_arg_name, expr.args[i].name, var_name)
+            buffer_id += 1
         elseif (expr.args[i].typ <: Matrix)
             func_interface = :STARPU_MATRIX_GET_PTR
             ld_name = Symbol("ld_", var_id)
-            post_affect = starpu_parse( :($ld_name :: UInt32 = STARPU_MATRIX_GET_LD($buffer_arg_name[$i])) )
-            post=true
-            
-        elseif (expr.args[i].typ <: Float32)
-            func_interface = :STARPU_VARIABLE_GET_PTR
-            var_name = Symbol("scal_", var_id)
-            post_affect = starpu_parse( :($var_name :: Float32 = ($ptr[0])) )
-            post = true
-            
-        end
-        #else
-            #error("Task arguments must be either vector or matrix (got $(expr.args[i].typ))") #TODO : cl_args, variable ?
-        #end
-
-        type_in_arg = eltype(expr.args[i].typ)
-        new_affect = starpu_parse( :($ptr :: Ptr{$type_in_arg} = $func_interface($buffer_arg_name[$i])) )
-        push!(function_start_affectations, new_affect)
-        if (post)
+            post_affect = starpu_parse( :($ld_name :: UInt32 = STARPU_MATRIX_GET_LD($buffer_arg_name[$buffer_id])) )
+            type_in_arg = eltype(expr.args[i].typ)
+            new_affect = starpu_parse( :($ptr :: Ptr{$type_in_arg} = $func_interface($buffer_arg_name[$buffer_id])) )
+            push!(function_start_affectations, new_affect)
+            push!(function_start_affectations, post_affect)
+            new_body = substitute_argument_usage(new_body, buffer_id, buffer_arg_name, expr.args[i].name, var_name)
+            buffer_id += 1
+        elseif (expr.args[i].typ <: Number || expr.args[i].typ <: AbstractChar)
+            type_in_arg = eltype(expr.args[i].typ)
+            field_name = scalar_parameters[scalar_id][1]
+            var_name = field_name
+            post_affect = starpu_parse( :($var_name :: $type_in_arg = *($ptr).$field_name))
+            new_affect = starpu_parse( :($ptr :: Ptr{$struct_params_name} = $cl_arg_name))
+            push!(function_start_affectations, new_affect)
             push!(function_start_affectations, post_affect)
+            scalar_id += 1
+        else
+            error("Task arguments must be either vector or matrix or scalr (got $(expr.args[i].typ))")
         end
-        new_body = substitute_argument_usage(new_body, i, buffer_arg_name, expr.args[i].name, var_name)
+
 
     end
 
@@ -183,8 +198,6 @@ function substitute_args(expr :: StarpuExprFunction)
     return StarpuExprFunction(expr.ret_type, expr.func, new_args, new_body)
 end
 
-
-
 func_substitution = Dict(
     :width => :STARPU_MATRIX_GET_NY,
     :height => :STARPU_MATRIX_GET_NX,

+ 11 - 16
julia/StarPU.jl/src/compiler/expressions.jl

@@ -1,3 +1,14 @@
+# Maps Julia types to the spelling of the corresponding C type.
+global starpu_type_traduction_dict = Dict(
+    Int32 => "int32_t",
+    UInt32 => "uint32_t",
+    Float32 => "float",
+    Int64 => "int64_t",
+    UInt64 => "uint64_t",
+    Float64 => "double",
+    Nothing => "void"
+)
+export starpu_type_traduction_dict
+
 
 #======================================================
                 AFFECTATION
@@ -841,22 +852,6 @@ function starpu_parse_typed(x :: Expr)
     return StarpuExprTypedExpr(expr, typ)
 end
 
-
-
-
-
-starpu_type_traduction_dict = Dict(
-    Int32 => "int32_t",
-    UInt32 => "uint32_t",
-    Float32 => "float",
-    Int64 => "int64_t",
-    UInt64 => "uint64_t",
-    Float64 => "double",
-    Nothing => "void"
-)
-
-
-
 function starpu_type_traduction(x)
     if x <: Array
         return starpu_type_traduction_array(x)

+ 35 - 0
julia/StarPU.jl/src/compiler/file_generation.jl

@@ -95,6 +95,11 @@ global CPU_CODELETS=Dict{String,String}()
 export CUDA_CODELETS
 global CUDA_CODELETS=Dict{String,String}()
 
+# Per codelet name: its scalar parameters as (name, type) tuples, filled by
+# parse_scalar_parameters.
+export CODELETS_SCALARS
+global CODELETS_SCALARS=Dict{String,Any}()
+# Per codelet name: the name (a Symbol) of the struct carrying those scalars,
+# also filled by parse_scalar_parameters.
+export CODELETS_PARAMS_STRUCT
+global CODELETS_PARAMS_STRUCT=Dict{String,Any}()
+
 """
 	    Executes @cuda_kernel and @cpu_kernel
         """
@@ -102,6 +107,7 @@ macro codelet(x)
     parsed = starpu_parse(x)
     name=string(x.args[1].args[1].args[1]);
     dump(name)
+    parse_scalar_parameters(parsed, name)
     cpu_expr = transform_to_cpu_kernel(parsed)
     prekernel, kernel = transform_to_cuda_kernel(parsed)
     generated_cpu_kernel_file_name=string("genc_",string(x.args[1].args[1].args[1]),".c")
@@ -113,6 +119,7 @@ macro codelet(x)
             kernel_file = open($(esc(generated_cpu_kernel_file_name)), "w")
             @debugprint "generating " $(generated_cpu_kernel_file_name)
             print(kernel_file, $(esc(cpu_kernel_file_start)))
+            print(kernel_file, generate_c_struct_param_declaration($name))
             print(kernel_file, $cpu_expr)
             close(kernel_file)
             CPU_CODELETS[$name]=$name
@@ -132,3 +139,31 @@ macro codelet(x)
         #global starpu_task_library_name
     end
 end
+
+"""
+    parse_scalar_parameters(expr, name)
+
+Record the scalar (`Number`/`AbstractChar`) arguments of codelet `name` in
+`CODELETS_SCALARS`, define a mutable struct holding one field per scalar, register
+its C spelling in `starpu_type_traduction_dict`, and store the struct's name in
+`CODELETS_PARAMS_STRUCT`.
+"""
+function parse_scalar_parameters(expr :: StarpuExprFunction, name::String)
+    is_scalar(t) = t <: Number || t <: AbstractChar
+    scalar_parameters = Any[(a.name, a.typ) for a in expr.args if is_scalar(a.typ)]
+    CODELETS_SCALARS[name] = scalar_parameters
+
+    # Build the struct definition as an expression rather than as source text, so
+    # field types are embedded as type objects instead of being printed and re-parsed.
+    struct_params_name = Symbol("params_", rand_string())
+    fields = [Expr(:(::), Symbol(p[1]), p[2]) for p in scalar_parameters]
+    eval(Expr(:struct, true, struct_params_name, Expr(:block, fields...)))
+
+    # eval again to fetch the freshly defined type: a direct lookup from this
+    # already-running function could predate the definition (world age).
+    param_type = eval(struct_params_name)
+    starpu_type_traduction_dict[param_type] = "struct $struct_params_name"
+
+    # save structure name
+    CODELETS_PARAMS_STRUCT[name] = struct_params_name
+end

+ 50 - 0
julia/vector_scal/Makefile

@@ -0,0 +1,50 @@
+# ICC compiler
+#CC =icc
+#CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -DSTRIDE=${STRIDE} -march=native $(shell pkg-config --cflags starpu-1.3)
+# GCC compiler
+CC=gcc
+CFLAGS += -g -O3 -mavx -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+EXTERNLIB=extern_tasks.so
+GENERATEDLIB=generated_tasks.so
+OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+LIBPATH=${PWD}/../StarPU.jl/lib
+
+# These targets name actions, not files.
+.PHONY: all clean test
+
+all: ${EXTERNLIB}
+
+vector_scal: vector_scal.c cpu_vector_scal.o #gpu_vector_scal.o
+	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
+
+gpu_vector_scal.o: gpu_vector_scal.cu
+	nvcc -c $(CFLAGS) $^ -o $@
+
+%.o: %.c
+	$(CC) -c $(CFLAGS) $^ -o $@
+
+# Libraries are listed after the inputs, as in the executable rules, so the
+# link does not depend on the linker keeping libraries it has not needed yet.
+${EXTERNLIB}: cpu_vector_scal.c
+	$(CC) $(CFLAGS) -shared -fPIC $^ -o $@ $(LDFLAGS)
+
+gpu_vector_scal.so: gpu_vector_scal.o
+	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+
+cpu_vector_scal_sa: cpu_vector_scal_sa.o
+	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
+
+${GENERATEDLIB}: ${OBJECTS}
+	$(CC) $(CFLAGS) -shared -fPIC $^ -o $@ $(LDFLAGS)
+
+# genc_*.c are the generated C kernels that OBJECTS picks up through gen*.c.
+clean:
+	rm -f vector_scal *.so *.o c_*.genc genc_*.c gencuda_*.cu *.dat
+
+# Performance Tests
+# LIBPATH is appended to LD_LIBRARY_PATH with an explicit ':' separator; the
+# 'VAR+=value' form is not POSIX sh and, where accepted, adds no separator.
+cstarpu.dat: vector_scal
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./vector_scal > $@
+julia_generatedc.dat:
+	LD_LIBRARY_PATH=$${LD_LIBRARY_PATH:+$${LD_LIBRARY_PATH}:}${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal.jl $@
+julia_native.dat:
+	LD_LIBRARY_PATH=$${LD_LIBRARY_PATH:+$${LD_LIBRARY_PATH}:}${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal_native.jl $@
+julia_calllib.dat: ${EXTERNLIB}
+	LD_LIBRARY_PATH=$${LD_LIBRARY_PATH:+$${LD_LIBRARY_PATH}:}${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal.jl julia_calllib.dat
+
+test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat

+ 42 - 0
julia/vector_scal/cpu_vector_scal.c

@@ -0,0 +1,42 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <starpu.h>
+#include <math.h>
+
+/* Scalar arguments of the task; vector_scal() reads them through cl_arg.
+ * NOTE(review): the field order and types presumably have to match the
+ * structure built on the Julia side - confirm. */
+struct params {
+  int32_t m;
+  float k;
+  float l;
+};
+
+/* CPU kernel: replaces every element x of the vector in buffers[0] by
+ * x * k + l + m, where m, k and l are read from the struct params that
+ * cl_arg points to. Always returns 0. */
+float vector_scal(void *buffers[], void *cl_arg)
+{
+  /* scalar parameters travel in cl_arg */
+  const struct params *args = cl_arg;
+  const int offset = args->m;
+  const float factor = args->k;
+  const float shift = args->l;
+
+  struct starpu_vector_interface *handle = buffers[0];
+
+  /* number of elements in the vector */
+  const unsigned count = STARPU_VECTOR_GET_NX(handle);
+
+  /* the interface does not carry an element type, so the local copy of the
+   * data has to be cast to float * here */
+  float *data = (float *)STARPU_VECTOR_GET_PTR(handle);
+
+  /* scale the vector in place */
+  unsigned idx = 0;
+  while (idx < count) {
+    data[idx] = data[idx] * factor + shift + offset;
+    idx++;
+  }
+
+  return 0.0;
+}
+
+/* Names handed back by starpu_find_function().
+ * NOTE(review): both say "mult" although this file defines vector_scal -
+ * confirm which symbol names the caller expects. */
+char* CPU = "cpu_mult";
+char* GPU = "gpu_mult";
+
+/* Returns GPU when device is the string "gpu", and CPU for any other device,
+ * including a NULL one. name is not consulted. */
+extern char *starpu_find_function(char *name, char *device) {
+	(void) name;
+	/* a NULL device would crash strcmp; treat it like any non-gpu device */
+	if (device != NULL && strcmp(device, "gpu") == 0) return GPU;
+	return CPU;
+}

+ 78 - 0
julia/vector_scal/vector_scal.jl

@@ -0,0 +1,78 @@
+import Libdl
+using StarPU
+using LinearAlgebra
+
+@target STARPU_CPU+STARPU_CUDA
+# Codelet that rewrites `v` in place: each element becomes v[i] * k + l + m.
+# `v` is the only vector argument; m, k and l are scalar arguments.
+@codelet function vector_scal(m::Int32, v :: Vector{Float32}, k :: Float32, l :: Float32) :: Float32
+
+    N :: Int32 = length(v)
+    # Naive version
+    @parallel for i in (1 : N)
+        v[i] = v[i] * k + l + m
+    end
+
+    return 0. :: Float32
+end
+
+
+# Start StarPU before the code below registers data or submits tasks.
+@debugprint "starpu_init"
+starpu_init()
+
+# Runs the vector_scal codelet on `v` ten times through StarPU, passing m, k
+# and l as scalar arguments, and returns the smallest elapsed time measured
+# with time_ns() (nanoseconds) around one run.
+function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32, l :: Float32)
+    # 0 means "no measurement yet"
+    tmin=0
+
+    @starpu_block let
+        hV = starpu_data_register(v)
+        tmin=0
+        perfmodel = StarpuPerfmodel(
+            perf_type = STARPU_HISTORY_BASED,
+            symbol = "history_perf"
+        )
+        cl = StarpuCodelet(
+            cpu_func = CPU_CODELETS["vector_scal"],
+            #cuda_func = "matrix_mult",
+            #opencl_func="ocl_matrix_mult",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+
+        for i in (1 : 10)
+            t=time_ns()
+            # presumably returns only once the submitted task has completed,
+            # so that t covers the whole execution - confirm
+            @starpu_sync_tasks begin
+                handles = [hV]
+                # the scalars are passed in the order m, k, l
+                task = StarpuTask(cl = cl, handles = handles, cl_arg=[m, k, l])
+                starpu_task_submit(task)
+            end
+            # @starpu_sync_tasks for task in (1:1)
+            #     @starpu_async_cl vector_scal(hV, STARPU_RW, [m, k, l])
+            # end
+            t=time_ns()-t
+            # keep the minimum over all runs
+            if (tmin==0 || tmin>t)
+                tmin=t
+            end
+        end
+    end
+    return tmin
+end
+
+# For each vector length in start_dim : step_dim : stop_dim, runs
+# vector_scal_with_starpu on a random Float32 vector, prints its first
+# elements before and after the run, and writes "<length> <result>" both to
+# `io` and to standard output, where <result> is the value returned by
+# vector_scal_with_starpu.
+function compute_times(io,start_dim, step_dim, stop_dim)
+    for size in (start_dim : step_dim : stop_dim)
+        V = rand(Cfloat, size)
+        m :: Int32 = 10
+        k :: Float32 = 2.
+        l :: Float32 = 3.
+        # Show at most ten elements, so that vectors shorter than ten do not
+        # raise a BoundsError.
+        shown = 1 : min(10, length(V))
+        println("INPUT ", V[shown])
+        mt =  vector_scal_with_starpu(V, m, k, l)
+        println("OUTPUT ", V[shown])
+        println(io,"$size $mt")
+        println("$size $mt")
+    end
+end
+
+
+# Write the measurements to the file named by the first command-line argument.
+# The do-block form closes the file even if compute_times throws.
+open(ARGS[1],"w") do io
+    compute_times(io,1024,1024,4096)
+end
+@debugprint "starpu_shutdown"
+starpu_shutdown()
+