
Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu

Samuel Thibault, 5 years ago
commit 06c4b71ab5

+ 28 - 10
julia/StarPU.jl/src/StarPU.jl

@@ -496,13 +496,26 @@ mutable struct StarpuTask
 
         # handle scalar_parameters
         codelet_name = cl.cpu_func
-        scalar_parameters = CODELETS_SCALARS[codelet_name]
-        nb_scalar_required = length(scalar_parameters)
-        nb_scalar_provided = length(cl_arg)
-        if (nb_scalar_provided != nb_scalar_required)
-            error("$nb_scalar_provided scalar parameters provided but $nb_scalar_required are required by $codelet_name.")
+        if isempty(codelet_name)
+            codelet_name = cl.cuda_func
+        end
+        if isempty(codelet_name)
+            codelet_name = cl.opencl_func
+        end
+        if isempty(codelet_name)
+            error("No function provided with codelet.")
+        end
+        scalar_parameters = get(CODELETS_SCALARS, codelet_name, nothing)
+        if scalar_parameters != nothing
+            nb_scalar_required = length(scalar_parameters)
+            nb_scalar_provided = length(cl_arg)
+            if (nb_scalar_provided != nb_scalar_required)
+                error("$nb_scalar_provided scalar parameters provided but $nb_scalar_required are required by $codelet_name.")
+            end
+            output.cl_arg = create_param_struct_from_clarg(codelet_name, cl_arg)
+        else
+            output.cl_arg = nothing
         end
-        output.cl_arg = create_param_struct_from_clarg(codelet_name, cl_arg)
 
         output.synchronous = false
         output.handle_pointers = StarpuDataHandlePointer[]
@@ -522,8 +535,13 @@ mutable struct StarpuTask
 
 end
 
-function create_param_struct_from_clarg(codelet_name, cl_arg)
-    struct_params_name = CODELETS_PARAMS_STRUCT[codelet_name]
+function create_param_struct_from_clarg(name, cl_arg)
+    struct_params_name = CODELETS_PARAMS_STRUCT[name]
+
+    if struct_params_name == false
+        error("structure name not found in CODELET_PARAMS_STRUCT")
+    end
+
     nb_scalar_provided = length(cl_arg)
     create_struct_param_str = "output = $struct_params_name("
     for i in 1:nb_scalar_provided-1
@@ -856,7 +874,7 @@ macro starpu_async_cl(expr,modes,cl_arg=[])
     println(CPU_CODELETS[string(expr.args[1])])
     cl = StarpuCodelet(
         cpu_func = CPU_CODELETS[string(expr.args[1])],
-        #cuda_func = "matrix_mult",
+        # cuda_func = CUDA_CODELETS[string(expr.args[1])],
         #opencl_func="ocl_matrix_mult",
         ### TODO: CORRECT !
         modes = map((x -> starpu_modes(x)),modes.args),
@@ -865,7 +883,7 @@ macro starpu_async_cl(expr,modes,cl_arg=[])
     handles = Expr(:vect, expr.args[2:end]...)
     #dump(handles)
     quote
-        task = StarpuTask(cl = $(esc(cl)), handles = $(esc(handles)), cl_arg=cl_arg)
+        task = StarpuTask(cl = $(esc(cl)), handles = $(esc(handles)), cl_arg=$(esc(cl_arg)))
         starpu_task_submit(task)
     end
 end
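
For reference, a minimal standalone sketch of the new lookup behaviour in StarpuTask shown above: the codelet name falls back from cpu_func to cuda_func to opencl_func, and a codelet with no entry in the scalar table is now treated as taking no scalar parameters instead of raising an error. FakeCodelet, resolve_codelet_name and check_scalar_args are hypothetical names used only for this sketch; they are not part of StarPU.jl.

struct FakeCodelet
    cpu_func    :: String
    cuda_func   :: String
    opencl_func :: String
end

function resolve_codelet_name(cl :: FakeCodelet)
    # Fall back across backends, mirroring the new logic in StarpuTask.
    name = cl.cpu_func
    isempty(name) && (name = cl.cuda_func)
    isempty(name) && (name = cl.opencl_func)
    isempty(name) && error("No function provided with codelet.")
    return name
end

function check_scalar_args(codelets_scalars :: Dict{String,Any}, name :: String, cl_arg)
    scalars = get(codelets_scalars, name, nothing)   # a missing entry no longer raises
    scalars === nothing && return nothing            # codelet takes no scalar parameters
    if length(cl_arg) != length(scalars)
        error("$(length(cl_arg)) scalar parameters provided but $(length(scalars)) are required by $name.")
    end
    return cl_arg
end

# Example: a CUDA-only codelet with two scalar parameters.
cl = FakeCodelet("", "CUDA_saxpy", "")
scalar_table = Dict{String,Any}("CUDA_saxpy" => [Float32, Float32])
check_scalar_args(scalar_table, resolve_codelet_name(cl), (2.0f0, 3.0f0))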

+ 13 - 7
julia/StarPU.jl/src/compiler/file_generation.jl

@@ -106,8 +106,11 @@ global CODELETS_PARAMS_STRUCT=Dict{String,Any}()
 macro codelet(x)
     parsed = starpu_parse(x)
     name=string(x.args[1].args[1].args[1]);
+    cpu_name = name
+    cuda_name = "CUDA_"*name
     dump(name)
-    parse_scalar_parameters(parsed, name)
+    parse_scalar_parameters(parsed, cpu_name, cuda_name)
+    c_struct_param_decl = generate_c_struct_param_declaration(name)
     cpu_expr = transform_to_cpu_kernel(parsed)
     prekernel, kernel = transform_to_cuda_kernel(parsed)
     generated_cpu_kernel_file_name=string("genc_",string(x.args[1].args[1].args[1]),".c")
@@ -119,10 +122,10 @@ macro codelet(x)
             kernel_file = open($(esc(generated_cpu_kernel_file_name)), "w")
             @debugprint "generating " $(generated_cpu_kernel_file_name)
             print(kernel_file, $(esc(cpu_kernel_file_start)))
-            print(kernel_file, generate_c_struct_param_declaration($name))
+            print(kernel_file, $c_struct_param_decl)
             print(kernel_file, $cpu_expr)
             close(kernel_file)
-            CPU_CODELETS[$name]=$name
+            CPU_CODELETS[$name]=$cpu_name
         end
         
         if ($targets&$STARPU_CUDA!=0)
@@ -130,9 +133,10 @@ macro codelet(x)
             @debugprint "generating " $(generated_cuda_kernel_file_name)
             print(kernel_file, $(esc(cuda_kernel_file_start)))
             print(kernel_file, "__global__ ", $kernel)
+            print(kernel_file, $c_struct_param_decl) # TODO: extern C ?
             print(kernel_file, "\nextern \"C\" ", $prekernel)
             close(kernel_file)
-            CUDA_CODELETS[$name]="CUDA_"*$name
+            CUDA_CODELETS[$name]=$cuda_name
         end
         print("end generation")
         #starpu_task_library_name="generated_tasks"
@@ -140,7 +144,7 @@ macro codelet(x)
     end
 end
 
-function parse_scalar_parameters(expr :: StarpuExprFunction, name::String)
+function parse_scalar_parameters(expr :: StarpuExprFunction, cpu_name::String, cuda_name::String)
     scalar_parameters = []
     for i in (1 : length(expr.args))
         type = expr.args[i].typ
@@ -149,7 +153,8 @@ function parse_scalar_parameters(expr :: StarpuExprFunction, name::String)
         end
     end
 
-    CODELETS_SCALARS[name] = scalar_parameters
+    CODELETS_SCALARS[cpu_name] = scalar_parameters
+    CODELETS_SCALARS[cuda_name] = scalar_parameters
 
     # declare structure carrying scalar parameters
     struct_params_name = Symbol("params_", rand_string())
@@ -165,5 +170,6 @@ function parse_scalar_parameters(expr :: StarpuExprFunction, name::String)
     eval(Meta.parse(add_to_dict_str))
 
     # save structure name
-    CODELETS_PARAMS_STRUCT[name] = struct_params_name
+    CODELETS_PARAMS_STRUCT[cpu_name] = struct_params_name
+    CODELETS_PARAMS_STRUCT[cuda_name] = struct_params_name
 end
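
A small self-contained sketch of what the double registration above amounts to: the CPU kernel name and its "CUDA_"-prefixed counterpart now share the same scalar-parameter list and the same generated struct name, so either backend's kernel name resolves in StarpuTask. The dictionaries and register_codelet_scalars! below are local stand-ins for illustration, not the real CODELETS_SCALARS / CODELETS_PARAMS_STRUCT tables.

codelets_scalars       = Dict{String,Any}()
codelets_params_struct = Dict{String,Any}()

function register_codelet_scalars!(name :: String, scalars, struct_name :: Symbol)
    for key in (name, "CUDA_" * name)        # one entry per backend-specific kernel name
        codelets_scalars[key]       = scalars
        codelets_params_struct[key] = struct_name
    end
end

# Scalar list mirroring the mandelbrot codelet further down in this commit.
register_codelet_scalars!("mandelbrot",
                          [(:centerr, Float64), (:centeri, Float64),
                           (:offset, Int64), (:dim, Int64)],
                          :params_mandelbrot)

@assert codelets_scalars["mandelbrot"] === codelets_scalars["CUDA_mandelbrot"]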

+ 28 - 17
julia/mandelbrot/Makefile

@@ -1,38 +1,49 @@
 CC=gcc
-CFLAGS += -Wall -Wextra -O3 -mavx -mfma -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
+
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
 
-LDFLAGS +=$(shell pkg-config --libs starpu-1.3) -lm
 EXTERNLIB=extern_tasks.so
 GENERATEDLIB=generated_tasks.so
-#OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
-OBJECTS=$(wildcard gen*.c)
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
 LIBPATH=${PWD}/../StarPU.jl/lib
 
 all: ${EXTERNLIB}
 
 mandelbrot: mandelbrot.c cpu_mandelbrot.o #gpu_mandelbrot.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-gpu_mandelbrot.o: gpu_mandelbrot.cu
-	nvcc -c $(CFLAGS) $^ -o $@
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
 
 %.o: %.c
-	$(CC) -c $(CFLAGS) $^ -o $@
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
 
 ${EXTERNLIB}: cpu_mandelbrot.c
 	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
-gpu_mandelbrot.so: gpu_mandelbrot.o
-	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
 
-cpu_mandelbrot_sa: cpu_mandelbrot_sa.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-${GENERATEDLIB}: ${OBJECTS}
-	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+.PHONY: clean
 
 clean:
-	rm -f mandelbrot *.so *.o c_*.genc gencuda_*.cu *.dat
+	rm -f mandelbrot *.so *.o genc_*.c gencuda_*.cu *.dat
 
 # Performance Tests
 cstarpu.dat: mandelbrot

+ 12 - 20
julia/mandelbrot/mandelbrot.jl

@@ -3,7 +3,7 @@ using StarPU
 using LinearAlgebra
 
 @target STARPU_CPU+STARPU_CUDA
-@codelet function mandelbrot(pixels ::Matrix{Int64}, params ::Matrix{Float32} ) :: Float32
+@codelet function mandelbrot(pixels ::Matrix{Int64}, centerr ::Float64, centeri ::Float64, offset ::Int64, dim ::Int64 ) :: Nothing
     height :: Int64 = height(pixels)
     width :: Int64 = width(pixels)
     zoom :: Float64 = width * 0.25296875
@@ -11,10 +11,6 @@ using LinearAlgebra
     diverge :: Float32 = 4.0
     max_iterations :: Float32 = ((width/2) * 0.049715909 * log10(zoom));
     imi :: Float32 = 1. / max_iterations
-    centerr :: Float32 = params[1,1]
-    centeri :: Float32 = params[2,1]
-    offset :: Float32 = params[3,1]
-    dim :: Float32 = params[4,1]
     cr :: Float64 = 0.
     zr :: Float64 = 0.
     ci :: Float64 = 0.
@@ -27,7 +23,10 @@ using LinearAlgebra
             zr = cr
             ci = centeri + (y-1+offset - (dim / 2)) * iz
             zi = ci
-            for n = 0:max_iterations
+            max_it :: Float64 = max_iterations
+            n = 0
+            for i = 0:max_it
+                n = i
                 if (zr*zr + zi*zi > diverge)
                     break
                 end
@@ -43,21 +42,21 @@ using LinearAlgebra
             end
         end
     end
-    return 0. :: Float32
+
+    return
 end
 
 @debugprint "starpu_init"
 starpu_init()
 
-function mandelbrot_with_starpu(A ::Matrix{Int64}, params ::Matrix{Float32}, nslicesx ::Int64)
+function mandelbrot_with_starpu(A ::Matrix{Int64}, cr ::Float64, ci ::Float64, dim ::Int64, nslicesx ::Int64)
     horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesx)
     @starpu_block let
-	hA, hP = starpu_data_register(A,params)
+	hA = starpu_data_register(A)
 	starpu_data_partition(hA,horiz)
-        starpu_data_partition(hP,horiz)
-        
+
 	@starpu_sync_tasks for taskx in (1 : nslicesx)
-                @starpu_async_cl mandelbrot(hA[taskx], hP[taskx]) [STARPU_W, STARPU_R]
+                @starpu_async_cl mandelbrot(hA[taskx]) [STARPU_W] [cr, ci, (taskx-1)*dim/nslicesx, dim]
 	end
     end
 end
@@ -79,16 +78,9 @@ function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
     tmin=0;
     
     pixels ::Matrix{Int64} = zeros(dim, dim)
-    params :: Matrix{Float32} = zeros(4*nslices,1)
-    for i=0:(nslices-1)
-        params[4*i+1,1] = cr
-        params[4*i+2,1] = ci
-        params[4*i+3,1] = i*dim/nslices
-        params[4*i+4,1] = dim
-    end
     for i = 1:10
         t = time_ns();
-        mandelbrot_with_starpu(pixels, params, nslices)
+        mandelbrot_with_starpu(pixels, cr, ci, dim, nslices)
         t = time_ns()-t
         if (tmin==0 || tmin>t)
             tmin=t

+ 10 - 20
julia/mandelbrot/mandelbrot_native.jl

@@ -1,16 +1,12 @@
 using LinearAlgebra
 
-function mandelbrot(pixels, params) :: Float32
+function mandelbrot(pixels, centerr ::Float64, centeri ::Float64, offset ::Int64, dim ::Int64) :: Nothing
     height :: Int64, width :: Int64 = size(pixels)
     zoom :: Float64 = width * 0.25296875
     iz :: Float64 = 1. / zoom
     diverge :: Float32 = 4.0
     max_iterations :: Float32 = ((width/2) * 0.049715909 * log10(zoom));
-    imi :: Float32 = 1. / max_iterations
-    centerr :: Float32 = params[1]
-    centeri :: Float32 = params[2]
-    offset :: Float32 = params[3]
-    dim :: Float32 = params[4]
+    imi :: Float64 = 1. / max_iterations
     cr :: Float64 = 0.
     zr :: Float64 = 0.
     ci :: Float64 = 0.
@@ -23,7 +19,9 @@ function mandelbrot(pixels, params) :: Float32
             zr = cr
             ci = centeri + (y-1+offset - (dim / 2)) * iz
             zi = ci
-            for n = 0:max_iterations
+            n = 0
+            for i = 0:max_iterations
+                n = i
                 if (zr*zr + zi*zi > diverge)
                     break
                 end
@@ -40,11 +38,10 @@ function mandelbrot(pixels, params) :: Float32
         end
     end
 
-    ret :: Float32 = 0.
-    return ret
+    return
 end
 
-function mandelbrot_without_starpu(A ::Matrix{Int64}, params ::Matrix{Float32}, nslicesx ::Int64)
+function mandelbrot_without_starpu(A ::Matrix{Int64}, cr ::Float64, ci ::Float64, dim ::Int64, nslicesx ::Int64)
     width,height = size(A)
     step = height / nslicesx
 
@@ -52,9 +49,9 @@ function mandelbrot_without_starpu(A ::Matrix{Int64}, params ::Matrix{Float32},
         start_id = floor(Int64, (taskx-1)*step+1)
         end_id = floor(Int64, (taskx-1)*step+step)
         a = view(A, start_id:end_id, :)
-        p = view(params, (taskx-1)*4+1:(taskx-1)*4+4)
 
-        mandelbrot(a, p)
+        offset ::Int64 = (taskx-1)*dim/nslicesx
+        mandelbrot(a, cr, ci, offset, dim)
     end
 end
 
@@ -75,16 +72,9 @@ function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
     tmin=0;
 
     pixels ::Matrix{Int64} = zeros(dim, dim)
-    params :: Matrix{Float32} = zeros(4*nslices,1)
-    for i=0:(nslices-1)
-        params[4*i+1,1] = cr
-        params[4*i+2,1] = ci
-        params[4*i+3,1] = i*dim/nslices
-        params[4*i+4,1] = dim
-    end
     for i = 1:10
         t = time_ns();
-        mandelbrot_without_starpu(pixels, params, nslices)
+        mandelbrot_without_starpu(pixels, cr, ci, dim, nslices)
         t = time_ns()-t
         if (tmin==0 || tmin>t)
             tmin=t

+ 28 - 16
julia/mult/Makefile

@@ -6,40 +6,52 @@ STRIDE=72
 #CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -DSTRIDE=${STRIDE} -march=native $(shell pkg-config --cflags starpu-1.3)
 # GCC compiler
 CC=gcc
-CFLAGS += -O3 -DSTRIDE=${STRIDE} -mavx -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
 
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g -DSTRIDE=${STRIDE} $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
 LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+
 EXTERNLIB=extern_tasks.so
 GENERATEDLIB=generated_tasks.so
-#OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
-OBJECTS=$(wildcard gen*.c)
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
+
 LIBPATH=${PWD}/../StarPU.jl/lib
 
 all: ${EXTERNLIB}
 
 mult: mult.c cpu_mult.o #gpu_mult.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-gpu_mult.o: gpu_mult.cu
-	nvcc -c $(CFLAGS) $^ -o $@
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
 
 %.o: %.c
-	$(CC) -c $(CFLAGS) $^ -o $@
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
 
 ${EXTERNLIB}: cpu_mult.c
 	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
-gpu_mult.so: gpu_mult.o
-	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
 
-cpu_mult_sa: cpu_mult_sa.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-${GENERATEDLIB}: ${OBJECTS}
-	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+.PHONY: clean
 
 clean:
-	rm -f mult *.so *.o c_*.genc gencuda_*.cu *.dat
+	rm -f mult *.so *.o genc_*.c gencuda_*.cu *.dat
 
 # Performance Tests
 cstarpu.dat: mult

+ 42 - 42
julia/mult/mult.jl

@@ -6,54 +6,54 @@ using LinearAlgebra
 const STRIDE = 72
 
 @target STARPU_CPU+STARPU_CUDA
-@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Float32
+@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Nothing
 
     width_m2 :: Int32 = width(m2)
     height_m1 :: Int32 = height(m1)
     width_m1 :: Int32 = width(m1)
     # Naive version
-    #@parallel for j in (1 : width_m2)
-    #    @parallel for i in (1 : height_m1)
-    #
-    #          sum :: Float32 = 0.
-
-    #          for k in (1 : width_m1)
-    #              sum = sum + m1[i, k] * m2[k, j]
-    #          end
+    @parallel for j in (1 : width_m2)
+       @parallel for i in (1 : height_m1)
     
-    #          m3[i, j] = sum
-    #      end
-    #  end
-    ##### Tiled and unrolled version 
-    for l in (1 : width_m2)
-        for m in (1 : height_m1)
-            m3[m,l] = 0
-        end
-    end
-    @parallel for i in (1 : STRIDE : height_m1)
-        for k in (1 : STRIDE : width_m1 )
-            for j in (1 : STRIDE : width_m2  )
-                for kk in (k : 4 : k+STRIDE-1)
-                    for jj in (j : 2 : j+STRIDE-1)
-                        alpha00 :: Float32 =m2[kk,jj]
-                        alpha01 :: Float32 =m2[kk,jj+1]
-                        alpha10 :: Float32 =m2[kk+1,jj]
-                        alpha11 :: Float32 =m2[kk+1,jj+1]
-                        alpha20 :: Float32 =m2[kk+2,jj]
-                        alpha21 :: Float32 =m2[kk+2,jj+1]
-                        alpha30 :: Float32 =m2[kk+3,jj]
-                        alpha31 :: Float32 =m2[kk+3,jj+1]
-                        for ii in (i : 1 : i+STRIDE-1) 
-                            m3[ii, jj] = m3[ii, jj] + m1[ii, kk] * alpha00 + m1[ii, kk+1] * alpha10 + m1[ii, kk+2] * alpha20 + m1[ii,kk+3]*alpha30
-                            m3[ii, jj+1] = m3[ii, jj+1] + m1[ii, kk] * alpha01 + m1[ii, kk+1] * alpha11 + m1[ii, kk+2]*alpha21 + m1[ii,kk+3]*alpha31 
-                        end
-                    end
-                end
-            end
-        end
-    end
+             sum :: Float32 = 0.
 
-    return 0. :: Float32
+             for k in (1 : width_m1)
+                 sum = sum + m1[i, k] * m2[k, j]
+             end
+    
+             m3[i, j] = sum
+         end
+     end
+    # ##### Tiled and unrolled version 
+    # for l in (1 : width_m2)
+    #     for m in (1 : height_m1)
+    #         m3[m,l] = 0
+    #     end
+    # end
+    # @parallel for i in (1 : STRIDE : height_m1)
+    #     for k in (1 : STRIDE : width_m1 )
+    #         for j in (1 : STRIDE : width_m2  )
+    #             for kk in (k : 4 : k+STRIDE-1)
+    #                 for jj in (j : 2 : j+STRIDE-1)
+    #                     alpha00 :: Float32 =m2[kk,jj]
+    #                     alpha01 :: Float32 =m2[kk,jj+1]
+    #                     alpha10 :: Float32 =m2[kk+1,jj]
+    #                     alpha11 :: Float32 =m2[kk+1,jj+1]
+    #                     alpha20 :: Float32 =m2[kk+2,jj]
+    #                     alpha21 :: Float32 =m2[kk+2,jj+1]
+    #                     alpha30 :: Float32 =m2[kk+3,jj]
+    #                     alpha31 :: Float32 =m2[kk+3,jj+1]
+    #                     for ii in (i : 1 : i+STRIDE-1) 
+    #                         m3[ii, jj] = m3[ii, jj] + m1[ii, kk] * alpha00 + m1[ii, kk+1] * alpha10 + m1[ii, kk+2] * alpha20 + m1[ii,kk+3]*alpha30
+    #                         m3[ii, jj+1] = m3[ii, jj+1] + m1[ii, kk] * alpha01 + m1[ii, kk+1] * alpha11 + m1[ii, kk+2]*alpha21 + m1[ii,kk+3]*alpha31 
+    #                     end
+    #                 end
+    #             end
+    #         end
+    #     end
+    # end
+
+    return
 end
 
 
@@ -77,7 +77,7 @@ function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: M
         )
         cl = StarpuCodelet(
             cpu_func = CPU_CODELETS["matrix_mult"],
-            #cuda_func = "matrix_mult",
+            # cuda_func = CUDA_CODELETS["matrix_mult"],
             #opencl_func="ocl_matrix_mult",
             modes = [STARPU_R, STARPU_R, STARPU_W],
             perfmodel = perfmodel

+ 27 - 19
julia/vector_scal/Makefile

@@ -1,41 +1,49 @@
-# ICC compiler
-#CC =icc
-#CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -DSTRIDE=${STRIDE} -march=native $(shell pkg-config --cflags starpu-1.3)
-# GCC compiler
 CC=gcc
-CFLAGS += -g -O3 -mavx -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
 
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
 LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+
 EXTERNLIB=extern_tasks.so
 GENERATEDLIB=generated_tasks.so
-OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
 LIBPATH=${PWD}/../StarPU.jl/lib
 
 all: ${EXTERNLIB}
 
 vector_scal: vector_scal.c cpu_vector_scal.o #gpu_vector_scal.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-gpu_vector_scal.o: gpu_vector_scal.cu
-	nvcc -c $(CFLAGS) $^ -o $@
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
 
 %.o: %.c
-	$(CC) -c $(CFLAGS) $^ -o $@
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
 
 ${EXTERNLIB}: cpu_vector_scal.c
 	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
-gpu_vector_scal.so: gpu_vector_scal.o
-	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
 
-cpu_vector_scal_sa: cpu_vector_scal_sa.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-${GENERATEDLIB}: ${OBJECTS}
-	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+.PHONY: clean
 
 clean:
-	rm -f vector_scal *.so *.o c_*.genc gencuda_*.cu *.dat
+	rm -f vector_scal *.so *.o genc_*.c gencuda_*.cu *.dat
 
 # Performance Tests
 cstarpu.dat: vector_scal

+ 3 - 3
julia/vector_scal/cpu_vector_scal.c

@@ -9,7 +9,7 @@ struct params {
   float l;
 };
 
-float vector_scal(void *buffers[], void *cl_arg)
+float cpu_vector_scal(void *buffers[], void *cl_arg)
 {
   /* get scalar parameters from cl_arg */
   struct params *scalars = (struct params *) cl_arg;
@@ -34,8 +34,8 @@ float vector_scal(void *buffers[], void *cl_arg)
   return 0.0;
 }
 
-char* CPU = "cpu_mult";
-char* GPU = "gpu_mult";
+char* CPU = "cpu_vector_scal";
+char* GPU = "gpu_vector_scal";
 extern char *starpu_find_function(char *name, char *device) {
 	if (!strcmp(device,"gpu")) return GPU;
 	return CPU;

+ 3 - 5
julia/vector_scal/vector_scal.jl

@@ -8,10 +8,8 @@ using LinearAlgebra
     N :: Int32 = length(v)
     # Naive version
     @parallel for i in (1 : N)
-        v[i] = v[i] * k + l + m
+        v[i] = v[i] * m + l + k
     end
-
-    return 0. :: Float32
 end
 
 
@@ -30,13 +28,13 @@ function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32,
         )
         cl = StarpuCodelet(
             cpu_func = CPU_CODELETS["vector_scal"],
-            #cuda_func = "matrix_mult",
+            # cuda_func = CUDA_CODELETS["vector_scal"],
             #opencl_func="ocl_matrix_mult",
             modes = [STARPU_RW],
             perfmodel = perfmodel
         )
 
-        for i in (1 : 10)
+        for i in (1 : 1)
             t=time_ns()
             @starpu_sync_tasks begin
                 handles = [hV]