import Libdl using StarPU using LinearAlgebra @target STARPU_CPU+STARPU_CUDA @codelet function vector_scal(m::Int32, v :: Vector{Float32}, k :: Float32, l :: Float32) :: Float32 N :: Int32 = length(v) # Naive version @parallel for i in (1 : N) v[i] = v[i] * m + l + k end end starpu_init() function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32, l :: Float32) tmin=0 @starpu_block let hV = starpu_data_register(v) tmin=0 perfmodel = StarpuPerfmodel( perf_type = STARPU_HISTORY_BASED, symbol = "history_perf" ) cl = StarpuCodelet( cpu_func = CPU_CODELETS["vector_scal"], # cuda_func = CUDA_CODELETS["vector_scal"], #opencl_func="ocl_matrix_mult", modes = [STARPU_RW], perfmodel = perfmodel ) for i in (1 : 1) t=time_ns() @starpu_sync_tasks begin handles = [hV] task = StarpuTask(cl = cl, handles = handles, cl_arg=(m, k, l)) starpu_task_submit(task) end # @starpu_sync_tasks for task in (1:1) # @starpu_async_cl vector_scal(hV, STARPU_RW, [m, k, l]) # end t=time_ns()-t if (tmin==0 || tmin>t) tmin=t end end end return tmin end function compute_times(io,start_dim, step_dim, stop_dim) for size in (start_dim : step_dim : stop_dim) V = Array(rand(Cfloat, size)) starpu_memory_pin(V) m :: Int32 = 10 k :: Float32 = 2. l :: Float32 = 3. println("INPUT ", V[1:10]) mt = vector_scal_with_starpu(V, m, k, l) starpu_memory_unpin(V) println("OUTPUT ", V[1:10]) println(io,"$size $mt") println("$size $mt") end end io=open(ARGS[1],"w") compute_times(io,1024,1024,4096) close(io) starpu_shutdown()