axpy.jl 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. # StarPU --- Runtime system for heterogeneous multicore architectures.
  2. #
  3. # Copyright (C) 2020 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. #
  5. # StarPU is free software; you can redistribute it and/or modify
  6. # it under the terms of the GNU Lesser General Public License as published by
  7. # the Free Software Foundation; either version 2.1 of the License, or (at
  8. # your option) any later version.
  9. #
  10. # StarPU is distributed in the hope that it will be useful, but
  11. # WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. #
  14. # See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. #
  16. using StarPU
  17. using Printf
# Relative tolerance applied by `check` when comparing each computed element
# of Y against its expected value.
const EPSILON = 1e-6
  19. function check(alpha, X, Y)
  20. for i in 1:length(X)
  21. expected_value = alpha * X[i] + 4.0
  22. if abs(Y[i] - expected_value) > expected_value * EPSILON
  23. error("at ", i, ", ", alpha, "*", X[i], "+4.0=", Y[i], ", expected ", expected_value)
  24. end
  25. end
  26. end
# Target selection for the codelet that follows: CPU and CUDA, going by the
# flag names. NOTE(review): exact semantics of `@target` are defined by the
# StarPU Julia module, not visible here.
@target STARPU_CPU+STARPU_CUDA
# AXPY kernel over one pair of Float32 vectors. The whole work is delegated to
# STARPU_SAXPY; presumably this performs Y <- alpha * X + Y, which is what the
# `check` function in this file verifies (with Y initially 4.0) — confirm
# against the StarPU documentation.
@codelet function axpy(X :: Vector{Float32}, Y :: Vector{Float32}, alpha ::Float32) :: Nothing
    # Arguments: element count, scalar, X, 1, Y, 1. The two literal 1s look
    # like BLAS-style unit strides — TODO confirm.
    STARPU_SAXPY(length(X), alpha, X, 1, Y, 1)
    return
end
  32. function axpy(N, NBLOCKS, alpha, display = true)
  33. X = Array(fill(1.0f0, N))
  34. Y = Array(fill(4.0f0, N))
  35. starpu_memory_pin(X)
  36. starpu_memory_pin(Y)
  37. block_filter = starpu_data_filter(STARPU_VECTOR_FILTER_BLOCK, NBLOCKS)
  38. perfmodel = starpu_perfmodel(
  39. perf_type = starpu_perfmodel_type(STARPU_HISTORY_BASED),
  40. symbol = "history_perf"
  41. )
  42. cl = starpu_codelet(
  43. cpu_func = "axpy",
  44. cuda_func = "axpy",
  45. #cuda_func = STARPU_SAXPY,
  46. modes = [STARPU_R, STARPU_RW],
  47. perfmodel = perfmodel
  48. )
  49. if display
  50. println("BEFORE x[0] = ", X[1])
  51. println("BEFORE y[0] = ", Y[1])
  52. end
  53. t_start = time_ns()
  54. @starpu_block let
  55. hX,hY = starpu_data_register(X, Y)
  56. starpu_data_partition(hX, block_filter)
  57. starpu_data_partition(hY, block_filter)
  58. for b in 1:NBLOCKS
  59. task = starpu_task(cl = cl, handles = [hX[b],hY[b]], cl_arg=(Float32(alpha),),
  60. tag=starpu_tag_t(b))
  61. starpu_task_submit(task)
  62. end
  63. starpu_task_wait_for_all()
  64. end
  65. t_end = time_ns()
  66. timing = (t_end-t_start)/1000
  67. if display
  68. @printf("timing -> %d us %.2f MB/s\n", timing, 3*N*4/timing)
  69. println("AFTER y[0] = ", Y[1], " (ALPHA=", alpha, ")")
  70. end
  71. check(alpha, X, Y)
  72. starpu_memory_unpin(X)
  73. starpu_memory_unpin(Y)
  74. end
  75. function main()
  76. N = 16 * 1024 * 1024
  77. NBLOCKS = 8
  78. alpha = 3.41
  79. starpu_init()
  80. starpu_cublas_init()
  81. # warmup
  82. axpy(10, 1, alpha, false)
  83. axpy(N, NBLOCKS, alpha)
  84. starpu_shutdown()
  85. end
  86. main()