vs_cuda_kernel.cu 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2017,2018 Inria
  4. *
  5. * StarPU is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * StarPU is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  /* This example shows a basic StarPU vector scale application on top of StarPURM with an NVIDIA CUDA kernel */
  17. #include <starpu.h>
  18. #include <starpurm.h>
  /*
   * CUDA kernel: multiplies the first n elements of vector by scalar, in place.
   *
   * Each thread handles at most one element, located at index
   * blockIdx.x*blockDim.x + threadIdx.x; threads whose index is not below n
   * return without touching memory. Only the x dimension of the grid and of
   * the block is used for indexing.
   */
  static __global__ void vector_scale_cuda_kernel(float *vector, unsigned n, float scalar)
  {
  	const unsigned idx = threadIdx.x + blockDim.x*blockIdx.x;

  	/* Threads past the end of the vector have nothing to do. */
  	if (idx >= n)
  		return;

  	vector[idx] = vector[idx] * scalar;
  }
  /*
   * CUDA implementation of the vector scale codelet.
   *
   * cl_buffers[0] is read through the StarPU vector interface macros
   * (STARPU_VECTOR_GET_NX for the element count, STARPU_VECTOR_GET_PTR for the
   * data pointer). cl_arg holds the packed scale factor, extracted with
   * starpu_codelet_unpack_args().
   *
   * Before launching, the function asserts that the hwloc cpuset of the
   * executing worker intersects the cpuset currently selected by StarPURM.
   *
   * The kernel is enqueued on the stream returned by
   * starpu_cuda_get_local_stream(); this function does not synchronize on that
   * stream. Launch errors are reported through STARPU_CUDA_REPORT_ERROR.
   */
  extern "C" void vector_scale_cuda_func(void *cl_buffers[], void *cl_arg)
  {
  	/* Default value, overwritten by the unpack call below. */
  	float scalar = -1.0f;
  	unsigned n = STARPU_VECTOR_GET_NX(cl_buffers[0]);
  	float *vector = (float *)STARPU_VECTOR_GET_PTR(cl_buffers[0]);

  	starpu_codelet_unpack_args(cl_arg, &scalar);

  	{
  		int workerid = starpu_worker_get_id();
  		/* Both bitmaps are released with hwloc_bitmap_free() at the end of this scope. */
  		hwloc_cpuset_t worker_cpuset = starpu_worker_get_hwloc_cpuset(workerid);
  		hwloc_cpuset_t check_cpuset = starpurm_get_selected_cpuset();
  #if 0
  		{
  			int strl1 = hwloc_bitmap_snprintf(NULL, 0, worker_cpuset);
  			char str1[strl1+1];
  			hwloc_bitmap_snprintf(str1, strl1+1, worker_cpuset);
  			int strl2 = hwloc_bitmap_snprintf(NULL, 0, check_cpuset);
  			char str2[strl2+1];
  			hwloc_bitmap_snprintf(str2, strl2+1, check_cpuset);
  			printf("worker[%03d] - task: vector=%p, n=%d, scalar=%lf, worker cpuset = %s, selected cpuset = %s\n", workerid, vector, n, scalar, str1, str2);
  		}
  #endif
  		/* check_cpuset becomes the intersection of the selected and worker cpusets;
  		 * an empty intersection means the task runs on a worker outside the selection. */
  		hwloc_bitmap_and(check_cpuset, check_cpuset, worker_cpuset);
  		assert(!hwloc_bitmap_iszero(check_cpuset));
  		hwloc_bitmap_free(check_cpuset);
  		hwloc_bitmap_free(worker_cpuset);
  	}

  	/* A grid with zero blocks is an invalid launch configuration: nothing to scale. */
  	if (n == 0)
  		return;

  	const unsigned nb_threads_per_block = 64;
  	/* Ceiling division written so that it cannot wrap around, even when n is
  	 * within nb_threads_per_block of UINT_MAX. */
  	const unsigned nb_blocks = n / nb_threads_per_block + ((n % nb_threads_per_block) != 0 ? 1 : 0);

  	vector_scale_cuda_kernel<<<nb_blocks,nb_threads_per_block,0,starpu_cuda_get_local_stream()>>>(vector, n, scalar);

  	/* Kernel launches do not return a status; fetch launch-time errors explicitly. */
  	cudaError_t status = cudaGetLastError();
  	if (status != cudaSuccess)
  		STARPU_CUDA_REPORT_ERROR(status);
  }