shadow_opencl.c 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010 Université de Bordeaux 1
  4. *
  5. * StarPU is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * StarPU is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include "stencil.h"
  17. #include <starpu_opencl.h>
  /*
   * OpenCL source of the "shadow" kernel, assembled by the preprocessor.
   *
   * The kernel replicates data on the X and Y edges of the block so that the
   * domain folds onto itself: each edge layer is overwritten with a copy of the
   * layer found K cells inside the opposite edge.
   *
   * str(x)      turns its argument into a string literal.
   * clsrc(t,k)  expands to the complete kernel source as one string.  t and k
   *             are handed to str() instead of being stringified directly, so
   *             macro arguments are expanded before they are quoted; they end
   *             up in the "#define TYPE" and "#define K" lines at the top of
   *             the kernel text.
   *
   * Every kernel line is its own string literal ending in "\n"; the compiler
   * concatenates adjacent literals into a single source string.  The kernel
   * parameter bz is declared but never referenced in the kernel body.
   */
  #define str(x) #x
  #define clsrc(t,k) \
      "__kernel void\n" \
      "#define TYPE " str(t) "\n" \
      "#define K " str(k) "\n" \
      "shadow( int bz, __global TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i)\n" \
      "{\n" \
      /* Work-item coordinates and strides in X and Y; Z is fixed to idz = 0, stepz = 1. */ \
      "unsigned idx = get_global_id(0);\n" \
      "unsigned idy = get_global_id(1);\n" \
      "//unsigned idz = threadIdx.z + blockIdx.z * blockDim.z;\n" \
      "unsigned idz = 0;\n" \
      "unsigned stepx = get_global_size(0);\n" \
      "unsigned stepy = get_global_size(1);\n" \
      "//unsigned stepz = blockDim.z * gridDim.z;\n" \
      "unsigned stepz = 1;\n" \
      "unsigned x, y, z;\n" \
      /* Work-items with idy == 0: rows K-1 and ny-K receive rows ny-K-1 and K. */ \
      "if (idy == 0)\n" \
      "for (z = i-1 + idz; z < nz-(i-1); z += stepz)\n" \
      "for (x = K + idx; x < nx-K; x += stepx) {\n" \
      "unsigned index = x+z*ldz;\n" \
      "ptr[index+(K-1)*ldy] = ptr[index+(ny-K-1)*ldy];\n" \
      "ptr[index+(ny-K)*ldy] = ptr[index+K*ldy];\n" \
      "}\n" \
      "\n" \
      /* Work-items with idx == 0: columns K-1 and nx-K receive columns nx-K-1 and K. */ \
      "if (idx == 0)\n" \
      "for (z = i-1 + idz; z < nz-(i-1); z += stepz)\n" \
      "for (y = K + idy; y < ny-K; y += stepy) {\n" \
      "unsigned index = y*ldy+z*ldz;\n" \
      "ptr[(K-1)+index] = ptr[(nx-K-1)+index];\n" \
      "ptr[(nx-K)+index] = ptr[K+index];\n" \
      "}\n" \
      "\n" \
      /* Work-item (0,0) only: the four corner cells of every visited Z plane. */ \
      "if (idx == 0 && idy == 0)\n" \
      "for (z = i-1 + idz; z < nz-(i-1); z += stepz) {\n" \
      "unsigned index = z*ldz;\n" \
      "ptr[K-1+(K-1)*ldy+index] = ptr[(nx-K-1)+(ny-K-1)*ldy+index];\n" \
      "ptr[(nx-K)+(K-1)*ldy+index] = ptr[K+(ny-K-1)*ldy+index];\n" \
      "ptr[(K-1)+(ny-K)*ldy+index] = ptr[(nx-K-1)+K*ldy+index];\n" \
      "ptr[(nx-K)+(ny-K)*ldy+index] = ptr[K+K*ldy+index];\n" \
      "}\n" \
      "}"
  /* Source text of the shadow kernel.  clsrc() stringifies the expansions of
     the TYPE and K macros into the "#define TYPE" / "#define K" lines at the
     top of the kernel.  Neither macro is defined in this file
     (NOTE(review): presumably both come from "stencil.h" -- confirm). */
  static const char * src = clsrc(TYPE,K);
  /* Program object shared by the functions below: its address is passed to
     the StarPU load, unload and load_kernel calls. */
  static struct starpu_opencl_program program;
  62. void
  63. opencl_shadow_init(void) {
  64. starpu_opencl_load_opencl_from_string(src, &program);
  65. }
  66. void opencl_shadow_free(void) {
  67. starpu_opencl_unload_opencl(&program);
  68. }
  69. void
  70. opencl_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i)
  71. {
  72. unsigned max_parallelism = 512;
  73. unsigned threads_per_dim_x = max_parallelism;
  74. while (threads_per_dim_x / 2 >= nx)
  75. threads_per_dim_x /= 2;
  76. unsigned threads_per_dim_y = max_parallelism / threads_per_dim_x;
  77. while (threads_per_dim_y / 2 >= ny)
  78. threads_per_dim_y /= 2;
  79. #if 0
  80. unsigned threads_per_dim_z = 4;
  81. size_t dimBlock[] = {threads_per_dim_x, threads_per_dim_y, threads_per_dim_z};
  82. size_t dimGrid[] = {nx / threads_per_dim_x, ny / threads_per_dim_y, nz / threads_per_dim_z};
  83. #else
  84. size_t dimBlock[] = {threads_per_dim_x, threads_per_dim_y, 1};
  85. size_t dimGrid[] = {((nx + threads_per_dim_x-1) / threads_per_dim_x)*threads_per_dim_x, ((ny + threads_per_dim_y-1) / threads_per_dim_y)*threads_per_dim_y, 1};
  86. #endif
  87. int devid,id;
  88. id = starpu_worker_get_id();
  89. devid = starpu_worker_get_devid(id);
  90. cl_kernel kernel;
  91. cl_command_queue cq;
  92. starpu_opencl_load_kernel(&kernel, &cq, &program, "shadow", devid);
  93. clSetKernelArg(kernel, 0, sizeof(bz), &bz);
  94. clSetKernelArg(kernel, 1, sizeof(ptr), &ptr);
  95. clSetKernelArg(kernel, 2, sizeof(nx), &nx);
  96. clSetKernelArg(kernel, 3, sizeof(ny), &ny);
  97. clSetKernelArg(kernel, 4, sizeof(nz), &nz);
  98. clSetKernelArg(kernel, 5, sizeof(ldy), &ldy);
  99. clSetKernelArg(kernel, 6, sizeof(ldz), &ldz);
  100. clSetKernelArg(kernel, 7, sizeof(i), &i);
  101. cl_event ev;
  102. cl_int err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, dimGrid, dimBlock, 0, NULL, &ev);
  103. if (err != CL_SUCCESS)
  104. STARPU_OPENCL_REPORT_ERROR(err);
  105. clWaitForEvents(1, &ev);
  106. clReleaseEvent(ev);
  107. }