cpu_mult.c 3.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2018 Alexis Juven
  4. *
  5. * StarPU is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * StarPU is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include <stdint.h>
  17. #include <stdio.h>
  18. #include <string.h>
  19. #include <starpu.h>
  20. /*
  21. * The codelet is passed 3 matrices, the "descr" union-type field gives a
  22. * description of the layout of those 3 matrices in the local memory (ie. RAM
  23. * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
  24. * registered data with the "matrix" data interface, we use the matrix macros.
  25. */
  26. void cpu_mult(void *descr[], void *arg)
  27. {
  28. (void)arg;
  29. float *subA, *subB, *subC;
  30. /* .blas.ptr gives a pointer to the first element of the local copy */
  31. subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
  32. subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
  33. subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
  34. /* .blas.nx is the number of rows (consecutive elements) and .blas.ny
  35. * is the number of lines that are separated by .blas.ld elements (ld
  36. * stands for leading dimension).
  37. * NB: in case some filters were used, the leading dimension is not
  38. * guaranteed to be the same in main memory (on the original matrix)
  39. * and on the accelerator! */
  40. const uint32_t nxC = STARPU_MATRIX_GET_NX(descr[2]);
  41. const uint32_t nyC = STARPU_MATRIX_GET_NY(descr[2]);
  42. const uint32_t nyA = STARPU_MATRIX_GET_NY(descr[0]);
  43. const uint32_t ldA = STARPU_MATRIX_GET_LD(descr[0]);
  44. const uint32_t ldB = STARPU_MATRIX_GET_LD(descr[1]);
  45. const uint32_t ldC = STARPU_MATRIX_GET_LD(descr[2]);
  46. /* we assume a FORTRAN-ordering! */
  47. int i,j,k,ii,jj,kk;
  48. for (i = 0; i < nyC*nxC; i++) subC[i] = 0;
  49. //fprintf(stderr,"inside cpu_mult %dx%dx%d %d/%d on %d\n",nyC,nyA,nxC,starpu_worker_get_id(),STARPU_NMAXWORKERS,starpu_worker_get_devid(starpu_worker_get_id()));
  50. for (i=0;i<nyC;i+=STRIDE) {
  51. for (k=0;k<nyA;k+=STRIDE) {
  52. for (j=0;j<nxC;j+=STRIDE) {
  53. for (ii = i; ii < i+STRIDE; ii+=2) {
  54. float *sC0=subC+ii*ldC+j;
  55. float *sC1=subC+ii*ldC+ldC+j;
  56. for (kk = k; kk < k+STRIDE; kk+=4) {
  57. float alpha00=subB[kk + ii*ldB];
  58. float alpha01=subB[kk+1+ii*ldB];
  59. float alpha10=subB[kk+ ii*ldB+ldB];
  60. float alpha11=subB[kk+1+ii*ldB+ldB];
  61. float alpha02=subB[kk+2+ii*ldB];
  62. float alpha03=subB[kk+3+ii*ldB];
  63. float alpha12=subB[kk+2+ ii*ldB+ldB];
  64. float alpha13=subB[kk+3+ii*ldB+ldB];
  65. float *sA0=subA+kk*ldA+j;
  66. float *sA1=subA+kk*ldA+ldA+j;
  67. float *sA2=subA+kk*ldA+2*ldA+j;
  68. float *sA3=subA+kk*ldA+3*ldA+j;
  69. for (jj = 0; jj < STRIDE; jj+=1) {
  70. sC0[jj] += alpha00*sA0[jj]+alpha01*sA1[jj]+alpha02*sA2[jj]+alpha03*sA3[jj];
  71. sC1[jj] += alpha10*sA0[jj]+alpha11*sA1[jj]+alpha12*sA2[jj]+alpha13*sA3[jj];
  72. }
  73. }
  74. }
  75. }
  76. }
  77. }
  78. //fprintf(stderr,"inside cpu_mult %dx%dx%d\n",nyC,nyA,nxC);
  79. }
  80. char* CPU = "cpu_mult";
  81. char* GPU = "gpu_mult";
  82. extern char *starpu_find_function(char *name, char *device) {
  83. if (!strcmp(device,"gpu")) return GPU;
  84. return CPU;
  85. }