cpu_mult.c 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455
  #include <stddef.h>
  #include <stdint.h>
  #include <starpu.h>
  /*
   * CPU implementation of the matrix-multiplication codelet: C = A * B.
   *
   * descr[0], descr[1] and descr[2] describe the local copies of A, B and C
   * respectively (i.e. their layout in the memory of the unit running the
   * codelet). Since the data were registered with the "matrix" data
   * interface, the STARPU_MATRIX_GET_* accessors are used to read them.
   *
   * For every i < ny(C) and j < nx(C) this writes
   *     C[j + i*ldC] = sum over k < ny(A) of A[j + k*ldA] * B[k + i*ldB]
   * i.e. a FORTRAN (column-major) ordering is assumed.
   *
   * arg is unused.
   *
   * NOTE(review): only nx/ny of C and ny of A are read; the matching
   * dimensions of A and B are assumed to be consistent and are not checked.
   */
  void cpu_mult(void *descr[], void *arg)
  {
      (void)arg;

      /* Pointers to the first element of each local copy. A and B are only
       * read; C is overwritten. */
      const float *subA = (const float *)STARPU_MATRIX_GET_PTR(descr[0]);
      const float *subB = (const float *)STARPU_MATRIX_GET_PTR(descr[1]);
      float *subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);

      /* nx is the number of consecutive elements and ny is the number of
       * lines, which are separated by ld elements (ld stands for leading
       * dimension).
       * NB: in case some filters were used, the leading dimension is not
       * guaranteed to be the same in main memory (on the original matrix)
       * and on the accelerator!
       *
       * Everything is widened to size_t so that the offset computations
       * below (e.g. k * ldA) are not performed in 32-bit arithmetic, where
       * they would silently wrap once the product reaches 2^32. */
      const size_t nxC = STARPU_MATRIX_GET_NX(descr[2]);
      const size_t nyC = STARPU_MATRIX_GET_NY(descr[2]);
      const size_t nyA = STARPU_MATRIX_GET_NY(descr[0]);

      const size_t ldA = STARPU_MATRIX_GET_LD(descr[0]);
      const size_t ldB = STARPU_MATRIX_GET_LD(descr[1]);
      const size_t ldC = STARPU_MATRIX_GET_LD(descr[2]);

      /* we assume a FORTRAN-ordering! */
      for (size_t i = 0; i < nyC; i++)
      {
          for (size_t j = 0; j < nxC; j++)
          {
              float sum = 0.0f;

              for (size_t k = 0; k < nyA; k++)
              {
                  sum += subA[j + k * ldA] * subB[k + i * ldB];
              }

              subC[j + i * ldC] = sum;
          }
      }
  }