123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102 |
- #include <stdint.h>
- #include <stdio.h>
- #include <string.h>
- #include <starpu.h>
/*
 * CPU kernel: tiled single-precision matrix product.
 *
 * descr[0], descr[1] and descr[2] are the matrix buffers referred to below as
 * A, B and C. cl_arg points to an int holding the tile edge length ("stride").
 *
 * With element (row, col) of a matrix stored at ptr[row * ld + col], the
 * kernel overwrites C with
 *
 *     C[row][col] = sum over k < nyA of B[row][k] * A[k][col]
 *
 * for row < nyC and col < nxC.
 *
 * Preconditions relied upon but not checked by the loops below:
 *   - stride divides nyC, nyA and nxC (tiles are never clipped);
 *   - stride is a multiple of 4 (rows of C are processed in pairs and the
 *     k dimension is unrolled four times).
 * A non-positive stride leaves C zero-filled and returns.
 */
void cpu_mult(void *descr[], void *cl_arg)
{
    const int stride_arg = *((int *)cl_arg);

    const float *subA = (const float *)STARPU_MATRIX_GET_PTR(descr[0]);
    const float *subB = (const float *)STARPU_MATRIX_GET_PTR(descr[1]);
    float *subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);

    /* size_t keeps the offset arithmetic from wrapping at 32 bits and avoids
     * mixing signed loop counters with the unsigned dimensions. */
    const size_t nxC = STARPU_MATRIX_GET_NX(descr[2]);
    const size_t nyC = STARPU_MATRIX_GET_NY(descr[2]);
    const size_t nyA = STARPU_MATRIX_GET_NY(descr[0]);
    const size_t ldA = STARPU_MATRIX_GET_LD(descr[0]);
    const size_t ldB = STARPU_MATRIX_GET_LD(descr[1]);
    const size_t ldC = STARPU_MATRIX_GET_LD(descr[2]);

    /* Clear C row by row using its leading dimension. Treating C as one
     * contiguous run of nyC*nxC floats is only correct when ldC == nxC;
     * otherwise it would clear padding and leave the last rows untouched. */
    for (size_t row = 0; row < nyC; row++)
    {
        float *c_row = subC + row * ldC;
        for (size_t col = 0; col < nxC; col++)
        {
            c_row[col] = 0.0f;
        }
    }

    /* A zero or negative tile size would never advance the tile loops. */
    if (stride_arg <= 0)
    {
        return;
    }
    const size_t stride = (size_t)stride_arg;

    for (size_t i = 0; i < nyC; i += stride)
    {
        for (size_t k = 0; k < nyA; k += stride)
        {
            for (size_t j = 0; j < nxC; j += stride)
            {
                /* Two rows of the C tile per iteration. */
                for (size_t ii = i; ii < i + stride; ii += 2)
                {
                    float *sC0 = subC + ii * ldC + j;
                    float *sC1 = sC0 + ldC;
                    const float *b_row0 = subB + ii * ldB;
                    const float *b_row1 = b_row0 + ldB;

                    /* Four steps along the shared dimension per iteration. */
                    for (size_t kk = k; kk < k + stride; kk += 4)
                    {
                        const float alpha00 = b_row0[kk];
                        const float alpha01 = b_row0[kk + 1];
                        const float alpha02 = b_row0[kk + 2];
                        const float alpha03 = b_row0[kk + 3];
                        const float alpha10 = b_row1[kk];
                        const float alpha11 = b_row1[kk + 1];
                        const float alpha12 = b_row1[kk + 2];
                        const float alpha13 = b_row1[kk + 3];

                        const float *sA0 = subA + kk * ldA + j;
                        const float *sA1 = sA0 + ldA;
                        const float *sA2 = sA1 + ldA;
                        const float *sA3 = sA2 + ldA;

                        for (size_t jj = 0; jj < stride; jj++)
                        {
                            sC0[jj] += alpha00 * sA0[jj] + alpha01 * sA1[jj] + alpha02 * sA2[jj] + alpha03 * sA3[jj];
                            sC1[jj] += alpha10 * sA0[jj] + alpha11 * sA1[jj] + alpha12 * sA2[jj] + alpha13 * sA3[jj];
                        }
                    }
                }
            }
        }
    }
}
/* Names handed back by starpu_find_function(), one per device kind.
 * The GPU entry is the empty string: this file defines no GPU kernel. */
char *CPU = "cpu_mult";
char *GPU = "";

/*
 * Return the kernel name registered for the given device kind.
 *
 * name   - ignored by this implementation.
 * device - device kind; exactly "gpu" selects the GPU entry, anything else
 *          (including NULL) selects the CPU entry.
 *
 * Returns GPU when device is "gpu", otherwise CPU. The returned pointer
 * refers to a string literal and must not be modified or freed.
 */
char *starpu_find_function(char *name, char *device)
{
    (void)name; /* lookup does not depend on the requested name */

    /* strcmp on a NULL pointer is undefined; treat NULL as "not gpu". */
    if (device != NULL && strcmp(device, "gpu") == 0)
    {
        return GPU;
    }
    return CPU;
}
|