5 years ago · 25f0510f91
--- a/julia/mult/cpu_mult.c
+++ b/julia/mult/cpu_mult.c
@@ -0,0 +1,90 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2018                                     Alexis Juven
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+#include <stdint.h>
			
 
				+#include <stdio.h>
			
 
				+#include <string.h>
			
 
				+#include <starpu.h>
			
 
				+/*
			
 
				+ * The codelet is passed 3 matrices, the "descr" union-type field gives a
			
 
				+ * description of the layout of those 3 matrices in the local memory (ie. RAM
			
 
				+ * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
			
 
				+ * registered data with the "matrix" data interface, we use the matrix macros.
			
 
				+ */
			
 
				+void cpu_mult(void *descr[], void *arg)
			
 
				+{
			
 
				+	(void)arg;
			
 
				+	float *subA, *subB, *subC;
			
 
				+	/* .blas.ptr gives a pointer to the first element of the local copy */
			
 
				+	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				+	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
			
 
				+
			
 
				+
			
 
				+	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
			
 
				+	 * is the number of lines that are separated by .blas.ld elements (ld
			
 
				+	 * stands for leading dimension).
			
 
				+	 * NB: in case some filters were used, the leading dimension is not
			
 
				+	 * guaranteed to be the same in main memory (on the original matrix)
			
 
				+	 * and on the accelerator! */
			
 
				+	const uint32_t nxC = STARPU_MATRIX_GET_NX(descr[2]);
			
 
				+	const uint32_t nyC = STARPU_MATRIX_GET_NY(descr[2]);
			
 
				+	const uint32_t nyA = STARPU_MATRIX_GET_NY(descr[0]);
			
 
				+
			
 
				+	const uint32_t ldA = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				+	const uint32_t ldB = STARPU_MATRIX_GET_LD(descr[1]);
			
 
				+	const uint32_t ldC = STARPU_MATRIX_GET_LD(descr[2]);
			
 
				+	/* we assume a FORTRAN-ordering! */
			
 
				+	int i,j,k,ii,jj,kk;
			
 
				+	for (i = 0; i < nyC*nxC; i++) subC[i] = 0;
			
 
				+	//fprintf(stderr,"inside cpu_mult %dx%dx%d %d/%d on %d\n",nyC,nyA,nxC,starpu_worker_get_id(),STARPU_NMAXWORKERS,starpu_worker_get_devid(starpu_worker_get_id()));
			
 
				+	for (i=0;i<nyC;i+=STRIDE) {
			
 
				+		for (k=0;k<nyA;k+=STRIDE) {
			
 
				+			for (j=0;j<nxC;j+=STRIDE) {
			
 
				+				
			
 
				+				for (ii = i; ii < i+STRIDE; ii+=2) {
			
 
				+					float *sC0=subC+ii*ldC+j;
			
 
				+					float *sC1=subC+ii*ldC+ldC+j;
			
 
				+					for (kk = k; kk < k+STRIDE; kk+=4) {
			
 
				+						float alpha00=subB[kk +  ii*ldB];
			
 
				+						float alpha01=subB[kk+1+ii*ldB];
			
 
				+						float alpha10=subB[kk+  ii*ldB+ldB];
			
 
				+						float alpha11=subB[kk+1+ii*ldB+ldB];
			
 
				+						float alpha02=subB[kk+2+ii*ldB];
			
 
				+						float alpha03=subB[kk+3+ii*ldB];
			
 
				+						float alpha12=subB[kk+2+ ii*ldB+ldB];
			
 
				+						float alpha13=subB[kk+3+ii*ldB+ldB];
			
 
				+						float *sA0=subA+kk*ldA+j;
			
 
				+						float *sA1=subA+kk*ldA+ldA+j;
			
 
				+						float *sA2=subA+kk*ldA+2*ldA+j;
			
 
				+						float *sA3=subA+kk*ldA+3*ldA+j;
			
 
				+						for (jj = 0; jj < STRIDE; jj+=1) {
			
 
				+							sC0[jj] += alpha00*sA0[jj]+alpha01*sA1[jj]+alpha02*sA2[jj]+alpha03*sA3[jj];
			
 
				+							sC1[jj] += alpha10*sA0[jj]+alpha11*sA1[jj]+alpha12*sA2[jj]+alpha13*sA3[jj];
			
 
				+						}
			
 
				+					}
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+	//fprintf(stderr,"inside cpu_mult %dx%dx%d\n",nyC,nyA,nxC);
			
 
				+
			
 
				+}
			
 
				+char* CPU = "cpu_mult";
			
 
				+char* GPU = "gpu_mult";
			
 
				+extern char *starpu_find_function(char *name, char *device) {
			
 
				+	if (!strcmp(device,"gpu")) return GPU;
			
 
				+	return CPU;
			
 
				+}
			
--- a/julia/mult/gpu_mult.cu
+++ b/julia/mult/gpu_mult.cu
@@ -0,0 +1,84 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2018                                     Alexis Juven
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+#include <starpu.h>
			
 
				+extern "C" {
			
 
				+#include <starpu_cuda.h>
			
 
				+}
			
 
				+
			
 
				+#include <stdint.h>
			
 
				+#include <stdio.h>
			
 
				+
			
 
				+
			
 
				+__global__ void gpuMultKernel
			
 
				+(
			
 
				+		uint32_t nxC, uint32_t nyC, uint32_t nyA,
			
 
				+		uint32_t ldA, uint32_t ldB, uint32_t ldC,
			
 
				+		float * subA, float * subB, float * subC
			
 
				+)
			
 
				+{
			
 
				+	uint32_t id, i, j, k;
			
 
				+	float sum;
			
 
				+
			
 
				+	id = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				+	i = id % nxC;
			
 
				+	j = id / nxC;
			
 
				+
			
 
				+	if (j >= nyC){
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	sum = 0.;
			
 
				+
			
 
				+	for (k = 0 ; k < nyA ; k++){
			
 
				+		sum += subA[i + k*ldA] * subB[k + j*ldB];
			
 
				+	}
			
 
				+
			
 
				+	subC[i + j*ldC] = sum;
			
 
				+
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+#define THREADS_PER_BLOCK 64
			
 
				+extern "C" void gpu_mult(void * descr[], void * args)
			
 
				+{
			
 
				+
			
 
				+	float * d_subA, * d_subB, * d_subC;
			
 
				+	uint32_t nxC, nyC, nyA;
			
 
				+	uint32_t ldA, ldB, ldC;
			
 
				+	uint32_t nblocks;
			
 
				+
			
 
				+	d_subA = (float *) STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+	d_subB = (float *) STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				+	d_subC = (float *) STARPU_MATRIX_GET_PTR(descr[2]);
			
 
				+
			
 
				+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
			
 
				+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
			
 
				+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
			
 
				+
			
 
				+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
			
 
				+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
			
 
				+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
			
 
				+
			
 
				+	nblocks = (nxC * nyC + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
			
 
				+
			
 
				+	gpuMultKernel
			
 
				+		<<< nblocks, THREADS_PER_BLOCK, 0, NULL /*starpu_cuda_get_local_stream()*/
			
 
				+		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
			
 
				+
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				+
			
 
				+}