Denis Barthou %!s(int64=5) %!d(string=hai) anos
pai
achega
25f0510f91
Modificáronse 2 ficheiros con 174 adicións e 0 borrados
  1. 90 0
      julia/mult/cpu_mult.c
  2. 84 0
      julia/mult/gpu_mult.cu

+ 90 - 0
julia/mult/cpu_mult.c

@@ -0,0 +1,90 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2018                                     Alexis Juven
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <starpu.h>
+/*
+ * The codelet is passed 3 matrices, the "descr" union-type field gives a
+ * description of the layout of those 3 matrices in the local memory (ie. RAM
+ * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
+ * registered data with the "matrix" data interface, we use the matrix macros.
+ */
+void cpu_mult(void *descr[], void *arg)
+{
+	(void)arg;
+	float *subA, *subB, *subC;
+	/* .blas.ptr gives a pointer to the first element of the local copy */
+	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+
+	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
+	 * is the number of lines that are separated by .blas.ld elements (ld
+	 * stands for leading dimension).
+	 * NB: in case some filters were used, the leading dimension is not
+	 * guaranteed to be the same in main memory (on the original matrix)
+	 * and on the accelerator! */
+	const uint32_t nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	const uint32_t nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	const uint32_t nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	const uint32_t ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	const uint32_t ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	const uint32_t ldC = STARPU_MATRIX_GET_LD(descr[2]);
+	/* we assume a FORTRAN-ordering! */
+	int i,j,k,ii,jj,kk;
+	for (i = 0; i < nyC*nxC; i++) subC[i] = 0;
+	//fprintf(stderr,"inside cpu_mult %dx%dx%d %d/%d on %d\n",nyC,nyA,nxC,starpu_worker_get_id(),STARPU_NMAXWORKERS,starpu_worker_get_devid(starpu_worker_get_id()));
+	for (i=0;i<nyC;i+=STRIDE) {
+		for (k=0;k<nyA;k+=STRIDE) {
+			for (j=0;j<nxC;j+=STRIDE) {
+				
+				for (ii = i; ii < i+STRIDE; ii+=2) {
+					float *sC0=subC+ii*ldC+j;
+					float *sC1=subC+ii*ldC+ldC+j;
+					for (kk = k; kk < k+STRIDE; kk+=4) {
+						float alpha00=subB[kk +  ii*ldB];
+						float alpha01=subB[kk+1+ii*ldB];
+						float alpha10=subB[kk+  ii*ldB+ldB];
+						float alpha11=subB[kk+1+ii*ldB+ldB];
+						float alpha02=subB[kk+2+ii*ldB];
+						float alpha03=subB[kk+3+ii*ldB];
+						float alpha12=subB[kk+2+ ii*ldB+ldB];
+						float alpha13=subB[kk+3+ii*ldB+ldB];
+						float *sA0=subA+kk*ldA+j;
+						float *sA1=subA+kk*ldA+ldA+j;
+						float *sA2=subA+kk*ldA+2*ldA+j;
+						float *sA3=subA+kk*ldA+3*ldA+j;
+						for (jj = 0; jj < STRIDE; jj+=1) {
+							sC0[jj] += alpha00*sA0[jj]+alpha01*sA1[jj]+alpha02*sA2[jj]+alpha03*sA3[jj];
+							sC1[jj] += alpha10*sA0[jj]+alpha11*sA1[jj]+alpha12*sA2[jj]+alpha13*sA3[jj];
+						}
+					}
+				}
+			}
+		}
+	}
+	//fprintf(stderr,"inside cpu_mult %dx%dx%d\n",nyC,nyA,nxC);
+
+}
+char* CPU = "cpu_mult";
+char* GPU = "gpu_mult";
+extern char *starpu_find_function(char *name, char *device) {
+	if (!strcmp(device,"gpu")) return GPU;
+	return CPU;
+}

+ 84 - 0
julia/mult/gpu_mult.cu

@@ -0,0 +1,84 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2018                                     Alexis Juven
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+extern "C" {
+#include <starpu_cuda.h>
+}
+
+#include <stdint.h>
+#include <stdio.h>
+
+
+__global__ void gpuMultKernel
+(
+		uint32_t nxC, uint32_t nyC, uint32_t nyA,
+		uint32_t ldA, uint32_t ldB, uint32_t ldC,
+		float * subA, float * subB, float * subC
+)
+{
+	uint32_t id, i, j, k;
+	float sum;
+
+	id = blockIdx.x * blockDim.x + threadIdx.x;
+	i = id % nxC;
+	j = id / nxC;
+
+	if (j >= nyC){
+		return;
+	}
+
+	sum = 0.;
+
+	for (k = 0 ; k < nyA ; k++){
+		sum += subA[i + k*ldA] * subB[k + j*ldB];
+	}
+
+	subC[i + j*ldC] = sum;
+
+}
+
+
+
+#define THREADS_PER_BLOCK 64
+extern "C" void gpu_mult(void * descr[], void * args)
+{
+
+	float * d_subA, * d_subB, * d_subC;
+	uint32_t nxC, nyC, nyA;
+	uint32_t ldA, ldB, ldC;
+	uint32_t nblocks;
+
+	d_subA = (float *) STARPU_MATRIX_GET_PTR(descr[0]);
+	d_subB = (float *) STARPU_MATRIX_GET_PTR(descr[1]);
+	d_subC = (float *) STARPU_MATRIX_GET_PTR(descr[2]);
+
+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	nblocks = (nxC * nyC + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
+
+	gpuMultKernel
+		<<< nblocks, THREADS_PER_BLOCK, 0, NULL /*starpu_cuda_get_local_stream()*/
+		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+}