Sfoglia il codice sorgente

julia/examples/mult: stride is now a program parameter

Nathalie Furmento 5 anni fa
parent
commit
ac0faff53f

+ 7 - 9
julia/examples/mult/Makefile

@@ -1,9 +1,6 @@
-# tile size. Should be changed in mult.jl as well
-STRIDE=72
-
 # ICC compiler
 #CC =icc
-#CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -DSTRIDE=${STRIDE} -march=native $(shell pkg-config --cflags starpu-1.3)
+#CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -march=native $(shell pkg-config --cflags starpu-1.3)
 # GCC compiler
 CC=gcc
 NVCC=nvcc
@@ -14,7 +11,7 @@ ifeq ($(ENABLE_CUDA),yes)
         LD := ${NVCC}
 endif
 
-CFLAGS = -O3 -g -DSTRIDE=${STRIDE} $(shell pkg-config --cflags starpu-1.3)
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
 CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
 CUDA_CFLAGS = ${CFLAGS}
 LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
@@ -51,13 +48,14 @@ clean:
 	rm -f mult *.so *.o genc_*.c gencuda_*.cu *.dat
 
 # Performance Tests
 # Every .dat rule below runs one benchmark variant with the same StarPU
 # environment (STARPU_NOPENCL=0, STARPU_SCHED=dmda, STARPU_CALIBRATE=1).
 # STRIDE is handed to each program as its first command-line argument.
+STRIDE=72
 # C version: the program's standard output is redirected into the target file.
 cstarpu.dat: mult
-	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./mult > $@
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./mult $(STRIDE) > $@
 # Julia versions: the script receives the stride, then the output file name.
 julia_generatedc.dat: mult.jl
-	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult.jl $@
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult.jl $(STRIDE) $@
 julia_native.dat: mult_native.jl
-	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult_native.jl $@
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult_native.jl $(STRIDE) $@
 # Same mult.jl script, run with JULIA_TASK_LIB set to ${EXTERNLIB}.
 julia_calllib.dat: ${EXTERNLIB} mult.jl
-	JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult.jl julia_calllib.dat
+	JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mult.jl $(STRIDE) julia_calllib.dat
 
 # Aggregate target: depends on all four result files above.
 # NOTE(review): no .PHONY declaration for "test" is visible in this hunk - confirm one exists elsewhere.
 test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat

+ 21 - 11
julia/examples/mult/cpu_mult.c

@@ -13,10 +13,12 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
+
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
 #include <starpu.h>
+
 /*
  * The codelet is passed 3 matrices, the "descr" union-type field gives a
  * description of the layout of those 3 matrices in the local memory (ie. RAM
@@ -25,14 +27,16 @@
  */
 void cpu_mult(void *descr[], void *arg)
 {
-	(void)arg;
+	int stride;
 	float *subA, *subB, *subC;
+
+	stride = *((int *)arg);
+
 	/* .blas.ptr gives a pointer to the first element of the local copy */
 	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
 	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
 	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
 
-
 	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
 	 * is the number of lines that are separated by .blas.ld elements (ld
 	 * stands for leading dimension).
@@ -50,14 +54,18 @@ void cpu_mult(void *descr[], void *arg)
 	int i,j,k,ii,jj,kk;
 	for (i = 0; i < nyC*nxC; i++) subC[i] = 0;
 	//fprintf(stderr,"inside cpu_mult %dx%dx%d %d/%d on %d\n",nyC,nyA,nxC,starpu_worker_get_id(),STARPU_NMAXWORKERS,starpu_worker_get_devid(starpu_worker_get_id()));
-	for (i=0;i<nyC;i+=STRIDE) {
-		for (k=0;k<nyA;k+=STRIDE) {
-			for (j=0;j<nxC;j+=STRIDE) {
-				
-				for (ii = i; ii < i+STRIDE; ii+=2) {
+	for (i=0;i<nyC;i+=stride)
+	{
+		for (k=0;k<nyA;k+=stride)
+		{
+			for (j=0;j<nxC;j+=stride)
+			{
+				for (ii = i; ii < i+stride; ii+=2)
+				{
 					float *sC0=subC+ii*ldC+j;
 					float *sC1=subC+ii*ldC+ldC+j;
-					for (kk = k; kk < k+STRIDE; kk+=4) {
+					for (kk = k; kk < k+stride; kk+=4)
+					{
 						float alpha00=subB[kk +  ii*ldB];
 						float alpha01=subB[kk+1+ii*ldB];
 						float alpha10=subB[kk+  ii*ldB+ldB];
@@ -70,7 +78,8 @@ void cpu_mult(void *descr[], void *arg)
 						float *sA1=subA+kk*ldA+ldA+j;
 						float *sA2=subA+kk*ldA+2*ldA+j;
 						float *sA3=subA+kk*ldA+3*ldA+j;
-						for (jj = 0; jj < STRIDE; jj+=1) {
+						for (jj = 0; jj < stride; jj+=1)
+						{
 							sC0[jj] += alpha00*sA0[jj]+alpha01*sA1[jj]+alpha02*sA2[jj]+alpha03*sA3[jj];
 							sC1[jj] += alpha10*sA0[jj]+alpha11*sA1[jj]+alpha12*sA2[jj]+alpha13*sA3[jj];
 						}
@@ -80,11 +89,12 @@ void cpu_mult(void *descr[], void *arg)
 		}
 	}
 	//fprintf(stderr,"inside cpu_mult %dx%dx%d\n",nyC,nyA,nxC);
-
 }
+
 char* CPU = "cpu_mult"; /* name returned for any device other than "gpu" */
 char* GPU = "gpu_mult"; /* name returned when the device is "gpu" */
 /*
  * Pick the implementation name matching a device string.
  * name is not read by this function; device is compared against "gpu".
  * Returns GPU when device equals "gpu", CPU otherwise.
  */
-extern char *starpu_find_function(char *name, char *device) {
+extern char *starpu_find_function(char *name, char *device)
+{
 	if (!strcmp(device,"gpu")) return GPU;
 	return CPU;
 }

+ 52 - 58
julia/examples/mult/mult.c

@@ -1,10 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2018                                     Alexis Juven
- * Copyright (C) 2012,2013                                Inria
- * Copyright (C) 2009-2011,2013-2015                      Université de Bordeaux
- * Copyright (C) 2010                                     Mehdi Juhoor
- * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2018       Alexis Juven
+ * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -40,8 +37,6 @@
 
 #include <starpu.h>
 
-
-
 /*
  * That program should compute C = A * B
  *
@@ -63,43 +58,32 @@
 
  */
 
-
-
-
-
 //void gpu_mult(void **, void *);
 void cpu_mult(void **, void *);
 
-
 static struct starpu_perfmodel model =
 {
 	/* Performance model referenced by the codelet below:
 	 * type STARPU_HISTORY_BASED, identified by the symbol "history_perf". */
-		.type = STARPU_HISTORY_BASED,
-		.symbol = "history_perf"
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "history_perf"
 };
 
 static struct starpu_codelet cl =
 {
 	/* Codelet for the matrix product: a single CPU implementation (cpu_mult),
 	 * the CUDA entry is commented out. It takes 3 buffers with access modes
 	 * read, read, write (in that order) and uses the performance model above. */
-		.cpu_funcs = {cpu_mult},
-		.cpu_funcs_name = {"cpu_mult"},
-		//.cuda_funcs = {gpu_mult},
-		.nbuffers = 3,
-		.modes = {STARPU_R, STARPU_R, STARPU_W},
-		.model = &model
+	.cpu_funcs = {cpu_mult},
+	.cpu_funcs_name = {"cpu_mult"},
+	//.cuda_funcs = {gpu_mult},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_W},
+	.model = &model
 };
 
-
-void multiply_with_starpu(float *A, float *B, float *C,  unsigned xdim,  unsigned ydim,  unsigned zdim, unsigned nslicesx, unsigned nslicesy)
+void multiply_with_starpu(float *A, float *B, float *C,  unsigned xdim,  unsigned ydim,  unsigned zdim, unsigned nslicesx, unsigned nslicesy, int stride)
 {
 	starpu_data_handle_t A_handle, B_handle, C_handle;
 
-
-	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A,
-			ydim, ydim, zdim, sizeof(float));
-	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B,
-			zdim, zdim, xdim, sizeof(float));
-	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C,
-			ydim, ydim, xdim, sizeof(float));
-
+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A, ydim, ydim, zdim, sizeof(float));
+	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B, zdim, zdim, xdim, sizeof(float));
+	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C, ydim, ydim, xdim, sizeof(float));
 
 	struct starpu_data_filter vert =
 	{
@@ -113,31 +97,31 @@ void multiply_with_starpu(float *A, float *B, float *C,  unsigned xdim,  unsigne
 			.nchildren = nslicesy
 	};
 
-
 	starpu_data_partition(B_handle, &vert);
 	starpu_data_partition(A_handle, &horiz);
 	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
 
 	unsigned taskx, tasky;
 
-	for (taskx = 0; taskx < nslicesx; taskx++){
-		for (tasky = 0; tasky < nslicesy; tasky++){
-
+	for (taskx = 0; taskx < nslicesx; taskx++)
+	{
+		for (tasky = 0; tasky < nslicesy; tasky++)
+		{
 			struct starpu_task *task = starpu_task_create();
 
 			task->cl = &cl;
 			task->handles[0] = starpu_data_get_sub_data(A_handle, 1, tasky);
 			task->handles[1] = starpu_data_get_sub_data(B_handle, 1, taskx);
 			task->handles[2] = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
+			task->cl_arg = &stride;
+			task->cl_arg_size = sizeof(stride);
 
 			if (starpu_task_submit(task)!=0) fprintf(stderr,"submit task error\n");
-
 		}
 	}
 
 	starpu_task_wait_for_all();
 
-
 	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
 	starpu_data_unpartition(B_handle, STARPU_MAIN_RAM);
 	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
@@ -145,31 +129,27 @@ void multiply_with_starpu(float *A, float *B, float *C,  unsigned xdim,  unsigne
 	starpu_data_unregister(A_handle);
 	starpu_data_unregister(B_handle);
 	starpu_data_unregister(C_handle);
-
 }
 
-
-
 /*
  * Fill a width x height float buffer with values from starpu_drand48().
  * The element for (i, j), with i in [0, width) and j in [0, height), is
  * written at index j + i*height, so m must hold width*height floats.
  */
 void init_rand(float * m, unsigned width, unsigned height)
 {
 	unsigned i,j;
 
-	for (j = 0 ; j < height ; j++){
-		for (i = 0 ; i < width ; i++){
+	for (j = 0 ; j < height ; j++)
+	{
+		for (i = 0 ; i < width ; i++)
+		{
 			m[j+i*height] = (float)(starpu_drand48());
 		}
 	}
 }
 
-
 /*
  * Set the first width*height floats of m to all-zero bytes using memset.
  */
 void init_zero(float * m, unsigned width, unsigned height)
 {
 	memset(m, 0, sizeof(float) * width * height);
 }
 
-
-
-double min_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, unsigned nsclicesx, unsigned nsclicesy)
+double min_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, unsigned nsclicesx, unsigned nsclicesy, int stride)
 {
 	unsigned i;
 
@@ -179,8 +159,8 @@ double min_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, u
 
 	double exec_times=-1;
 
-	for (i = 0 ; i < nb_test ; i++){
-
+	for (i = 0 ; i < nb_test ; i++)
+	{
 		double start, stop, exec_t;
 
 		init_rand(A, zdim, ydim);
@@ -188,7 +168,7 @@ double min_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, u
 		init_zero(C, xdim, ydim);
 
 		start = starpu_timing_now();
-		multiply_with_starpu(A, B, C, xdim, ydim, zdim, nsclicesx, nsclicesy);
+		multiply_with_starpu(A, B, C, xdim, ydim, zdim, nsclicesx, nsclicesy, stride);
 		stop = starpu_timing_now();
 
 		exec_t = (stop - start)*1.e3; // Put in ns instead of us
@@ -201,34 +181,48 @@ double min_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim, u
 	return exec_times;
 }
 
-
-void display_times(unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nb_tests, unsigned nsclicesx, unsigned nsclicesy)
+void display_times(unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nb_tests, unsigned nsclicesx, unsigned nsclicesy, int stride)
 {
 	/*
 	 * For every size dim from start_dim to stop_dim (inclusive), stepping by
 	 * step_dim, call min_time() on a dim x dim x dim problem and print one
 	 * line on stdout with two values:
 	 *   1. dim*dim*4*3/1024/1024 (presumably the size of three dim x dim
 	 *      float matrices in MiB - TODO confirm),
 	 *   2. (2*dim-1)*dim*dim divided by the value returned by min_time().
 	 * nb_tests, nsclicesx, nsclicesy and stride are forwarded to min_time().
 	 *
 	 * NOTE(review): the loop cannot terminate when step_dim is 0. main sets
 	 * step_dim to 4*stride and its visible check (stride % 4 != 0) lets a
 	 * stride of 0 through - confirm whether 0 should be rejected there.
 	 */
 	unsigned dim;
 
-	for (dim = start_dim ; dim <= stop_dim ; dim += step_dim){
-		double t = min_time(nb_tests, dim, dim, dim, nsclicesx, nsclicesy);
+	for (dim = start_dim ; dim <= stop_dim ; dim += step_dim)
+	{
+		double t = min_time(nb_tests, dim, dim, dim, nsclicesx, nsclicesy, stride);
 		printf("%f %f\n", dim*dim*4.*3./1024./1024, (2.*dim-1.)*dim*dim/t);
 	}
-
 }
 
+#ifdef STARPU_QUICK_CHECK
 /* Stride used by main when no command-line argument is given:
  * 4 when STARPU_QUICK_CHECK is defined, 72 otherwise. */
+#define STRIDE_DEFAULT 4
+#else
+#define STRIDE_DEFAULT 72
+#endif
 
 int main(int argc, char * argv[])
 {
-	if (starpu_init(NULL) != EXIT_SUCCESS){
+	int stride=STRIDE_DEFAULT;
+	if (argc >= 2)
+		stride = atoi(argv[1]);
+	if (stride % 4 != 0)
+	{
+		fprintf(stderr, "STRIDE must be a multiple of 4 (%d)\n", stride);
+		return -1;
+	}
+
+	if (starpu_init(NULL) != EXIT_SUCCESS)
+	{
 		fprintf(stderr, "ERROR\n");
 		return 77;
 	}
 
-	unsigned start_dim = 16*STRIDE;
-	unsigned step_dim = 4*STRIDE;
-	unsigned stop_dim = 4096;
+	unsigned start_dim = 16*stride;
+	unsigned step_dim = 4*stride;
+	unsigned stop_dim = 128*stride;
 	unsigned nb_tests = 10;
 	unsigned nsclicesx = 2;
 	unsigned nsclicesy = 2;
 
-	display_times(start_dim, step_dim, stop_dim, nb_tests, nsclicesx, nsclicesy);
+	display_times(start_dim, step_dim, stop_dim, nb_tests, nsclicesx, nsclicesy, stride);
 
 	starpu_shutdown();
 

+ 13 - 10
julia/examples/mult/mult.jl

@@ -2,9 +2,6 @@ import Libdl
 using StarPU
 using LinearAlgebra
 
-#shoud be the same as in the makefile
-const STRIDE = 72
-
 @target STARPU_CPU+STARPU_CUDA
 @codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Nothing
 
@@ -59,7 +56,7 @@ end
 
 starpu_init()
 
-function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
+function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy, stride)
     scale= 3
     tmin=0
     vert = StarpuDataFilter(STARPU_MATRIX_FILTER_VERTICAL_BLOCK, nslicesx)
@@ -88,7 +85,7 @@ function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: M
                 for taskx in (1 : nslicesx)
                     for tasky in (1 : nslicesy)
                         handles = [hA[tasky], hB[taskx], hC[taskx, tasky]]
-                        task = StarpuTask(cl = cl, handles = handles)
+                        task = StarpuTask(cl = cl, handles = handles, cl_arg=(stride))
                         starpu_task_submit(task)
                         #@starpu_async_cl matrix_mult(hA[tasky], hB[taskx], hC[taskx, tasky])
                     end
@@ -123,12 +120,12 @@ function approximately_equals(
     return true
 end
 
-function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy, stride)
     for dim in (start_dim : step_dim : stop_dim)
         A = Array(rand(Cfloat, dim, dim))
         B = Array(rand(Cfloat, dim, dim))
         C = zeros(Float32, dim, dim)
-        mt =  multiply_with_starpu(A, B, C, nslicesx, nslicesy)
+        mt =  multiply_with_starpu(A, B, C, nslicesx, nslicesy, stride)
         flops = (2*dim-1)*dim*dim/mt
         size=dim*dim*4*3/1024/1024
         println(io,"$size $flops")
@@ -136,9 +133,15 @@ function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
     end
 end
 
-
-io=open(ARGS[1],"w")
-compute_times(io,16*STRIDE,4*STRIDE,4096,2,2)
 # Command line: <stride> <output file>.
 # With fewer than two arguments the script falls back to stride 4 and
 # writes to "x.dat"; otherwise ARGS[1] is parsed as an Int stride and
 # ARGS[2] is used as the output file name.
+if size(ARGS, 1) < 2
+    stride=4
+    filename="x.dat"
+else
+    stride=parse(Int, ARGS[1])
+    filename=ARGS[2]
+end
 # Sizes run from 16*stride to 128*stride in steps of 4*stride, with 2x2 slices.
+io=open(filename,"w")
+compute_times(io,16*stride,4*stride,128*stride,2,2,stride)
 close(io)
 
 starpu_shutdown()

+ 12 - 9
julia/examples/mult/mult_native.jl

@@ -2,10 +2,7 @@ import Libdl
 using StarPU
 using LinearAlgebra
 
-#shoud be the same as in the makefile
-const STRIDE = 72
-
-function multiply_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
+function multiply_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy, stride)
     tmin = 0
     for i in (1 : 10 )
         t=time_ns()
@@ -19,12 +16,12 @@ function multiply_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :
 end
 
 
-function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
+function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy, stride)
     for dim in (start_dim : step_dim : stop_dim)
         A = Array(rand(Cfloat, dim, dim))
         B = Array(rand(Cfloat, dim, dim))
         C = zeros(Float32, dim, dim)
-        mt =  multiply_without_starpu(A, B, C, nslicesx, nslicesy)
+        mt =  multiply_without_starpu(A, B, C, nslicesx, nslicesy, stride)
         flops = (2*dim-1)*dim*dim/mt
         size=dim*dim*4*3/1024/1024
         println(io,"$size $flops")
@@ -32,8 +29,14 @@ function compute_times(io,start_dim, step_dim, stop_dim, nslicesx, nslicesy)
     end
 end
 
-
-io=open(ARGS[1],"w")
-compute_times(io,16*STRIDE,4*STRIDE,4096,2,2)
 # Command line: <stride> <output file>.
 # With fewer than two arguments the script falls back to stride 4 and
 # writes to "x.dat"; otherwise ARGS[1] is parsed as an Int stride and
 # ARGS[2] is used as the output file name.
+if size(ARGS, 1) < 2
+    stride=4
+    filename="x.dat"
+else
+    stride=parse(Int, ARGS[1])
+    filename=ARGS[2]
+end
 # Sizes run from 16*stride to 128*stride in steps of 4*stride, with 2x2 slices.
+io=open(filename,"w")
+compute_times(io,16*stride,4*stride,128*stride,2,2,stride)
 close(io)