Преглед изворни кода

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into ft_checkpoint

Romain LION пре 5 година
родитељ
комит
e54eb2914d

+ 16 - 2
julia/StarPU.jl/src/StarPU.jl

@@ -583,6 +583,7 @@ function starpu_init()
             print(k,">>>>",CPU_CODELETS[k],"\n")
         end
     else
+        @debugprint "generating codelet library"
         system("make generated_tasks.dylib")
         global starpu_tasks_library_handle=Libdl.dlopen("generated_tasks")
     end
@@ -798,15 +799,28 @@ function starpu_task_submit(task :: StarpuTask)
     @starpucall starpu_task_submit Cint (Ptr{Cvoid},) task.c_task
 end
 
+
+"""
+    starpu_modes(x :: Symbol)
+
+Translate the access-mode name `x` (`:STARPU_R`, `:STARPU_W` or `:STARPU_RW`)
+into the access-mode constant of the same name.
+
+Throws an `ArgumentError` for any other symbol, so that a misspelled mode is
+reported instead of being silently treated as `STARPU_W`.
+"""
+function starpu_modes(x :: Symbol)
+    if x === :STARPU_RW
+        return STARPU_RW
+    elseif x === :STARPU_R
+        return STARPU_R
+    elseif x === :STARPU_W
+        return STARPU_W
+    end
+    throw(ArgumentError("Unknown StarPU access mode: $(x)"))
+end
+
 """
     Creates and submits an asynchronous task running cl Codelet function.
     Ex : @starpu_async_cl cl(handle1, handle2)
 """
-macro starpu_async_cl(expr)
+macro starpu_async_cl(expr,modes)
 
     if (!isa(expr, Expr) || expr.head != :call)
         error("Invalid task submit syntax")
     end
+    if (!isa(expr, Expr)||modes.head != :vect)
+        error("Invalid task submit syntax")
+    end
     perfmodel = StarpuPerfmodel(
         perf_type = STARPU_HISTORY_BASED,
         symbol = "history_perf"
@@ -817,7 +831,7 @@ macro starpu_async_cl(expr)
         #cuda_func = "matrix_mult",
         #opencl_func="ocl_matrix_mult",
         ### TODO: CORRECT !
-        modes = [STARPU_R, STARPU_R, STARPU_W],
+        modes = map((x -> starpu_modes(x)),modes.args),
         perfmodel = perfmodel
     )
     handles = Expr(:vect, expr.args[2:end]...)

+ 22 - 0
julia/StarPU.jl/src/compiler/expressions.jl

@@ -93,6 +93,8 @@ end
 struct StarpuExprReturn <: StarpuExpr
     value :: StarpuExpr
 end
+struct StarpuExprBreak <: StarpuExpr  # field-less node standing for a `break` statement
+end
 struct StarpuExprVar <: StarpuExpr
     name :: Symbol
 end
@@ -717,6 +719,26 @@ function apply(func :: Function, expr :: StarpuExprRef)
 end
 
 #======================================================
+                BREAK EXPRESSION
+======================================================#
+
+# Turn a Julia `break` expression into its StarpuExprBreak node.
+# An expression whose head is anything other than :break is rejected.
+function starpu_parse_break(x :: Expr)
+    x.head == :break || error("Invalid \"break\" expression")
+    return StarpuExprBreak()
+end
+
+# Write the node as the bare keyword "break"; `indent` is accepted but not used.
+print(io :: IO, x :: StarpuExprBreak ; indent = 0) = print(io, "break")
+
+# Leaf node: there is no sub-expression to visit, so `func` is applied to the
+# node directly. StarpuExprBreak has no fields, hence `expr` is identical to a
+# freshly constructed StarpuExprBreak().
+function apply(func :: Function, expr :: StarpuExprBreak)
+    return func(expr)
+end
+#======================================================
                 RETURN EXPRESSION
 ======================================================#
 

+ 2 - 0
julia/StarPU.jl/src/compiler/file_generation.jl

@@ -10,6 +10,7 @@ global generated_cpu_kernel_file_name = "PRINT TO STDOUT"
 const cpu_kernel_file_start = "#include <stdio.h>
 #include <stdint.h>
 #include <starpu.h>
+#include <math.h>
 
 static inline long long jlstarpu_max(long long a, long long b)
 {
@@ -30,6 +31,7 @@ static inline long long jlstarpu_interval_size(long long start, long long step,
 const cuda_kernel_file_start = "#include <stdio.h>
 #include <stdint.h>
 #include <starpu.h>
+#include <math.h>
 
 #define THREADS_PER_BLOCK 64
 

+ 1 - 1
julia/StarPU.jl/src/compiler/parsing.jl

@@ -32,7 +32,7 @@ function starpu_parse(x :: Expr)
 
 end
 
-for kw in (:if, :call, :for, :block, :return, :function, :while, :ref)
+for kw in (:if, :call, :for, :block, :return, :function, :while, :ref, :break)
     starpu_parse_key_word_parsing_function[kw] = eval(Symbol(:starpu_parse_, kw))
 end
 

+ 10 - 15
julia/mandelbrot/cpu_mandelbrot.c

@@ -17,11 +17,10 @@ void cpu_mandelbrot(void *descr[], void *cl_arg)
 
         float centerr = params[0];
         float centeri = params[1];
-
         float offset = params[2];
         float dim = params[3];
         float zoom = width * 0.25296875;
-        float conv_limit = 2.0;
+        float diverge = 4.0;
         int max_iter = (width/2) * 0.049715909 * log10(zoom);
 
         int x,y,n;
@@ -32,27 +31,23 @@ void cpu_mandelbrot(void *descr[], void *cl_arg)
                         float ci = centeri + (y+offset - (dim/2))/zoom;
                         float zr = cr;
                         float zi = ci;
-                        float m = zr * zr + zi * zi;
                         
-                        for (n = 0; n <= max_iter && m < conv_limit * conv_limit; n++) {
-
+                        for (n = 0; n <= max_iter; n++) {
+				if (zr*zr + zi*zi>diverge) break;
                                 float tmp = zr*zr - zi*zi + cr;
                                 zi = 2*zr*zi + ci;
                                 zr = tmp;
-                                m = zr*zr + zi*zi;
                         }
-
+			int color;
+			if (n<max_iter)
+				color = round(15.*n/max_iter);
+			else
+				color = 0;
+			pixels[x*ldP + y] = color;
 		}
-		int color;
-		if (n==max_iter) fprintf(stderr,".");
-		else fprintf(stderr,"%d",n);
-		if (n<max_iter)
-			color = round(15.*n/max_iter);
-		else
-			color = 0;
-		pixels[x*ldP + y] = color;
 	}
 }
+
 char* CPU = "cpu_mandelbrot";
 char* GPU = "gpu_mandelbrot";
 extern char *starpu_find_function(char *name, char *device) {

+ 99 - 15
julia/mandelbrot/mandelbrot.jl

@@ -1,30 +1,114 @@
-function mandelbrotjl(pixels ::Matrix{Int64}, centerr ::Float64, centeri ::Float64)
-    height,width = size(pixels)
-    zoom = width * 0.25296875
-    val_diverge = 2.0
-    max_iterations = (width/2) * 0.049715909 * log10(zoom);
+import Libdl
+using StarPU
+using LinearAlgebra
 
-
-    for y = 1:height
+@target STARPU_CPU+STARPU_CUDA
+@codelet function mandelbrot(pixels ::Matrix{Int64}, params ::Matrix{Float32} ) :: Float32
+    height :: Int64 = height(pixels)
+    width :: Int64 = width(pixels)
+    zoom :: Float64 = width * 0.25296875
+    iz :: Float64 = 1. / zoom
+    diverge :: Float32 = 4.0
+    max_iterations :: Float32 = ((width/2) * 0.049715909 * log10(zoom));
+    imi :: Float32 = 1. / max_iterations
+    centerr :: Float32 = params[1,1]
+    centeri :: Float32 = params[2,1]
+    offset :: Float32 = params[3,1]
+    dim :: Float32 = params[4,1]
+    cr :: Float64 = 0.
+    zr :: Float64 = 0.
+    ci :: Float64 = 0.
+    zi :: Float64 = 0.
+    n :: Int64 = 0
+    tmp :: Float64 = 0.
+    @parallel for y = 1:height
         for x = 1:width
-            cr = centerr + (x - (width / 2))/zoom
+            cr = centerr + (x-1 - (dim / 2)) * iz
             zr = cr
-            ci = centeri + (y - (height / 2))/zoom
+            ci = centeri + (y-1+offset - (dim / 2)) * iz
             zi = ci
-
-            n = 0
-            while ((n < max_iterations) && (zr*zr + zi*zi < val_diverge*val_diverge))
+            for n = 0:max_iterations
+                if (zr*zr + zi*zi > diverge)
+                    break
+                end
                 tmp = zr*zr - zi*zi + cr
                 zi = 2*zr*zi + ci
                 zr = tmp
-                n = n+1
             end
             
             if (n < max_iterations)
-                pixels[y,x] = round(255 * n / max_iterations)
+                pixels[y,x] = round(15 * n * imi)
             else
                 pixels[y,x] = 0
             end
         end
     end
-end
+    return 0. :: Float32
+end
+
+@debugprint "starpu_init"
+starpu_init()  # executed when the script is loaded, before the display_time driver call further down
+
+function mandelbrot_with_starpu(A ::Matrix{Int64}, params ::Matrix{Float32}, nslicesx ::Int64)  # run the mandelbrot codelet on A, one task per slice
+    horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesx)  # one filter, nslicesx parts, reused for both handles below
+    @starpu_block let
+	hA, hP = starpu_data_register(A,params)  # one handle per registered array
+	starpu_data_partition(hA,horiz)
+        starpu_data_partition(hP,horiz)
+        
+	@starpu_sync_tasks for taskx in (1 : nslicesx)  # NOTE(review): presumably waits for every task submitted in the loop - confirm
+                @starpu_async_cl mandelbrot(hA[taskx], hP[taskx]) [STARPU_W, STARPU_R]  # pixel slice is written, parameter slice is read
+	end
+    end
+end
+
+# Save `pixels` to `filename` as a plain-text PPM ("P3") image of `width` x `height`.
+# A pixel value v is coloured with entry v+1 of the 16-entry RGB palette below,
+# so every value read must lie in 0:15.
+function pixels2img(pixels ::Matrix{Int64}, width ::Int64, height ::Int64, filename ::String)
+    palette = [[66,30,15],[25,7,26],[9,1,47],[4,4,73],[0,7,100],[12,44,138],[24,82,177],[57,125,209],[134,181,229],[211,236,248],[241,233,191],[248,201,95],[255,170,0],[204,128,0],[153,87,0],[106,52,3]]
+    open(filename, "w") do out
+        write(out, "P3\n$width $height\n255\n")
+        for row in 1:height
+            for col in 1:width
+                # Look the colour up once instead of once per channel.
+                rgb = palette[1 + pixels[row, col]]
+                write(out, "$(rgb[1]) $(rgb[2]) $(rgb[3]) ")
+            end
+            write(out, "\n")
+        end
+    end
+end
+
+# Render a dim x dim image centred on (cr, ci), split into `nslices` tasks,
+# ten times in a row and return the shortest elapsed time in nanoseconds
+# (a UInt64, as produced by time_ns()).
+# The pixels of the last rendering are saved to "out$(dim).ppm".
+function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
+    # Start from the largest UInt64 so the accumulator has the type of
+    # time_ns() from the beginning and needs no "0 means unset" sentinel.
+    tmin = typemax(UInt64)
+
+    pixels = zeros(Int64, dim, dim)
+    # Four Float32 parameters per slice, stacked in a single column:
+    # centre (real part), centre (imaginary part), offset of the slice, image size.
+    params = zeros(Float32, 4*nslices, 1)
+    for i = 0:(nslices-1)
+        params[4*i+1,1] = cr
+        params[4*i+2,1] = ci
+        params[4*i+3,1] = i*dim/nslices
+        params[4*i+4,1] = dim
+    end
+    for _ in 1:10
+        start = time_ns()
+        mandelbrot_with_starpu(pixels, params, nslices)
+        tmin = min(tmin, time_ns() - start)
+    end
+    pixels2img(pixels,dim,dim,"out$(dim).ppm")
+    return tmin
+end
+
+# For each image size in start_dim:step_dim:stop_dim, print one line
+# "<dim> <t>" where <t> is the time returned by min_times divided by the
+# number of pixels (dim * dim).
+function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64)
+    for dim in start_dim:step_dim:stop_dim
+        per_pixel = min_times(cr, ci, dim, nslices) / dim / dim
+        println("$(dim) $(per_pixel)")
+    end
+end
+
+
+display_time(-0.800671,-0.158392,32,32,4096,4)  # centre (cr, ci); image sizes 32:32:4096; 4 slices
+
+@debugprint "starpu_shutdown"
+starpu_shutdown()  # called once the whole size sweep above has returned
+

+ 90 - 0
julia/mult/cpu_mult.c

@@ -0,0 +1,90 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2018                                     Alexis Juven
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <starpu.h>
+/*
+ * The codelet is passed 3 matrices, the "descr" union-type field gives a
+ * description of the layout of those 3 matrices in the local memory (ie. RAM
+ * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
+ * registered data with the "matrix" data interface, we use the matrix macros.
+ */
+/*
+ * CPU implementation of the matrix-product codelet: C = A * B on
+ * single-precision matrices addressed in Fortran (column-major) order.
+ *
+ * descr[0], descr[1], descr[2]: matrix interfaces of A, B and C.
+ * arg: unused.
+ *
+ * C is first cleared, then accumulated tile by tile (STRIDE elements per
+ * tile side), two columns of C and four columns of A at a time.
+ *
+ * NOTE(review): STRIDE is not defined in this file; it is presumably supplied
+ * by the build. The loops do not check bounds inside a tile, so they rely on
+ * the matrix dimensions being multiples of STRIDE and on STRIDE being a
+ * multiple of 4.
+ */
+void cpu_mult(void *descr[], void *arg)
+{
+	(void)arg;
+	float *subA, *subB, *subC;
+	/* Pointers to the first element of the local copy of each matrix */
+	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+
+	/* nx is the number of consecutive elements, ny the number of lines
+	 * that are separated by ld elements (ld stands for leading
+	 * dimension).
+	 * NB: in case some filters were used, the leading dimension is not
+	 * guaranteed to be the same in main memory (on the original matrix)
+	 * and on the accelerator! */
+	const uint32_t nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	const uint32_t nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	const uint32_t nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	const uint32_t ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	const uint32_t ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	const uint32_t ldC = STARPU_MATRIX_GET_LD(descr[2]);
+	/* we assume a FORTRAN-ordering! */
+	int i,j,k,ii,jj,kk;
+	/* Clear C one line at a time: consecutive lines are ldC elements apart,
+	 * which can be more than nxC when C is a piece of a larger matrix, so
+	 * the nxC*nyC elements of C are not necessarily contiguous. */
+	for (i = 0; i < nyC; i++)
+		memset(subC + (size_t)i*ldC, 0, nxC*sizeof(float));
+	for (i=0;i<nyC;i+=STRIDE) {
+		for (k=0;k<nyA;k+=STRIDE) {
+			for (j=0;j<nxC;j+=STRIDE) {
+				
+				/* Two lines of C (ii and ii+1) per iteration */
+				for (ii = i; ii < i+STRIDE; ii+=2) {
+					float *sC0=subC+ii*ldC+j;
+					float *sC1=subC+ii*ldC+ldC+j;
+					/* Four terms of the inner dimension (kk..kk+3) per iteration */
+					for (kk = k; kk < k+STRIDE; kk+=4) {
+						/* alphaXY: element kk+Y of line ii+X of B */
+						float alpha00=subB[kk +  ii*ldB];
+						float alpha01=subB[kk+1+ii*ldB];
+						float alpha10=subB[kk+  ii*ldB+ldB];
+						float alpha11=subB[kk+1+ii*ldB+ldB];
+						float alpha02=subB[kk+2+ii*ldB];
+						float alpha03=subB[kk+3+ii*ldB];
+						float alpha12=subB[kk+2+ ii*ldB+ldB];
+						float alpha13=subB[kk+3+ii*ldB+ldB];
+						/* sAY: line kk+Y of A, starting at element j */
+						float *sA0=subA+kk*ldA+j;
+						float *sA1=subA+kk*ldA+ldA+j;
+						float *sA2=subA+kk*ldA+2*ldA+j;
+						float *sA3=subA+kk*ldA+3*ldA+j;
+						for (jj = 0; jj < STRIDE; jj+=1) {
+							sC0[jj] += alpha00*sA0[jj]+alpha01*sA1[jj]+alpha02*sA2[jj]+alpha03*sA3[jj];
+							sC1[jj] += alpha10*sA0[jj]+alpha11*sA1[jj]+alpha12*sA2[jj]+alpha13*sA3[jj];
+						}
+					}
+				}
+			}
+		}
+	}
+
+}
+char* CPU = "cpu_mult";
+char* GPU = "gpu_mult";
+/*
+ * Return the name of the kernel to use on `device`: the GPU kernel name when
+ * `device` is exactly "gpu", the CPU kernel name for any other string.
+ * `name` is not used.
+ */
+extern char *starpu_find_function(char *name, char *device) {
+	(void)name;
+	return strcmp(device, "gpu") == 0 ? GPU : CPU;
+}

+ 84 - 0
julia/mult/gpu_mult.cu

@@ -0,0 +1,84 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2018                                     Alexis Juven
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+extern "C" {
+#include <starpu_cuda.h>
+}
+
+#include <stdint.h>
+#include <stdio.h>
+
+
+/*
+ * One thread per element of C. Thread number `id` handles the element at
+ * position x = id % nxC along the contiguous (nx) dimension and
+ * y = id / nxC along the other one, and stores there the sum over k < nyA of
+ * subA[x + k*ldA] * subB[k + y*ldB].
+ * Threads whose y falls outside C (y >= nyC) do nothing.
+ */
+__global__ void gpuMultKernel
+(
+		uint32_t nxC, uint32_t nyC, uint32_t nyA,
+		uint32_t ldA, uint32_t ldB, uint32_t ldC,
+		float * subA, float * subB, float * subC
+)
+{
+	const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x;
+	const uint32_t x = id % nxC;
+	const uint32_t y = id / nxC;
+
+	if (y < nyC) {
+		float acc = 0.;
+
+		for (uint32_t k = 0 ; k < nyA ; k++){
+			acc += subA[x + k*ldA] * subB[k + y*ldB];
+		}
+
+		subC[x + y*ldC] = acc;
+	}
+}
+
+
+
+#define THREADS_PER_BLOCK 64
+/*
+ * CUDA implementation of the matrix-product codelet.
+ *
+ * descr[0], descr[1], descr[2]: matrix interfaces of A, B and C.
+ * args: unused.
+ *
+ * Launches gpuMultKernel with one thread per element of C, in blocks of
+ * THREADS_PER_BLOCK threads, and returns once the kernel has finished.
+ * Launch and execution errors are reported with STARPU_CUDA_REPORT_ERROR.
+ */
+extern "C" void gpu_mult(void * descr[], void * args)
+{
+	(void) args;
+
+	float * d_subA, * d_subB, * d_subC;
+	uint32_t nxC, nyC, nyA;
+	uint32_t ldA, ldB, ldC;
+	uint32_t nblocks;
+	cudaStream_t stream;
+	cudaError_t status;
+
+	d_subA = (float *) STARPU_MATRIX_GET_PTR(descr[0]);
+	d_subB = (float *) STARPU_MATRIX_GET_PTR(descr[1]);
+	d_subC = (float *) STARPU_MATRIX_GET_PTR(descr[2]);
+
+	nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	/* Round up so that a partially filled last block is still launched */
+	nblocks = (nxC * nyC + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
+
+	/* Nothing to compute for an empty C; a launch with zero blocks would
+	 * be rejected as an invalid configuration. */
+	if (nblocks == 0)
+		return;
+
+	/* Launch on the very stream that is waited on below, so that the
+	 * synchronisation really covers this kernel. */
+	stream = starpu_cuda_get_local_stream();
+
+	gpuMultKernel
+		<<< nblocks, THREADS_PER_BLOCK, 0, stream
+		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+
+	status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
+
+	status = cudaStreamSynchronize(stream);
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
+
+}