瀏覽代碼

julia: Add native starpu and native julia versions of Mandelbrot.

phuchant 5 年之前
父節點
當前提交
c96b2c2dda
共有 5 個文件被更改,包括 262 次插入237 次删除
  1. 47 0
      julia/mandelbrot/Makefile
  2. 28 24
      julia/mandelbrot/cpu_mandelbrot.c
  3. 0 35
      julia/mandelbrot/makefile
  4. 81 178
      julia/mandelbrot/mandelbrot.c
  5. 106 0
      julia/mandelbrot/mandelbrot_native.jl

+ 47 - 0
julia/mandelbrot/Makefile

@@ -0,0 +1,47 @@
+CC=gcc
+CFLAGS += -Wall -Wextra -O3 -mavx -mfma -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3) -lm
+EXTERNLIB=extern_tasks.so
+GENERATEDLIB=generated_tasks.so
+#OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+OBJECTS=$(wildcard gen*.c)
+LIBPATH=${PWD}/../StarPU.jl/lib
+
+all: ${EXTERNLIB}
+
+mandelbrot: mandelbrot.c cpu_mandelbrot.o #gpu_mandelbrot.o
+	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
+
+gpu_mandelbrot.o: gpu_mandelbrot.cu
+	nvcc -c $(CFLAGS) $^ -o $@
+
+%.o: %.c
+	$(CC) -c $(CFLAGS) $^ -o $@
+
+${EXTERNLIB}: cpu_mandelbrot.c
+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+
+gpu_mandelbrot.so: gpu_mandelbrot.o
+	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+
+cpu_mandelbrot_sa: cpu_mandelbrot_sa.o
+	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
+
+${GENERATEDLIB}: ${OBJECTS}
+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+
+clean:
+	rm -f mandelbrot *.so *.o c_*.genc gencuda_*.cu *.dat
+
+# Performance Tests
+cstarpu.dat: mandelbrot
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./mandelbrot -0.800671 -0.158392 32 32 4096 4 > $@
+julia_generatedc.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot.jl $@
+julia_native.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot_native.jl $@
+julia_calllib.dat: ${EXTERNLIB}
+	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot.jl julia_calllib.dat
+
+test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat

+ 28 - 24
julia/mandelbrot/cpu_mandelbrot.c

@@ -4,46 +4,50 @@
 
 void cpu_mandelbrot(void *descr[], void *cl_arg)
 {
-        long long int *pixels;
-	float *params;
+        long long *pixels;
+        float *params;
 
         pixels = (long long int *)STARPU_MATRIX_GET_PTR(descr[0]);
-	params = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
-
-        int width = STARPU_MATRIX_GET_NX(descr[0]);
-        int height = STARPU_MATRIX_GET_NY(descr[0]);
-        
-        int ldP = STARPU_MATRIX_GET_LD(descr[0]);
+        params = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
 
+        long long width = STARPU_MATRIX_GET_NY(descr[0]);
+        long long height = STARPU_MATRIX_GET_NX(descr[0]);
+        double zoom = width * 0.25296875;
+        double iz = 1. / zoom;
+        float diverge = 4.0;
+        float max_iterations = (width/2) * 0.049715909 * log10(zoom);
+        float imi = 1. / max_iterations;
         float centerr = params[0];
         float centeri = params[1];
         float offset = params[2];
         float dim = params[3];
-        float zoom = width * 0.25296875;
-        float diverge = 4.0;
-        int max_iter = (width/2) * 0.049715909 * log10(zoom);
+        double cr = 0;
+        double zr = 0;
+        double ci = 0;
+        double zi = 0;
+        long long n = 0;
+        double tmp = 0;
+        int ldP = STARPU_MATRIX_GET_LD(descr[0]);
 
-        int x,y,n;
+        long long x,y;
 
         for (y = 0; y < height; y++){
                 for (x = 0; x < width; x++){
-                        float cr = centerr + (x - (dim/2))/zoom;
-                        float ci = centeri + (y+offset - (dim/2))/zoom;
-                        float zr = cr;
-                        float zi = ci;
-                        
-                        for (n = 0; n <= max_iter; n++) {
+                        cr = centerr + (x - (dim/2)) * iz;
+			zr = cr;
+                        ci = centeri + (y+offset - (dim/2)) * iz;
+                        zi = ci;
+
+                        for (n = 0; n <= max_iterations; n++) {
 				if (zr*zr + zi*zi>diverge) break;
-                                float tmp = zr*zr - zi*zi + cr;
+                                tmp = zr*zr - zi*zi + cr;
                                 zi = 2*zr*zi + ci;
                                 zr = tmp;
                         }
-			int color;
-			if (n<max_iter)
-				color = round(15.*n/max_iter);
+			if (n<max_iterations)
+				pixels[y +x*ldP] = round(15.*n*imi);
 			else
-				color = 0;
-			pixels[x*ldP + y] = color;
+				pixels[y +x*ldP] = 0;
 		}
 	}
 }

+ 0 - 35
julia/mandelbrot/makefile

@@ -1,35 +0,0 @@
-# GCC compiler
-CC=gcc-9
-CFLAGS += -O3 -mavx -mfma -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
-
-LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
-EXTERNLIB=extern_tasks.dylib
-GENERATEDLIB=generated_tasks.dylib
-OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
-LIBPATH=${PWD}/../StarPU.jl/lib
-
-all: ${EXTERNLIB} 
-
-mult: mult.c cpu_mult.o #gpu_mult.o 
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)	
-
-gpu_mult.o: gpu_mult.cu
-	nvcc -c $(CFLAGS) $^ -o $@
-
-%.o: %.c
-	$(CC) -c $(CFLAGS) $^ -o $@
-
-${EXTERNLIB}: cpu_mandelbrot.o
-	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@  
-
-gpu_mult.so: gpu_mult.o
-	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
-
-${GENERATEDLIB}: ${OBJECTS}
-	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@
-
-clean:
-	rm *.so *.o *.dylib c_*.genc gencuda_*.cu *.dat
-
-
-

+ 81 - 178
julia/mandelbrot/mandelbrot.c

@@ -16,43 +16,33 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <starpu.h>
-#include "../display.h"
 
 void cpu_mandelbrot(void **, void *);
 void gpu_mandelbrot(void **, void *);
 
-struct Params
+static struct starpu_perfmodel model =
 {
-	float cr;
-	float ci;
-	unsigned taskx;
-	unsigned tasky;
-	unsigned width;
-	unsigned height;
+		.type = STARPU_HISTORY_BASED,
+		.symbol = "history_perf"
 };
 
-
-
-struct starpu_codelet cl =
+static struct starpu_codelet cl =
 {
 	.cpu_funcs = {cpu_mandelbrot},
-	.cuda_funcs = {gpu_mandelbrot},
-	.nbuffers = 1,
-	.modes = {STARPU_RW}
+	//.cuda_funcs = {gpu_mandelbrot},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_R},
+	.model = &model
 };
 
 
-void mandelbrot_with_starpu(int *pixels, float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy)
+void mandelbrot_with_starpu(long long *pixels, float *params, long long dim, long long nslicesx)
 {
-	starpu_data_handle_t p_handle;
-
-	starpu_matrix_data_register(&p_handle, STARPU_MAIN_RAM, (uintptr_t)pixels, width, width, height, sizeof(int));
+	starpu_data_handle_t pixels_handle;
+	starpu_data_handle_t params_handle;
 
-	struct starpu_data_filter vert =
-	{
-		.filter_func = starpu_matrix_filter_vertical_block,
-		.nchildren = nslicesy
-	};
+	starpu_matrix_data_register(&pixels_handle, STARPU_MAIN_RAM, (uintptr_t)pixels, dim, dim, dim, sizeof(long long));
+	starpu_matrix_data_register(&params_handle, STARPU_MAIN_RAM, (uintptr_t)params, 4*nslicesx, 4*nslicesx, 1, sizeof(float));
 
 	struct starpu_data_filter horiz =
 	{
@@ -60,179 +50,100 @@ void mandelbrot_with_starpu(int *pixels, float cr, float ci, unsigned width, uns
 		.nchildren = nslicesx
 	};
 
-	starpu_data_map_filters(p_handle, 2, &vert, &horiz);
+	starpu_data_partition(pixels_handle, &horiz);
+	starpu_data_partition(params_handle, &horiz);
 
-	unsigned taskx, tasky;
-
-	struct Params *params = malloc(nslicesx*nslicesy*sizeof(struct Params));
+	long long taskx;
 
 	for (taskx = 0; taskx < nslicesx; taskx++){
-		for (tasky = 0; tasky < nslicesy; tasky++){
-			struct starpu_task *task = starpu_task_create();
-			
-			task->cl = &cl;
-			task->handles[0] = starpu_data_get_sub_data(p_handle, 2, tasky, taskx);
-			struct Params param = {cr, ci, taskx, tasky, width, height};
-
-			params[taskx + tasky*nslicesx] = param;
-
-			task->cl_arg = (params + taskx + tasky * nslicesx);
-			task->cl_arg_size = sizeof(struct Params);
-			
-			starpu_task_submit(task);
-		}
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &cl;
+		task->handles[0] = starpu_data_get_child(pixels_handle, taskx);
+		task->handles[1] = starpu_data_get_child(params_handle, taskx);
+		if (starpu_task_submit(task)!=0) fprintf(stderr,"submit task error\n");
 	}
-	starpu_task_wait_for_all();
 
-	starpu_data_unpartition(p_handle, STARPU_MAIN_RAM);
+	starpu_task_wait_for_all();
 
-	starpu_data_unregister(p_handle);
+	starpu_data_unpartition(pixels_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(params_handle, STARPU_MAIN_RAM);
 
-	free(params);
+	starpu_data_unregister(pixels_handle);
+	starpu_data_unregister(params_handle);
 }
 
-void init_zero(int * pixels, unsigned width, unsigned height)
+void pixels2img(long long *pixels, long long width, long long height, const char *filename)
 {
-	unsigned i,j;
-	for (i = 0; i < height; i++){
-		for (j = 0; j < width; j++){
-			pixels[j + i*width] = 0;
-		}
-	}
-}
+  FILE *fp = fopen(filename, "w");
+  if (!fp)
+    return;
 
-void sort(double *arr, unsigned nbr_tests)
-{
-	unsigned j;
-	
-	int is_sort = 0;
-	
-	while (!is_sort){
-
-		is_sort = 1;
-		
-		for (j = 0; j < nbr_tests - 1; j++){
-			if (arr[j] > arr[j+1]){
-				is_sort = 0;
-				double tmp = arr[j];
-				arr[j] = arr[j+1];
-				arr[j+1] = tmp;
-			}
-		}
-	}
-}
-double median_time(float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, unsigned nbr_tests)
-{
-	int *Pixels = malloc(width*height*sizeof(int));
-	
-	unsigned i;
+  int MAPPING[16][3] = {{66,30,15},{25,7,26},{9,1,47},{4,4,73},{0,7,100},{12,44,138},{24,82,177},{57,125,209},{134,181,229},{211,236,248},{241,233,191},{248,201,95},{255,170,0},{204,128,0},{153,87,0},{106,52,3}};
 
-	double exec_times[nbr_tests];
+  fprintf(fp, "P3\n%lld %lld\n255\n", width, height);
+  long long i, j;
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      fprintf(fp, "%d %d %d ", MAPPING[pixels[j*width+i]][0], MAPPING[pixels[j*width+i]][1], MAPPING[pixels[j*width+i]][2]);
+    }
+  }
 
-	double start, stop, exec_t;
-	for (i = 0; i < nbr_tests; i++){
-		init_zero(Pixels, width, height);
-		
-		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
-		mandelbrot_with_starpu(Pixels, cr, ci, width, height, nslicesx, nslicesy);
-		stop = starpu_timing_now();
-		
-		exec_t = (stop-start)/1.e6;
-		exec_times[i] = exec_t;
-	}
-	char filename[30];
-	sprintf(filename, "PPM/mandelbrot%d.ppm", width);
-	printf("%s\n", filename);
-
-	mandelbrot_graph(filename, Pixels, width, height);
-
-	free(Pixels);
-
-	sort(exec_times, nbr_tests);
-
-	return exec_times[nbr_tests/2];	
+  fclose(fp);
 }
 
-void fluctuation_time(float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, unsigned nbr_tests, double *exec_times)
+double min_times(double cr, double ci, long long dim, long long nslices)
 {
-	int *Pixels = malloc(width*height*sizeof(int));
-	
-	unsigned i;
+	long long *pixels = calloc(dim*dim, sizeof(long long));
+	float *params = calloc(4*nslices, sizeof(float));
+
+	double t_min = 0;
+	long long i;
+
+	for (i=0; i<nslices; i++) {
+		params[4*i+0] = cr;
+		params[4*i+1] = ci;
+		params[4*i+2] = i*dim/nslices;
+		params[4*i+3] = dim;
+	}
 
 	double start, stop, exec_t;
-	for (i = 0; i < nbr_tests; i++){
-		init_zero(Pixels, width, height);
-		
+	for (i = 0; i < 10; i++){
 		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
-		mandelbrot_with_starpu(Pixels, cr, ci, width, height, nslicesx, nslicesy);
+		mandelbrot_with_starpu(pixels, params, dim, nslices);
 		stop = starpu_timing_now();
-		
-		exec_t = (stop-start)/1.e6;
-		exec_times[i] = exec_t;
-
-		/* char filename[33]; */
-		/* sprintf(filename, "../PPM/mandelbrot%d.ppm", i + 1); */
-		/* printf("%s\n", filename); */
-		/* mandelbrot_graph(filename, Pixels, width, height); */
+		exec_t = (stop-start)*1.e3;
+		if (t_min==0 || t_min>exec_t)
+		  t_min = exec_t;
 	}
 
+	char filename[64];
+	snprintf(filename, 64, "out%lld.ppm", dim);
+	pixels2img(pixels,dim,dim,filename);
 
-	free(Pixels);
-
-
+	free(pixels);
+	free(params);
 
-	
+	return t_min;
 }
 
-
-void display_times(float cr, float ci, unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nslices, unsigned nbr_tests)
+void display_times(double cr, double ci, long long start_dim, long long step_dim, long long stop_dim, long long nslices)
 {
-	
-	unsigned dim;
-
-	FILE *myfile;
-	myfile = fopen("DAT/mandelbrot_c_struct_times.dat", "w");
-
-	for (dim = start_dim; dim <= stop_dim; dim += step_dim){
-		printf("Dimension: %u...\n", dim);
-		double t = median_time(cr, ci, dim, dim, nslices, nslices, nbr_tests);
-		
-		printf("w = %u ; h = %u ; t = %f\n", dim, dim, t);
-		
-		fprintf(myfile, "%f\n", t);
-		}
-	
-	fclose(myfile);
-}
 
-void display_fluctuations(float cr, float ci, unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nslices, unsigned nbr_tests)
-{
-	
-	unsigned dim;
-
-	FILE *myfile;
-	myfile = fopen("DAT/mandelbrot_c_fluctuation.dat", "w");
-
-	double *exec_times = malloc(nbr_tests * sizeof(double));
-	fluctuation_time(cr, ci, start_dim, start_dim, nslices, nslices, nbr_tests, exec_times);
-		
-	/* printf("w = %u ; h = %u ; t = %f\n", dim, dim, t); */
-	unsigned i;
-	for (i = 0; i < nbr_tests; i++){
-		printf("test %u: %f seconds\n", i, exec_times[i]);
-		fprintf(myfile, "%u %f\n", i, exec_times[i]);
+	long long dim;
+
+	for (dim = start_dim; dim <= stop_dim; dim += step_dim) {
+		printf("Dimension: %lld...\n", dim);
+		double res = min_times(cr, ci, dim, nslices);
+		res = res / dim / dim; // time per pixel
+		printf("%lld %lf\n", dim, res);
 	}
-	
-	fclose(myfile);
-	free(exec_times);
 }
 
-
 int main(int argc, char **argv)
 {
-
-	if (argc != 8){
-		printf("Usage: %s cr ci start_dim step_dim stop_dim nslices(must divide dims) nbr_tests\n", argv[0]);
+	if (argc != 7){
+		printf("Usage: %s cr ci start_dim step_dim stop_dim nslices(must divide dims)\n", argv[0]);
 		return 1;
 	}
 	if (starpu_init(NULL) != EXIT_SUCCESS){
@@ -240,24 +151,16 @@ int main(int argc, char **argv)
 		return 77;
 	}
 
+	double cr = (float) atof(argv[1]);
+	double ci = (float) atof(argv[2]);
+	long long start_dim = atoll(argv[3]);
+	long long step_dim = atoll(argv[4]);
+	long long stop_dim = atoll(argv[5]);
+	long long nslices = atoll(argv[6]);
 
-	
-	float cr = (float) atof(argv[1]);
-	float ci = (float) atof(argv[2]);
-	unsigned start_dim = (unsigned) atoi(argv[3]);
-	unsigned step_dim = (unsigned) atoi(argv[4]);	
-	unsigned stop_dim = (unsigned) atoi(argv[5]);
-	unsigned nslices = (unsigned) atoi(argv[6]);
-	unsigned nbr_tests = (unsigned) atoi(argv[7]);
-
-	display_times(cr, ci, start_dim, step_dim, stop_dim, nslices, nbr_tests);
-	
-	
-	/* display_fluctuations(cr, ci, start_dim, step_dim, stop_dim, nslices, nbr_tests); */
-
+	display_times(cr, ci, start_dim, step_dim, stop_dim, nslices);
 
 	starpu_shutdown();
 
-
 	return 0;
 }

+ 106 - 0
julia/mandelbrot/mandelbrot_native.jl

@@ -0,0 +1,106 @@
+using LinearAlgebra
+
+function mandelbrot(pixels, params) :: Float32
+    height :: Int64, width :: Int64 = size(pixels)
+    zoom :: Float64 = width * 0.25296875
+    iz :: Float64 = 1. / zoom
+    diverge :: Float32 = 4.0
+    max_iterations :: Float32 = ((width/2) * 0.049715909 * log10(zoom));
+    imi :: Float32 = 1. / max_iterations
+    centerr :: Float32 = params[1]
+    centeri :: Float32 = params[2]
+    offset :: Float32 = params[3]
+    dim :: Float32 = params[4]
+    cr :: Float64 = 0.
+    zr :: Float64 = 0.
+    ci :: Float64 = 0.
+    zi :: Float64 = 0.
+    n :: Int64 = 0
+    tmp :: Float64 = 0.
+    for y = 1:height
+        for x = 1:width
+            cr = centerr + (x-1 - (dim / 2)) * iz
+            zr = cr
+            ci = centeri + (y-1+offset - (dim / 2)) * iz
+            zi = ci
+            for n = 0:max_iterations
+                if (zr*zr + zi*zi > diverge)
+                    break
+                end
+                tmp = zr*zr - zi*zi + cr
+                zi = 2*zr*zi + ci
+                zr = tmp
+            end
+
+            if (n < max_iterations)
+                pixels[y,x] = round(15 * n * imi)
+            else
+                pixels[y,x] = 0
+            end
+        end
+    end
+
+    ret :: Float32 = 0.
+    return ret
+end
+
+function mandelbrot_without_starpu(A ::Matrix{Int64}, params ::Matrix{Float32}, nslicesx ::Int64)
+    width,height = size(A)
+    step = height / nslicesx
+
+    for taskx in (1 : nslicesx)
+        start_id = floor(Int64, (taskx-1)*step+1)
+        end_id = floor(Int64, (taskx-1)*step+step)
+        a = view(A, start_id:end_id, :)
+        p = view(params, (taskx-1)*4+1:(taskx-1)*4+4)
+
+        mandelbrot(a, p)
+    end
+end
+
+function pixels2img(pixels ::Matrix{Int64}, width ::Int64, height ::Int64, filename ::String)
+    MAPPING = [[66,30,15],[25,7,26],[9,1,47],[4,4,73],[0,7,100],[12,44,138],[24,82,177],[57,125,209],[134,181,229],[211,236,248],[241,233,191],[248,201,95],[255,170,0],[204,128,0],[153,87,0],[106,52,3]]
+    open(filename, "w") do f
+        write(f, "P3\n$width $height\n255\n")
+        for i = 1:height
+            for j = 1:width
+                write(f,"$(MAPPING[1+pixels[i,j]][1]) $(MAPPING[1+pixels[i,j]][2]) $(MAPPING[1+pixels[i,j]][3]) ")
+            end
+            write(f, "\n")
+        end
+    end
+end
+
+function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
+    tmin=0;
+
+    pixels ::Matrix{Int64} = zeros(dim, dim)
+    params :: Matrix{Float32} = zeros(4*nslices,1)
+    for i=0:(nslices-1)
+        params[4*i+1,1] = cr
+        params[4*i+2,1] = ci
+        params[4*i+3,1] = i*dim/nslices
+        params[4*i+4,1] = dim
+    end
+    for i = 1:10
+        t = time_ns();
+        mandelbrot_without_starpu(pixels, params, nslices)
+        t = time_ns()-t
+        if (tmin==0 || tmin>t)
+            tmin=t
+        end
+    end
+    pixels2img(pixels,dim,dim,"out$(dim).ppm")
+    return tmin
+end
+
+function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64)
+    for dim in (start_dim : step_dim : stop_dim)
+        res = min_times(cr, ci, dim, nslices)
+        res=res/dim/dim; # time per pixel
+        println("$(dim) $(res)")
+    end
+end
+
+
+display_time(-0.800671,-0.158392,32,32,4096,4)