
julia: Update examples for CUDA.

Pierre Huchant, 5 years ago
commit 284827fd8b

julia/mandelbrot/Makefile (+28 -17)

@@ -1,38 +1,49 @@
 CC=gcc
-CFLAGS += -Wall -Wextra -O3 -mavx -mfma -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
+
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
 
-LDFLAGS +=$(shell pkg-config --libs starpu-1.3) -lm
 EXTERNLIB=extern_tasks.so
 GENERATEDLIB=generated_tasks.so
-#OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
-OBJECTS=$(wildcard gen*.c)
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
 LIBPATH=${PWD}/../StarPU.jl/lib
 
 all: ${EXTERNLIB}
 
 mandelbrot: mandelbrot.c cpu_mandelbrot.o #gpu_mandelbrot.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-gpu_mandelbrot.o: gpu_mandelbrot.cu
-	nvcc -c $(CFLAGS) $^ -o $@
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
 
 %.o: %.c
-	$(CC) -c $(CFLAGS) $^ -o $@
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
 
 ${EXTERNLIB}: cpu_mandelbrot.c
 	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
-gpu_mandelbrot.so: gpu_mandelbrot.o
-	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
 
-cpu_mandelbrot_sa: cpu_mandelbrot_sa.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-${GENERATEDLIB}: ${OBJECTS}
-	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+.PHONY: clean
 
 clean:
-	rm -f mandelbrot *.so *.o c_*.genc gencuda_*.cu *.dat
+	rm -f mandelbrot *.so *.o genc_*.c gencuda_*.cu *.dat
 
 # Performance Tests
 cstarpu.dat: mandelbrot

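Note: with this change the CUDA path is opt-in through the new ENABLE_CUDA variable (default "no"); when it is set to "yes" the final link of generated_tasks.so is done by nvcc and the gencuda_*.cu objects are pulled in. A minimal usage sketch, assuming the gen* sources have already been emitted by StarPU.jl:

    # CPU-only build (default)
    make generated_tasks.so

    # build and link the generated CUDA kernels as well
    make ENABLE_CUDA=yes generated_tasks.so
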
julia/mandelbrot/mandelbrot.jl (+3 -2)

@@ -3,7 +3,7 @@ using StarPU
 using LinearAlgebra
 
 @target STARPU_CPU+STARPU_CUDA
-@codelet function mandelbrot(pixels ::Matrix{Int64}, params ::Matrix{Float32} ) :: Float32
+@codelet function mandelbrot(pixels ::Matrix{Int64}, params ::Matrix{Float32} ) :: Nothing
     height :: Int64 = height(pixels)
     width :: Int64 = width(pixels)
     zoom :: Float64 = width * 0.25296875
@@ -43,7 +43,8 @@ using LinearAlgebra
             end
         end
     end
-    return 0. :: Float32
+
+    return
 end
 
 @debugprint "starpu_init"

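The codelet now returns Nothing and ends with a bare return instead of return 0. :: Float32; the mult and vector_scal codelets below follow the same convention. A minimal sketch of the pattern, using a hypothetical scale codelet purely for illustration:

    @target STARPU_CPU+STARPU_CUDA
    @codelet function scale(v :: Vector{Float32}) :: Nothing
        N :: Int32 = length(v)
        @parallel for i in (1 : N)
            v[i] = 2.0f0 * v[i]    # results go into the buffer; nothing is returned
        end
        return
    end
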
julia/mult/Makefile (+28 -16)

@@ -6,40 +6,52 @@ STRIDE=72
 #CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -DSTRIDE=${STRIDE} -march=native $(shell pkg-config --cflags starpu-1.3)
 # GCC compiler
 CC=gcc
-CFLAGS += -O3 -DSTRIDE=${STRIDE} -mavx -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
 
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g -DSTRIDE=${STRIDE} $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
 LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+
 EXTERNLIB=extern_tasks.so
 GENERATEDLIB=generated_tasks.so
-#OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
-OBJECTS=$(wildcard gen*.c)
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
+
 LIBPATH=${PWD}/../StarPU.jl/lib
 
 all: ${EXTERNLIB}
 
 mult: mult.c cpu_mult.o #gpu_mult.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-gpu_mult.o: gpu_mult.cu
-	nvcc -c $(CFLAGS) $^ -o $@
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
 
 %.o: %.c
-	$(CC) -c $(CFLAGS) $^ -o $@
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
 
 ${EXTERNLIB}: cpu_mult.c
 	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
-gpu_mult.so: gpu_mult.o
-	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
 
-cpu_mult_sa: cpu_mult_sa.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-${GENERATEDLIB}: ${OBJECTS}
-	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+.PHONY: clean
 
 clean:
-	rm -f mult *.so *.o c_*.genc gencuda_*.cu *.dat
+	rm -f mult *.so *.o genc_*.c gencuda_*.cu *.dat
 
 # Performance Tests
 cstarpu.dat: mult

julia/mult/mult.jl (+42 -42)

@@ -6,54 +6,54 @@ using LinearAlgebra
 const STRIDE = 72
 
 @target STARPU_CPU+STARPU_CUDA
-@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Float32
+@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Nothing
 
     width_m2 :: Int32 = width(m2)
     height_m1 :: Int32 = height(m1)
     width_m1 :: Int32 = width(m1)
     # Naive version
-    #@parallel for j in (1 : width_m2)
-    #    @parallel for i in (1 : height_m1)
-    #
-    #          sum :: Float32 = 0.
-
-    #          for k in (1 : width_m1)
-    #              sum = sum + m1[i, k] * m2[k, j]
-    #          end
+    @parallel for j in (1 : width_m2)
+        @parallel for i in (1 : height_m1)
     
-    #          m3[i, j] = sum
-    #      end
-    #  end
-    ##### Tiled and unrolled version 
-    for l in (1 : width_m2)
-        for m in (1 : height_m1)
-            m3[m,l] = 0
-        end
-    end
-    @parallel for i in (1 : STRIDE : height_m1)
-        for k in (1 : STRIDE : width_m1 )
-            for j in (1 : STRIDE : width_m2  )
-                for kk in (k : 4 : k+STRIDE-1)
-                    for jj in (j : 2 : j+STRIDE-1)
-                        alpha00 :: Float32 =m2[kk,jj]
-                        alpha01 :: Float32 =m2[kk,jj+1]
-                        alpha10 :: Float32 =m2[kk+1,jj]
-                        alpha11 :: Float32 =m2[kk+1,jj+1]
-                        alpha20 :: Float32 =m2[kk+2,jj]
-                        alpha21 :: Float32 =m2[kk+2,jj+1]
-                        alpha30 :: Float32 =m2[kk+3,jj]
-                        alpha31 :: Float32 =m2[kk+3,jj+1]
-                        for ii in (i : 1 : i+STRIDE-1) 
-                            m3[ii, jj] = m3[ii, jj] + m1[ii, kk] * alpha00 + m1[ii, kk+1] * alpha10 + m1[ii, kk+2] * alpha20 + m1[ii,kk+3]*alpha30
-                            m3[ii, jj+1] = m3[ii, jj+1] + m1[ii, kk] * alpha01 + m1[ii, kk+1] * alpha11 + m1[ii, kk+2]*alpha21 + m1[ii,kk+3]*alpha31 
-                        end
-                    end
-                end
-            end
-        end
-    end
+            sum :: Float32 = 0.
 
-    return 0. :: Float32
+            for k in (1 : width_m1)
+                sum = sum + m1[i, k] * m2[k, j]
+            end
+
+            m3[i, j] = sum
+        end
+    end
+    # ##### Tiled and unrolled version 
+    # for l in (1 : width_m2)
+    #     for m in (1 : height_m1)
+    #         m3[m,l] = 0
+    #     end
+    # end
+    # @parallel for i in (1 : STRIDE : height_m1)
+    #     for k in (1 : STRIDE : width_m1 )
+    #         for j in (1 : STRIDE : width_m2  )
+    #             for kk in (k : 4 : k+STRIDE-1)
+    #                 for jj in (j : 2 : j+STRIDE-1)
+    #                     alpha00 :: Float32 =m2[kk,jj]
+    #                     alpha01 :: Float32 =m2[kk,jj+1]
+    #                     alpha10 :: Float32 =m2[kk+1,jj]
+    #                     alpha11 :: Float32 =m2[kk+1,jj+1]
+    #                     alpha20 :: Float32 =m2[kk+2,jj]
+    #                     alpha21 :: Float32 =m2[kk+2,jj+1]
+    #                     alpha30 :: Float32 =m2[kk+3,jj]
+    #                     alpha31 :: Float32 =m2[kk+3,jj+1]
+    #                     for ii in (i : 1 : i+STRIDE-1) 
+    #                         m3[ii, jj] = m3[ii, jj] + m1[ii, kk] * alpha00 + m1[ii, kk+1] * alpha10 + m1[ii, kk+2] * alpha20 + m1[ii,kk+3]*alpha30
+    #                         m3[ii, jj+1] = m3[ii, jj+1] + m1[ii, kk] * alpha01 + m1[ii, kk+1] * alpha11 + m1[ii, kk+2]*alpha21 + m1[ii,kk+3]*alpha31 
+    #                     end
+    #                 end
+    #             end
+    #         end
+    #     end
+    # end
+
+    return
 end
 
 
@@ -77,7 +77,7 @@ function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: M
         )
         cl = StarpuCodelet(
             cpu_func = CPU_CODELETS["matrix_mult"],
-            #cuda_func = "matrix_mult",
+            # cuda_func = CUDA_CODELETS["matrix_mult"],
             #opencl_func="ocl_matrix_mult",
             modes = [STARPU_R, STARPU_R, STARPU_W],
             perfmodel = perfmodel

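The cuda_func entry is still commented out, but it now refers to a CUDA_CODELETS table mirroring CPU_CODELETS. Assuming CUDA_CODELETS is populated once the gencuda_*.cu kernel has been generated and compiled into generated_tasks.so, enabling the GPU implementation would presumably look like this (a sketch, not part of this commit):

    cl = StarpuCodelet(
        cpu_func  = CPU_CODELETS["matrix_mult"],
        cuda_func = CUDA_CODELETS["matrix_mult"],  # assumed to be filled by the CUDA code generator
        modes = [STARPU_R, STARPU_R, STARPU_W],
        perfmodel = perfmodel
    )
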
julia/vector_scal/Makefile (+27 -19)

@@ -1,41 +1,49 @@
-# ICC compiler
-#CC =icc
-#CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -DSTRIDE=${STRIDE} -march=native $(shell pkg-config --cflags starpu-1.3)
-# GCC compiler
 CC=gcc
-CFLAGS += -g -O3 -mavx -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
 
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
 LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+
 EXTERNLIB=extern_tasks.so
 GENERATEDLIB=generated_tasks.so
-OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
 LIBPATH=${PWD}/../StarPU.jl/lib
 
 all: ${EXTERNLIB}
 
 vector_scal: vector_scal.c cpu_vector_scal.o #gpu_vector_scal.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-gpu_vector_scal.o: gpu_vector_scal.cu
-	nvcc -c $(CFLAGS) $^ -o $@
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
 
 %.o: %.c
-	$(CC) -c $(CFLAGS) $^ -o $@
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
 
 ${EXTERNLIB}: cpu_vector_scal.c
 	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
-gpu_vector_scal.so: gpu_vector_scal.o
-	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
 
-cpu_vector_scal_sa: cpu_vector_scal_sa.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-${GENERATEDLIB}: ${OBJECTS}
-	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+.PHONY: clean
 
 clean:
-	rm -f vector_scal *.so *.o c_*.genc gencuda_*.cu *.dat
+	rm -f vector_scal *.so *.o genc_*.c gencuda_*.cu *.dat
 
 # Performance Tests
 cstarpu.dat: vector_scal

julia/vector_scal/cpu_vector_scal.c (+3 -3)

@@ -9,7 +9,7 @@ struct params {
   float l;
 };
 
-float vector_scal(void *buffers[], void *cl_arg)
+float cpu_vector_scal(void *buffers[], void *cl_arg)
 {
   /* get scalar parameters from cl_arg */
   struct params *scalars = (struct params *) cl_arg;
@@ -34,8 +34,8 @@ float vector_scal(void *buffers[], void *cl_arg)
   return 0.0;
 }
 
-char* CPU = "cpu_mult";
-char* GPU = "gpu_mult";
+char* CPU = "cpu_vector_scal";
+char* GPU = "gpu_vector_scal";
 extern char *starpu_find_function(char *name, char *device) {
 	if (!strcmp(device,"gpu")) return GPU;
 	return CPU;

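The renamed strings fix the lookup used by starpu_find_function, which maps a device string to the kernel symbol exported from this file. A small illustration of the dispatch, with a hypothetical caller (the declaration itself is the one shown above):

    extern char *starpu_find_function(char *name, char *device);

    char *cpu_kernel = starpu_find_function("vector_scal", "cpu");  /* -> "cpu_vector_scal" */
    char *gpu_kernel = starpu_find_function("vector_scal", "gpu");  /* -> "gpu_vector_scal" */
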
julia/vector_scal/vector_scal.jl (+3 -5)

@@ -8,10 +8,8 @@ using LinearAlgebra
     N :: Int32 = length(v)
     # Naive version
     @parallel for i in (1 : N)
-        v[i] = v[i] * k + l + m
+        v[i] = v[i] * m + l + k
     end
-
-    return 0. :: Float32
 end
 
 
@@ -30,13 +28,13 @@ function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32,
         )
         cl = StarpuCodelet(
             cpu_func = CPU_CODELETS["vector_scal"],
-            #cuda_func = "matrix_mult",
+            # cuda_func = CUDA_CODELETS["vector_scal"],
             #opencl_func="ocl_matrix_mult",
             modes = [STARPU_RW],
             perfmodel = perfmodel
         )
 
-        for i in (1 : 10)
+        for i in (1 : 1)
             t=time_ns()
             @starpu_sync_tasks begin
                 handles = [hV]