Prechádzať zdrojové kódy

examples/filters/fblock: add CUDA implementation

Nathalie Furmento 15 rokov pred
rodič
commit
fe60c699c6

+ 7 - 2
examples/Makefile.am

@@ -172,10 +172,15 @@ examplebin_PROGRAMS +=				\
 filters_fvector_SOURCES =			\
 	filters/fvector.c
 
-filters_fblock_SOURCES =				\
+filters_fblock_SOURCES =			\
 	filters/fblock.c
 
-filters_fmatrix_SOURCES =				\
+if STARPU_USE_CUDA
+filters_fblock_SOURCES +=			\
+	filters/fblock_cuda.cu
+endif
+
+filters_fmatrix_SOURCES =			\
 	filters/fmatrix.c
 
 ###################

+ 39 - 23
examples/filters/fblock.c

@@ -22,6 +22,13 @@
 #define NZ    3
 #define PARTS 2
 
+void opencl_func(void *buffers[], void *cl_arg)
+{
+        return;
+}
+
+extern void cuda_func(void *buffers[], void *cl_arg);
+
 void cpu_func(void *buffers[], void *cl_arg)
 {
         unsigned i, j, k;
@@ -42,6 +49,22 @@ void cpu_func(void *buffers[], void *cl_arg)
         }
 }
 
+void print_block(int *block, int nx, int ny, int nz)
+{
+        int i, j, k;
+
+        for(k=0 ; k<nz ; k++) {
+                for(j=0 ; j<ny ; j++) {
+                        for(i=0 ; i<nx ; i++) {
+                                fprintf(stderr, "%2d ", block[(k*nx*ny)+(j*nx)+i]);
+                        }
+                        fprintf(stderr,"\n");
+                }
+                fprintf(stderr,"\n");
+        }
+        fprintf(stderr,"\n");
+}
+
 int main(int argc, char **argv)
 {
         int *block,n=0;
@@ -49,30 +72,28 @@ int main(int argc, char **argv)
 
         block = (int*)malloc(NX*NY*NZ*sizeof(block[0]));
         assert(block);
-        fprintf(stderr, "IN  Block\n");
         for(k=0 ; k<NZ ; k++) {
                 for(j=0 ; j<NY ; j++) {
                         for(i=0 ; i<NX ; i++) {
-                                block[(k*NY)+(j*NX)+i] = n++;
-                                fprintf(stderr, "%2d ", block[(k*NY)+(j*NX)+i]);
+                                block[(k*NX*NY)+(j*NX)+i] = n++;
                         }
-                        fprintf(stderr,"\n");
                 }
-                fprintf(stderr,"\n");
         }
-        fprintf(stderr,"\n");
+        fprintf(stderr, "IN  Block\n");
+        print_block(block, NX, NY, NZ);
 
 	starpu_data_handle handle;
 	starpu_codelet cl =
 	{
-		.where = STARPU_CPU,
+		.where = STARPU_CPU|STARPU_CUDA,
                 .cpu_func = cpu_func,
+                .cuda_func = cuda_func,
 		.nbuffers = 1
 	};
         starpu_init(NULL);
 
-	/* Declare data to StarPU */
-	starpu_block_data_register(&handle, 0, (uintptr_t)block, NX, NX*NY, NX, NY, NZ, sizeof(int));
+        /* Declare data to StarPU */
+        starpu_block_data_register(&handle, 0, (uintptr_t)block, NX, NX*NY, NX, NY, NZ, sizeof(int));
 
         /* Partition the block in PARTS sub-blocks */
 	struct starpu_data_filter f =
@@ -82,14 +103,14 @@ int main(int argc, char **argv)
 		.get_nchildren = NULL,
 		.get_child_ops = NULL
 	};
-	starpu_data_partition(handle, &f);
+        starpu_data_partition(handle, &f);
 
         fprintf(stderr,"Nb of partitions : %d\n",starpu_data_get_nb_children(handle));
 
         /* Submit a task on each sub-block */
         for(i=0 ; i<starpu_data_get_nb_children(handle) ; i++)
         {
-                int multiplier=i;
+                int ret,multiplier=i;
                 struct starpu_task *task = starpu_task_create();
 
                 fprintf(stderr,"Dealing with sub-block %d\n", i);
@@ -100,24 +121,19 @@ int main(int argc, char **argv)
                 task->buffers[0].mode = STARPU_RW;
                 task->cl_arg = &multiplier;
 
-                starpu_task_submit(task);
+                ret = starpu_task_submit(task);
+                if (ret) {
+                        fprintf(stderr, "Error when submitting task\n");
+                        exit(ret);
+                }
         }
 
         /* Unpartition the data, unregister it from StarPU and shutdown */
-	starpu_data_unpartition(handle, 0);
+        starpu_data_unpartition(handle, 0);
         starpu_data_unregister(handle);
 	starpu_shutdown();
 
         /* Print result block */
         fprintf(stderr, "OUT Block\n");
-        for(k=0 ; k<NZ ; k++) {
-                for(j=0 ; j<NY ; j++) {
-                        for(i=0 ; i<NX ; i++) {
-                                fprintf(stderr, "%2d ", block[(k*NY)+(j*NX)+i]);
-                        }
-                        fprintf(stderr,"\n");
-                }
-                fprintf(stderr,"\n");
-        }
-        fprintf(stderr,"\n");
+        print_block(block, NX, NY, NZ);
 }

+ 48 - 0
examples/filters/fblock_cuda.cu

@@ -0,0 +1,48 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example complements vector_scale.c: here we implement a CUDA version.
+ */
+
+#include <starpu.h>
+
+static __global__ void fblock_cuda(int *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float factor)
+{
+        int i, j, k;
+        for(k=0; k<nz ; k++) {
+                for(j=0; j<ny ; j++) {
+                        for(i=0; i<nx ; i++)
+                                block[(k*ldz)+(j*ldy)+i] = factor;
+                }
+        }
+}
+
+extern "C" void cuda_func(void *buffers[], void *_args)
+{
+        int *factor = (int *)_args;
+	int *block = (int *)STARPU_GET_BLOCK_PTR(buffers[0]);
+	int nx = (int)STARPU_GET_BLOCK_NX(buffers[0]);
+	int ny = (int)STARPU_GET_BLOCK_NY(buffers[0]);
+	int nz = (int)STARPU_GET_BLOCK_NZ(buffers[0]);
+        unsigned ldy = STARPU_GET_BLOCK_LDY(buffers[0]);
+        unsigned ldz = STARPU_GET_BLOCK_LDZ(buffers[0]);
+
+        /* TODO: use more blocks and threads in blocks */
+        fblock_cuda<<<1,1>>>(block, nx, ny, nz, ldy, ldz, *factor);
+
+	cudaThreadSynchronize();
+}