Browse Source

document the basic examples so that they should be self-sufficient

Cédric Augonnet 16 years ago
parent
commit
c37b910143

+ 3 - 1
examples/basic-examples/hello-world.c

@@ -30,7 +30,9 @@
 #include <stdint.h>
 #include <starpu.h>
 
-/* When the task is done, call task->callback_func(task->callback_arg) */
+/* When the task is done, call task->callback_func(task->callback_arg). Any
+ * callback function must have the prototype void (*)(void *).
+ * NB: Callback are NOT allowed to performed potentially blocking operations */
 void callback_func(void *callback_arg)
 {
 	printf("Callback function got argument %p\n", callback_arg);

+ 176 - 18
examples/basic-examples/mult.c

@@ -14,6 +14,22 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+/*
+ * This example shows a simple implementation of a blocked matrix
+ * multiplication. Note that this is NOT intended to be an efficient
+ * implementation of sgemm ! In this example, we show:
+ *  - how to declare dense matrices (starpu_register_blas_data)
+ *  - how to manipulate matrices within codelets (eg. descr[0].blas.ld)
+ *  - how to use filters to partition the matrices into blocks
+ *    (starpu_partition_data and starpu_map_filters)
+ *  - how to unpartition data (starpu_unpartition_data) and how to stop
+ *    monitoring data (starpu_delete_data)
+ *  - how to manipulate subsets of data (get_sub_data)
+ *  - how to construct an autocalibrated performance model (starpu_perfmodel_t)
+ *  - how to submit asynchronous tasks and how to use callback to handle task
+ *    termination
+ */
+
 #include <string.h>
 #include <math.h>
 #include <sys/types.h>
@@ -60,15 +76,24 @@ static unsigned zdim = 512;
 
  */
 
-void callback_func(void *arg)
+static void callback_func(void *arg)
 {
 	/* the argument is a pointer to a counter of the remaining tasks */
 	int *counterptr = arg;
 
+	/* counterptr points to a variable with the number of remaining tasks,
+ 	 * when it reaches 0, all tasks are done */
 	int counter = STARPU_ATOMIC_ADD(counterptr, -1);
 	if (counter == 0)
 	{
-		/* we are done */	
+		/* IMPORTANT : note that we CANNOT call blocking operations
+		 * within callbacks as it may lead to a deadlock of StarPU.
+		 * starpu_unpartition_data is for instance called by the main
+		 * thread since it may cause /potentially/ blocking operations
+		 * such as memory transfers from a GPU to a CPU. */
+		
+		/* wake the application to notify the termination of all the
+ 		 * tasks */
 		pthread_mutex_lock(&mutex);
 		terminated = 1;
 		pthread_cond_signal(&cond);
@@ -76,18 +101,31 @@ void callback_func(void *arg)
 	}
 }
 
+/*
+ * The codelet is passed 3 matrices, the "descr" union-type field gives a
+ * description of the layout of those 3 matrices in the local memory (ie. RAM
+ * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
+ * registered data with the "blas" data interface, we manipulate the .blas
+ * field of the descr[x] elements which are union types.
+ */
 
-
-void cpu_mult(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+static void cpu_mult(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
 {
 	float *subA, *subB, *subC;
 	uint32_t nxC, nyC, nyA;
 	uint32_t ldA, ldB, ldC;
 
+	/* .blas.ptr gives a pointer to the first element of the local copy */
 	subA = (float *)descr[0].blas.ptr;
 	subB = (float *)descr[1].blas.ptr;
 	subC = (float *)descr[2].blas.ptr;
 
+	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
+	 * is the number of lines that are separated by .blas.ld elements (ld
+	 * stands for leading dimension).
+	 * NB: in case some filters were used, the leading dimension is not
+	 * guaranteed to be the same in main memory (on the original matrix)
+	 * and on the accelerator ! */
 	nxC = descr[2].blas.nx;
 	nyC = descr[2].blas.ny;
 	nyA = descr[0].blas.ny;
@@ -109,7 +147,7 @@ void cpu_mult(starpu_data_interface_t *descr, __attribute__((unused))  void *arg
 				sum += subA[j+k*ldA]*subB[k+i*ldB];
 			}
 
-			subC[j + i*ldC] += sum;
+			subC[j + i*ldC] = sum;
 		}
 	}
 }
@@ -118,12 +156,14 @@ static void init_problem_data(void)
 {
 	unsigned i,j;
 
+	/* we initialize matrices A, B and C in the usual way */
+
 	A = malloc(zdim*ydim*sizeof(float));
 	B = malloc(xdim*zdim*sizeof(float));
 	C = malloc(xdim*ydim*sizeof(float));
 
 	/* fill the A and B matrices */
-	srand(2008);
+	srand(2009);
 	for (j=0; j < ydim; j++) {
 		for (i=0; i < zdim; i++) {
 			A[j+i*ydim] = (float)(drand48());
@@ -145,6 +185,20 @@ static void init_problem_data(void)
 
 static void partition_mult_data(void)
 {
+	/* note that we assume a FORTRAN ordering here ! */
+
+	/* The BLAS data interface is described by 4 parameters: 
+	 *  - the location of the first element of the matrix to monitor (3rd
+	 *    argument)
+	 *  - the number of elements per row, ie. contiguous elements (4th arg)
+	 *  - the number of elements between columns, aka leading dimension
+	 *    (5th arg)
+	 *  - the number of columns (6th arg)
+	 * The first elements is a pointer to the data_handle that will be
+	 * associated to the matrix, and the second elements gives the memory
+	 * node in which resides the matrix: 0 means that the 3rd argument is
+	 * an adress in main memory.
+	 */
 	starpu_register_blas_data(&A_handle, 0, (uintptr_t)A, 
 		ydim, ydim, zdim, sizeof(float));
 	starpu_register_blas_data(&B_handle, 0, (uintptr_t)B, 
@@ -152,17 +206,69 @@ static void partition_mult_data(void)
 	starpu_register_blas_data(&C_handle, 0, (uintptr_t)C, 
 		ydim, ydim, xdim, sizeof(float));
 
-	starpu_filter f;
-	f.filter_func = starpu_vertical_block_filter_func;
-	f.filter_arg = nslicesx;
+	/* A filter is a method to partition a data into disjoint chunks, it is
+	 * described by the means of the "starpu_filter" structure that
+	 * contains a function that is applied on a data handle to partition it
+	 * into smaller chunks, and an argument that is passed to the function
+	 * (eg. the number of blocks to create here).
+	 */
+
+	/* StarPU supplies some basic filters such as the partition of a matrix
+	 * into blocks, note that we are using a FORTRAN ordering so that the
+	 * name of the filters are a bit misleading */
+	starpu_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.filter_arg = nslicesx
+	};
 		
-	starpu_filter f2;
-	f2.filter_func = starpu_block_filter_func;
-	f2.filter_arg = nslicesy;
+	starpu_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.filter_arg = nslicesy
+	};
 		
+/*
+ *	Illustration with nslicex = 4 and nslicey = 2, it is possible to access
+ *	sub-data by using the "get_sub_data" method, for instance:
+ *
+ *		A' handle is get_sub_data(A_handle, 1, 1); 
+ *		B' handle is get_sub_data(B_handle, 1, 2); 
+ *		C' handle is get_sub_data(C_handle, 2, 2, 1); 
+ *
+ *	Note that since we apply 2 filters recursively onto C,
+ *	"get_sub_data(C_handle, 1, 3)" returns an handle to the 4th column of
+ *	blocked matrix C for example.
+ * 
+ *		              |---|---|---|---|
+ *		              |   |   | B'|   | B
+ *		              |---|---|---|---|
+ *		                0   1   2   3
+ *		     |----|   |---|---|---|---|
+ *		     |    |   |   |   |   |   |
+ *		     |    | 0 |   |   |   |   |
+ *		     |----|   |---|---|---|---|
+ *		     | A' |   |   |   | C'|   |
+ *		     |    |   |   |   |   |   |
+ *		     |----|   |---|---|---|---|
+ *		       A              C
+ *
+ *	IMPORTANT: applying filters is equivalent to partitionning a piece of
+ *	data in a hierarchical manner, so that memory consistency is enforced
+ *	for each of the elements independantly. The tasks should therefore NOT
+ *	access inner nodes (eg. one column of C or the whole C) but only the
+ *	leafs of the tree (ie. blocks here). Manipulating inner nodes is only
+ *	possible by disapplying the filters (using starpu_unpartition_data).
+ */
+
 	starpu_partition_data(B_handle, &f);
 	starpu_partition_data(A_handle, &f2);
 
+	/* starpu_map_filters is a variable-arity function, the first argument
+	 * is the handle of the data to partition, the second argument is the
+	 * number of filters to apply recursively. Filters are applied in the
+	 * same order as the arguments.
+	 * This would be equivalent to starpu_partition_data(C_handle, &f) and
+	 * then applying f2 on each sub-data (ie. each column of C)
+	 */
 	starpu_map_filters(C_handle, 2, &f, &f2);
 }
 
@@ -176,35 +282,73 @@ static void launch_tasks(void)
 	/* partition the work into slices */
 	unsigned taskx, tasky;
 
+	/* the callback decrements this value every time a task is terminated
+	 * and notify the termination of the computation to the application
+	 * when the counter reaches 0 */
 	taskcounter = nslicesx * nslicesy;
 
 	starpu_codelet cl = {
+		/* we can only execute that kernel on a CPU yet */
 		.where = CORE,
+		/* CPU implementation of the codelet */
 		.core_func = cpu_mult,
+		/* the codelet manipulates 3 buffers that are managed by the
+ 		 * DSM */
 		.nbuffers = 3,
+		/* in case the scheduling policy may use performance models */
 		.model = &mult_perf_model
 	};
 
-
 	for (taskx = 0; taskx < nslicesx; taskx++) 
 	{
 		for (tasky = 0; tasky < nslicesy; tasky++)
 		{
-			/* A B[task] = C[task] */
+			/* C[taskx, tasky] = A[tasky] B[taskx] */
 			struct starpu_task *task = starpu_task_create();
 
+			/* this task implements codelet "cl" */
 			task->cl = &cl;
 
 			task->callback_func = callback_func;
 			task->callback_arg = &taskcounter;
 
+			/*
+			 *              |---|---|---|---|
+			 *              |   | * |   |   | B
+			 *              |---|---|---|---|
+			 *                    X 
+			 *     |----|   |---|---|---|---|
+			 *     |****| Y |   |***|   |   |
+			 *     |****|   |   |***|   |   |
+			 *     |----|   |---|---|---|---|
+			 *     |    |   |   |   |   |   |
+			 *     |    |   |   |   |   |   |
+			 *     |----|   |---|---|---|---|
+			 *       A              C
+			 */
+
+			/* there was a single filter applied to matrices A
+			 * (respectively C) so we grab the handle to the chunk
+			 * identified by "tasky" (respectively "taskx). The "1"
+			 * tells StarPU that there is a single argument to the
+			 * variable-arity function get_sub_data */
 			task->buffers[0].handle = get_sub_data(A_handle, 1, tasky);
 			task->buffers[0].mode = STARPU_R;
 			task->buffers[1].handle = get_sub_data(B_handle, 1, taskx);
 			task->buffers[1].mode = STARPU_R;
+
+			/* 2 filters were applied on matrix C, so we give
+			 * get_sub_data 2 arguments. The order of the arguments
+			 * must match the order in which the filters were
+			 * applied.
+			 * NB: get_sub_data(C_handle, 1, k) would have returned
+			 * a handle to the k-th column of matrix C.
+			 * NB2: get_sub_data(C_handle, 2, taskx, tasky) is
+			 * equivalent to
+			 * get_sub_data(get_sub_data(C_handle, 1, taskx), 1, tasky)*/
 			task->buffers[2].handle = 
 				get_sub_data(C_handle, 2, taskx, tasky);
-			task->buffers[2].mode = STARPU_RW;
+			task->buffers[2].mode = STARPU_W;
 
 			starpu_submit_task(task);
 		}
@@ -214,25 +358,39 @@ static void launch_tasks(void)
 int main(__attribute__ ((unused)) int argc, 
 	 __attribute__ ((unused)) char **argv)
 {
+	pthread_mutex_init(&mutex, NULL);
+	pthread_cond_init(&cond, NULL);
 
 	/* start the runtime */
 	starpu_init(NULL);
 
-	pthread_mutex_init(&mutex, NULL);
-	pthread_cond_init(&cond, NULL);
-
+	/* initialize matrices A, B and C and register them to StarPU */
 	init_problem_data();
 
+	/* partition matrices into blocks that can be manipulated by the
+ 	 * codelets */
 	partition_mult_data();
 
 	launch_tasks();
 
+	/* the different tasks are asynchronous so we use a callback to notify
+ 	 * the termination of the computation */
 	pthread_mutex_lock(&mutex);
 	if (!terminated)
 		pthread_cond_wait(&cond, &mutex);
 	pthread_mutex_unlock(&mutex);
 
+	/* remove the filters applied by the means of starpu_map_filters; now
+ 	 * it's not possible to manipulate a subset of C using get_sub_data until
+	 * starpu_map_filters is called again on C_handle.
+	 * The second argument is the memory node where the different subsets
+	 * should be reassembled, 0 = main memory (RAM) */
 	starpu_unpartition_data(C_handle, 0);
+
+	/* stop monitoring matrix C : after this, it is not possible to pass C 
+	 * (or any subset of C) as a codelet input/output. This also implements
+	 * a barrier so that the piece of data is put back into main memory in
+	 * case it was only available on a GPU for instance. */
 	starpu_delete_data(C_handle);
 	
 	starpu_shutdown();

+ 14 - 1
examples/basic-examples/vector-scal.c

@@ -35,10 +35,23 @@ static void scal_func(starpu_data_interface_t *buffers, void *arg)
 	unsigned i;
 	float *factor = arg;
 
+	/* 
+	 * The "buffers" array matches the task->buffers one: for instance
+	 * task->buffers[0].handle is a handle that corresponds to a data with
+	 * vector "interface". The starpu_data_interface_t is a union type with
+	 * a field for each interface defined. We therefore manipulate the
+	 * buffers[0].vector field: vector.nx gives the number of elements in
+	 * the array, vector.ptr gives the location of the array (that was
+	 * possibly migrated/replicated), and vector.elemsize gives the size of
+	 * each elements.
+	 */
+
 	/* length of the vector */
 	unsigned n = buffers[0].vector.nx;
 
-	/* get a pointer to the local copy of the vector */
+	/* get a pointer to the local copy of the vector : note that we have to
+	 * cast it in (float *) since a vector could contain any type of
+	 * elements so that the .ptr field is actually a uintptr_t */
 	float *val = (float *)buffers[0].vector.ptr;
 
 	/* scale the vector */